In [7]:
# %% [markdown]
# # Core30 Monthly Matrix Source (max_1mo)
# - 入力: ./data/parquet/core30_meta.parquet（{ticker} 必須）
# - 取得: yfinance(period="max", interval="1mo") の月次終値
# - 出力: ./data/parquet/core30_anomaly.parquet
# - スキーマ: ["ticker","year","month","return_pct"]
#   * return_pct は 前月比(%)、年×月ヒートマップの元データ
# - 追加: .env.s3 or .env を自動ロードして、生成後に S3 へアップロード（任意）

from pathlib import Path
import os, json, hashlib
from datetime import datetime, timezone
import pandas as pd
import numpy as np
import yfinance as yf

# ---- Auto-load .env.s3 / .env ----
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: when it is not installed the notebook continues
    # and relies on the process environment (for Docker, add it to requirements).
    load_dotenv = None

if load_dotenv is not None:
    # .env.s3 is tried first, then .env. override=False means a variable that is
    # already set (by the environment or by an earlier file) is not replaced.
    for env_file in (Path(".env.s3"), Path(".env")):
        if not env_file.exists():
            continue
        try:
            load_dotenv(dotenv_path=env_file, override=False)
        except Exception as e:
            # Stay best-effort, but report the failure instead of hiding it.
            print(f"[WARN] {env_file} の読み込みに失敗しました: {e}")

# ---- Input / output locations ----
META_PARQUET = Path("./data/parquet/core30_meta.parquet")
OUT_PARQUET = Path("./data/parquet/core30_anomaly.parquet")
MANIFEST = Path("./data/parquet/manifest.json")

# ---- S3 settings (taken from environment variables; None when unset) ----
DATA_BUCKET = os.environ.get("DATA_BUCKET")          # e.g. dash-plotly
PARQUET_KEY = os.environ.get("CORE30_ANOMALY_KEY")   # e.g. parquet/core30_anomaly.parquet
AWS_REGION = os.environ.get("AWS_REGION")            # e.g. ap-northeast-1
AWS_PROFILE = os.environ.get("AWS_PROFILE")          # e.g. default (when using the host's ~/.aws)

def _sha256sum(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()

def _s3_client():
    """Build an S3 client from a boto3 session.

    The session always receives ``region_name=AWS_REGION``; ``profile_name`` is
    passed only when ``AWS_PROFILE`` is set to a non-empty value.
    """
    import boto3

    session_kwargs = {"region_name": AWS_REGION}
    if AWS_PROFILE:
        session_kwargs["profile_name"] = AWS_PROFILE
    return boto3.Session(**session_kwargs).client("s3")

def upload_to_s3(local_path: Path, bucket: str, key: str) -> None:
    """Upload ``local_path`` to ``s3://{bucket}/{key}`` if the local file exists.

    No ACL is set. Server-side encryption is not requested by default (see the
    commented hint below).

    Args:
        local_path: File to upload.
        bucket: Destination bucket name. An empty/None value skips the upload.
        key: Destination object key. An empty/None value skips the upload.

    Returns:
        None. A message is printed for both the skip cases and the success case.

    Raises:
        Any exception raised while creating the client or by ``upload_file`` is
        propagated to the caller.
    """
    if not bucket or not key:
        print("[INFO] DATA_BUCKET または S3 Key が未設定のためアップロードをスキップします。")
        return

    # Honour the "upload if it exists" contract before touching boto3 at all.
    local_path = Path(local_path)
    if not local_path.is_file():
        print(f"[WARN] ローカルファイルが存在しないためアップロードをスキップします: {local_path}")
        return

    import mimetypes
    ctype, _ = mimetypes.guess_type(str(local_path))
    # Fall back to a generic binary type when the extension is not recognised.
    extra = {"ContentType": ctype or "application/octet-stream"}
    # For server-side encryption if needed: extra["ServerSideEncryption"] = "AES256"
    s3 = _s3_client()
    s3.upload_file(str(local_path), bucket, key, ExtraArgs=extra)
    print(f"[OK] uploaded to s3://{bucket}/{key}")

def update_manifest(manifest_path: Path, item_key: str, local_path: Path) -> None:
    """Upsert the entry for ``item_key`` in the JSON manifest at ``manifest_path``.

    The entry records the SHA-256 digest and size of ``local_path``, the current
    UTC time (ISO 8601) and the local path as a string. Other entries already in
    the manifest are kept. Using the S3 key as ``item_key`` is recommended.

    Args:
        manifest_path: Location of the manifest JSON file (created if missing).
        item_key: Key of the entry to insert or replace.
        local_path: File whose digest and size are recorded.

    Returns:
        None. The manifest file is rewritten and a confirmation is printed.

    Raises:
        OSError: If ``local_path`` cannot be read or the manifest cannot be written.
    """
    manifest = {}
    if manifest_path.exists():
        try:
            loaded = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"[WARN] manifest を読み込めないため新規作成します: {e}")
        else:
            if isinstance(loaded, dict):
                manifest = loaded
            else:
                # Valid JSON that is not an object (e.g. a list) cannot be upserted into.
                print(f"[WARN] manifest の形式が不正なため新規作成します: {manifest_path}")

    manifest[item_key] = {
        "sha256": _sha256sum(local_path),
        "size": local_path.stat().st_size,
        "last_modified": datetime.now(timezone.utc).isoformat(),
        "local_path": str(local_path),
    }
    # Make sure the target directory exists so the write cannot fail on a fresh checkout.
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[OK] manifest updated: {manifest_path}")

# ---- Load ticker metadata ----
if not META_PARQUET.exists():
    raise FileNotFoundError(f"not found: {META_PARQUET}")

meta = pd.read_parquet(META_PARQUET, engine="pyarrow")
if "ticker" not in meta.columns:
    raise ValueError("meta parquet must contain 'ticker'")

# Drop missing values, strip surrounding whitespace, then keep the first
# occurrence of each ticker in file order.
tickers = (
    meta["ticker"]
    .dropna()
    .astype("string")
    .str.strip()
    .drop_duplicates()
    .tolist()
)
if not tickers:
    raise RuntimeError("No tickers found in core30_meta.parquet")

# Cell output: number of tickers and the first five of them.
len(tickers), tickers[:5]


(30, ['2914.T', '3382.T', '4063.T', '4502.T', '4568.T'])

In [8]:
# %% Build year-month matrix source from max_1mo
def _download_monthly_close(ticker: str) -> pd.Series:
    """
    yfinance(period="max", interval="1mo") の Close を返す。
    Index: Timestamp（月次）/ 値: float
    """
    try:
        df = yf.download(ticker, period="max", interval="1mo", progress=False, auto_adjust=False)
    except Exception:
        df = pd.DataFrame()

    if df is None or df.empty:
        return pd.Series(dtype=float, name=ticker)

    if not isinstance(df.columns, pd.MultiIndex):
        if "Close" in df.columns:
            s = df["Close"]
        else:
            return pd.Series(dtype=float, name=ticker)
    else:
        if ("Close", ticker) in df.columns:
            s = df[("Close", ticker)]
        else:
            if "Close" in df.columns.get_level_values(0):
                s = df.xs("Close", axis=1, level=0)
            elif "Close" in df.columns.get_level_values(1):
                s = df.xs("Close", axis=1, level=1)
            else:
                return pd.Series(dtype=float, name=ticker)
            if isinstance(s, pd.DataFrame):
                s = s[ticker] if ticker in s.columns else s.iloc[:, 0]

    s = pd.to_numeric(s.squeeze(), errors="coerce").dropna()
    s.index = pd.to_datetime(s.index)  # 月末寄せのタイムスタンプ
    s.name = ticker
    return s

# Build one long-format frame of monthly returns across all tickers.
rows = []
skipped = []  # tickers that produced no usable monthly return
for i, tic in enumerate(tickers, 1):
    s_close = _download_monthly_close(tic)

    if s_close.empty:
        ret_m = s_close
    else:
        # Month-over-month return in percent; the first observation has no
        # predecessor, so its NaN is dropped.
        ret_m = (s_close.pct_change() * 100.0).dropna()

    if ret_m.empty:
        skipped.append(tic)
    else:
        df = (
            ret_m.to_frame("return_pct")
                 .assign(year=lambda d: d.index.year,
                         month=lambda d: d.index.month)
                 [["year","month","return_pct"]]
                 .astype({"year":"int64","month":"int64"})
        )
        df["ticker"] = tic
        rows.append(df[["ticker","year","month","return_pct"]])

    # Report progress for every fifth ticker, whether or not it was skipped.
    if i % 5 == 0:
        print(f"[{i}/{len(tickers)}] processed...")

if skipped:
    # Make gaps in the output visible instead of dropping tickers silently.
    print(f"[WARN] 月次リターンを取得できなかった銘柄 ({len(skipped)}): {skipped}")

if not rows:
    raise RuntimeError("No monthly returns computed.")

out = pd.concat(rows, ignore_index=True)

# ---- Save -> update manifest -> upload to S3 (optional) ----
OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
out.to_parquet(OUT_PARQUET, engine="pyarrow", index=False)
print(f"[OK] saved: {OUT_PARQUET.resolve()} rows={len(out)}")

# Upsert manifest.json, keyed by the S3 key (default key used when unset).
manifest_key = PARQUET_KEY or "parquet/core30_anomaly.parquet"
update_manifest(MANIFEST, manifest_key, OUT_PARQUET)

# Upload to S3 only when both the bucket and the key are configured.
if DATA_BUCKET and PARQUET_KEY:
    try:
        upload_to_s3(OUT_PARQUET, DATA_BUCKET, PARQUET_KEY)      # the parquet itself
        upload_to_s3(MANIFEST,  DATA_BUCKET, "parquet/manifest.json")  # the manifest alongside it
    except Exception as e:
        # An upload failure is reported but does not fail the cell; the local files are already written.
        print(f"[WARN] S3 アップロードに失敗しました: {e}")
else:
    print("[INFO] DATA_BUCKET or CORE30_ANOMALY_KEY 未設定のため S3 アップロードをスキップします。")

out.head()



[5/30] processed...
[10/30] processed...
[15/30] processed...
[20/30] processed...
[25/30] processed...
[30/30] processed...
[OK] saved: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/core30_anomaly.parquet rows=8690
[OK] manifest updated: data/parquet/manifest.json
[OK] uploaded to s3://dash-plotly/parquet/core30_anomaly.parquet
[OK] uploaded to s3://dash-plotly/parquet/manifest.json


Unnamed: 0,ticker,year,month,return_pct
0,2914.T,2000,2,-11.599099
1,2914.T,2000,3,-6.369427
2,2914.T,2000,4,8.163265
3,2914.T,2000,5,3.773585
4,2914.T,2000,6,12.848485
