In [1]:
# %% ONE-CELL: Core30 Monthly Matrix → parquet + manifest(items[]) + optional S3 upload
from pathlib import Path
import pandas as pd
import numpy as np
import yfinance as yf

# --- add for module path (nbconvert runs with cwd=analyze) ---
import sys
from pathlib import Path
# Parent of the current working directory is treated as the project root.
ROOT = Path.cwd().resolve().parent  # project root
# Prepend it to the import path once so the project packages resolve.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))
# --------------------------------------------------------------

# ==== 共通設定/ユーティリティ（DRY） ====
from common_cfg.env import load_dotenv_cascade
from common_cfg.flags import NO_MANIFEST, NO_S3
from common_cfg.paths import (
    PARQUET_DIR,
    OUT_ANOMALY,
    META_PARQUET,   # = core30_meta.parquet
    MANIFEST_PATH,
)
from common_cfg.s3cfg import DATA_BUCKET, PARQUET_PREFIX, AWS_REGION, AWS_PROFILE
from common_cfg.manifest import sha256_of, load_manifest_items, upsert_manifest_item, write_manifest
from common_cfg.s3io import maybe_upload_files_s3

# Load .env settings.
load_dotenv_cascade()

# File name of the output parquet; used as its key in the manifest.
ANOMALY_KEY = OUT_ANOMALY.name  # "core30_anomaly.parquet"

# Input check: the meta parquet must exist and expose a 'ticker' column.
if not META_PARQUET.exists():
    raise FileNotFoundError(f"not found: {META_PARQUET}")

meta = pd.read_parquet(META_PARQUET, engine="pyarrow")
if "ticker" not in meta.columns:
    raise ValueError("meta parquet must contain 'ticker'")

# Unique, whitespace-trimmed ticker symbols in first-seen order.
# Entries that are blank after trimming are dropped so they are never
# sent to the downloader and cannot mask an effectively empty ticker list.
tickers = (
    meta["ticker"]
    .dropna()
    .astype("string")
    .str.strip()
    .loc[lambda col: col != ""]
    .unique()
    .tolist()
)
if not tickers:
    raise RuntimeError("No tickers found in core30_meta.parquet")

# 月次データ取得 → 前月比(%)
def _download_monthly_close(ticker: str) -> pd.Series:
    """Download the full monthly history of *ticker* and return its closes.

    The download is best-effort: any exception raised by ``yf.download`` is
    reported on stdout and turned into an empty result so that one failing
    ticker does not abort the whole run.

    Args:
        ticker: Symbol passed to ``yf.download``.

    Returns:
        A Series named *ticker*, indexed by datetime, holding the numeric
        "Close" values with missing/non-numeric entries dropped. The Series
        is empty when the download fails, returns no rows, or no single
        "Close" column can be identified for *ticker*.
    """
    empty = pd.Series(dtype=float, name=ticker)

    try:
        df = yf.download(ticker, period="max", interval="1mo", progress=False, auto_adjust=False)
    except Exception as exc:
        # Deliberately broad (best-effort), but say why the ticker is skipped.
        print(f"[WARN] download failed for {ticker}: {exc!r}")
        return empty

    if df is None or df.empty:
        return empty

    if isinstance(df.columns, pd.MultiIndex):
        # "Close" may sit on either column level; look on both.
        if "Close" in df.columns.get_level_values(0):
            close = df.xs("Close", axis=1, level=0)
        elif "Close" in df.columns.get_level_values(1):
            close = df.xs("Close", axis=1, level=1)
        else:
            return empty
    elif "Close" in df.columns:
        close = df["Close"]
    else:
        return empty

    # Reduce a frame to exactly one column. Selecting explicitly (instead of
    # squeeze()) keeps the date index for 1x1 frames and avoids passing a
    # multi-column frame on to the numeric conversion.
    if isinstance(close, pd.DataFrame):
        if ticker in close.columns:
            close = close[ticker]
        if isinstance(close, pd.DataFrame):
            if close.shape[1] != 1:
                return empty  # ambiguous: cannot tell which column belongs to ticker
            close = close.iloc[:, 0]

    s = pd.to_numeric(close, errors="coerce").dropna()
    s.index = pd.to_datetime(s.index)
    s.name = ticker
    return s

# Build one long table of month-over-month percentage returns:
# one row per (ticker, year, month).
rows = []
skipped = []  # tickers that produced no usable returns
for i, tic in enumerate(tickers, 1):
    s_close = _download_monthly_close(tic)

    # fill_method=None: missing closes were already dropped, so no forward
    # fill is wanted, and relying on the implicit default is deprecated.
    ret_m = s_close.pct_change(fill_method=None) * 100.0
    # A zero previous close yields +/-inf, which dropna() alone would keep.
    ret_m = ret_m.replace([np.inf, -np.inf], np.nan).dropna()

    if ret_m.empty:
        skipped.append(tic)
    else:
        rows.append(
            pd.DataFrame(
                {
                    "ticker": tic,
                    "year": ret_m.index.year.astype("int64"),
                    "month": ret_m.index.month.astype("int64"),
                    "return_pct": ret_m.to_numpy(),
                }
            )
        )

    # Report progress on every 5th ticker, whether or not it was skipped.
    if i % 5 == 0:
        print(f"[{i}/{len(tickers)}] processed...")

if skipped:
    print(f"[WARN] no monthly returns for {len(skipped)} ticker(s): {', '.join(skipped)}")

if not rows:
    raise RuntimeError("No monthly returns computed.")

out = pd.concat(rows, ignore_index=True)

# Persist the result table as parquet (creating the target directory first).
OUT_ANOMALY.parent.mkdir(parents=True, exist_ok=True)
out.to_parquet(OUT_ANOMALY, engine="pyarrow", index=False)
print(f"[OK] saved: {OUT_ANOMALY.resolve()} rows={len(out)}")

# The parquet file is always handed to the uploader; the manifest joins it
# only when manifest updates are enabled.
_to_upload = [OUT_ANOMALY]
if NO_MANIFEST:
    print("[INFO] PIPELINE_NO_MANIFEST=1 → manifest 更新はスキップ")
else:
    # Upsert this file's entry into the manifest and write it back.
    items = upsert_manifest_item(
        load_manifest_items(MANIFEST_PATH), ANOMALY_KEY, OUT_ANOMALY
    )
    write_manifest(items, MANIFEST_PATH)
    print(f"[OK] manifest updated: {MANIFEST_PATH}")
    _to_upload.append(MANIFEST_PATH)

# Hand the files to the S3 helper.
maybe_upload_files_s3(
    _to_upload,
    bucket=DATA_BUCKET,
    prefix=PARQUET_PREFIX,
    aws_region=AWS_REGION,
    aws_profile=AWS_PROFILE,
    dry_run=False,
)

# Show the first rows as the cell output (sanity check).
out.head()


[5/30] processed...


[10/30] processed...


[15/30] processed...


[20/30] processed...


[25/30] processed...


[30/30] processed...
[OK] saved: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/core30_anomaly.parquet rows=8690
[INFO] PIPELINE_NO_MANIFEST=1 → manifest 更新はスキップ
[OK] uploaded: s3://dash-plotly/parquet/core30_anomaly.parquet


Unnamed: 0,ticker,year,month,return_pct
0,2914.T,2000,2,-11.599099
1,2914.T,2000,3,-6.369427
2,2914.T,2000,4,8.163265
3,2914.T,2000,5,3.773585
4,2914.T,2000,6,12.848485
