In [5]:
# %% [markdown]
# # Core30 Monthly Matrix Source (max_1mo)
# - 入力: ./data/parquet/core30_meta.parquet（{ticker} 必須）
# - 取得: yfinance(period="max", interval="1mo") の月次終値
# - 出力: ./data/parquet/core30_anomaly.parquet
# - スキーマ: ["ticker","year","month","return_pct"]
#   * return_pct は 前月比(%)、年×月ヒートマップの元データ

from pathlib import Path
import pandas as pd
import numpy as np
import yfinance as yf

# Input metadata (must expose a "ticker" column) and the output dataset path.
META_PARQUET = Path("./data/parquet/core30_meta.parquet")
OUT_PARQUET  = Path("./data/parquet/core30_anomaly.parquet")

# Fail fast with an explicit message when the metadata file is absent.
if not META_PARQUET.exists():
    raise FileNotFoundError(f"not found: {META_PARQUET}")

meta = pd.read_parquet(META_PARQUET, engine="pyarrow")
if "ticker" not in meta.columns:
    raise ValueError("meta parquet must contain 'ticker'")

# Normalise the ticker column: drop missing values, coerce to string,
# trim surrounding whitespace, then keep the first occurrence of each symbol.
ticker_col = meta["ticker"].dropna().astype("string").str.strip()
tickers = ticker_col.drop_duplicates().tolist()
if len(tickers) == 0:
    raise RuntimeError("No tickers found in core30_meta.parquet")

# Quick sanity check: number of tickers and the first few symbols.
len(tickers), tickers[:5]


(30, ['2914.T', '3382.T', '4063.T', '4502.T', '4568.T'])

In [6]:
# %% Build year-month matrix source from max_1mo
def _download_monthly_close(ticker: str) -> pd.Series:
    """
    Fetch the monthly "Close" series for one ticker via yfinance.

    Calls ``yf.download(ticker, period="max", interval="1mo",
    auto_adjust=False)`` and extracts the "Close" column, coping with both
    flat and MultiIndex column layouts.

    Parameters
    ----------
    ticker : str
        Symbol passed to yfinance (e.g. "2914.T").

    Returns
    -------
    pd.Series
        Float closes named ``ticker`` and indexed by the timestamps returned
        by the download (converted with ``pd.to_datetime``, not shifted).
        Rows whose close is missing or non-numeric are dropped. An empty
        float Series named ``ticker`` is returned when the download fails,
        yields no rows, or has no "Close" column.
    """
    try:
        df = yf.download(ticker, period="max", interval="1mo", progress=False, auto_adjust=False)
    except Exception:
        # Deliberate best-effort: one failing ticker must not abort the batch.
        df = None

    if df is None or df.empty:
        return pd.Series(dtype=float, name=ticker)

    if isinstance(df.columns, pd.MultiIndex):
        if ("Close", ticker) in df.columns:
            s = df[("Close", ticker)]
        elif "Close" in df.columns.get_level_values(0):
            s = df.xs("Close", axis=1, level=0)
        elif "Close" in df.columns.get_level_values(1):
            s = df.xs("Close", axis=1, level=1)
        else:
            return pd.Series(dtype=float, name=ticker)
    elif "Close" in df.columns:
        s = df["Close"]
    else:
        return pd.Series(dtype=float, name=ticker)

    # Reduce any DataFrame to a single column explicitly. The previous
    # `s.squeeze()` collapsed a one-row result into a scalar, which made the
    # subsequent `.dropna()` raise AttributeError.
    if isinstance(s, pd.DataFrame):
        if ticker in s.columns:
            s = s[ticker]
        if isinstance(s, pd.DataFrame):
            s = s.iloc[:, 0]

    s = pd.to_numeric(s, errors="coerce").dropna()
    # Timestamps are only parsed here; no month-start/month-end alignment is applied.
    s.index = pd.to_datetime(s.index)
    s.name = ticker
    return s

# Build one long-format frame of month-over-month returns per ticker.
rows = []
n_tickers = len(tickers)
for idx, symbol in enumerate(tickers, start=1):
    closes = _download_monthly_close(symbol)

    # Month-over-month change in percent; the first row has no predecessor
    # and is dropped. Tickers with no closes or no returns are skipped.
    returns_pct = closes if closes.empty else (closes.pct_change() * 100.0).dropna()
    if returns_pct.empty:
        continue

    # Derive the calendar keys from the timestamp index and put the columns
    # in the output schema order: ticker, year, month, return_pct.
    frame = returns_pct.rename("return_pct").to_frame()
    frame.insert(0, "month", frame.index.month.astype("int64"))
    frame.insert(0, "year", frame.index.year.astype("int64"))
    frame.insert(0, "ticker", symbol)
    rows.append(frame)

    # Progress line every fifth ticker (only reached for tickers that produced rows).
    if idx % 5 == 0:
        print(f"[{idx}/{n_tickers}] processed...")

if len(rows) == 0:
    raise RuntimeError("No monthly returns computed.")

# Stack all tickers, discarding the per-ticker timestamp index, and persist.
out = pd.concat(rows, ignore_index=True)
OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
out.to_parquet(OUT_PARQUET, engine="pyarrow", index=False)

print(f"[OK] saved: {OUT_PARQUET.resolve()} rows={len(out)}")
out.head()



[5/30] processed...
[10/30] processed...
[15/30] processed...
[20/30] processed...
[25/30] processed...
[30/30] processed...
[OK] saved: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/core30_anomaly.parquet rows=8690


Unnamed: 0,ticker,year,month,return_pct
0,2914.T,2000,2,-11.599099
1,2914.T,2000,3,-6.369427
2,2914.T,2000,4,8.163265
3,2914.T,2000,5,3.773585
4,2914.T,2000,6,12.848485
