In [2]:
# === TOPIX Core 30 の株価(1y,1d)を取得して Parquet に保存 + manifest更新 + S3アップロード ===
from pathlib import Path
import pandas as pd
import numpy as np
import yfinance as yf

# --- Make the project root importable (nbconvert runs with cwd=analyze) ---
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root.
ROOT = Path.cwd().resolve().parent  # project root
if not any(entry == str(ROOT) for entry in sys.path):
    # Prepend so project packages are found before anything else on the path.
    sys.path.insert(0, str(ROOT))
# --------------------------------------------------------------------------

# ==== 共通設定/ユーティリティ（DRY） ====
from common_cfg.env import load_dotenv_cascade
from common_cfg.flags import NO_MANIFEST, NO_S3
from common_cfg.paths import (
    PARQUET_DIR,
    WEIGHT_PARQUET,
    OUT_PRICES,
    OUT_META,
    MANIFEST_PATH,
)
from common_cfg.s3cfg import DATA_BUCKET, PARQUET_PREFIX, AWS_REGION, AWS_PROFILE
from common_cfg.manifest import sha256_of, write_manifest_atomic
from common_cfg.s3io import maybe_upload_files_s3

# Load .env files (.env.s3 first, then .env, whichever exist).
# NOTE(review): this runs after the common_cfg imports above; if those modules
# read environment variables at import time they would not see these values —
# confirm against common_cfg.
load_dotenv_cascade()

# ---- Load the weight table and extract the Core30 constituents ----
if not WEIGHT_PARQUET.exists():
    raise FileNotFoundError(f"not found: {WEIGHT_PARQUET}")

w = pd.read_parquet(WEIGHT_PARQUET, engine="pyarrow")

# Fail on the first required column that the weight table lacks.
_absent = next(
    (name for name in ("code", "stock_name", "size_class") if name not in w.columns),
    None,
)
if _absent is not None:
    raise KeyError(f"required column missing: {_absent}")

# Work with pandas string dtype so the .str accessors below behave uniformly.
for _text_col in ("code", "size_class"):
    w[_text_col] = w[_text_col].astype("string")

# Accept both "TOPIX Core30" and "Core 30": strip spaces, then match
# case-insensitively; missing size_class values never match.
_size_compact = w["size_class"].str.replace(" ", "", regex=False)
_mask_core30 = _size_compact.str.contains("Core30", case=False, na=False)

# One row per code, keeping only the identifying columns.
core = w.loc[_mask_core30, ["code", "stock_name"]]
core = core.drop_duplicates(subset=["code"]).reset_index(drop=True)

def _to_ticker(x: str) -> str:
    s = str(x).strip()
    return s if s.endswith(".T") else f"{s}.T"

# Attach the ".T"-suffixed ticker symbol derived from each code.
core = core.assign(ticker=core["code"].map(_to_ticker))

# Nothing matched the Core30 filter: stop before any download is attempted.
if core.shape[0] == 0:
    raise RuntimeError("Core30 list is empty. Check 'size_class' values in topixweight_j.parquet.")

tickers = list(core["ticker"])

# ---- yfinance 取得（1y,1d）----
def _flatten_multi(raw: pd.DataFrame, tickers: list[str]) -> pd.DataFrame:
    frames = []
    if isinstance(raw.columns, pd.MultiIndex):
        for t in tickers:
            if t in raw.columns.get_level_values(0):
                sub = raw[t].copy()
                if sub.empty:
                    continue
                sub = sub.reset_index()
                if "Date" in sub.columns:
                    sub = sub.rename(columns={"Date": "date"})
                elif "index" in sub.columns:
                    sub = sub.rename(columns={"index": "date"})
                else:
                    sub.columns = ["date"] + [c for c in sub.columns[1:]]
                sub["ticker"] = t
                keep = [c for c in ["date","Open","High","Low","Close","Volume","ticker"] if c in sub.columns]
                frames.append(sub[keep])
    else:
        sub = raw.reset_index()
        if "Date" in sub.columns:
            sub = sub.rename(columns={"Date": "date"})
        elif "index" in sub.columns:
            sub = sub.rename(columns={"index": "date"})
        sub["ticker"] = tickers[0] if tickers else "UNKNOWN"
        keep = [c for c in ["date","Open","High","Low","Close","Volume","ticker"] if c in sub.columns]
        frames.append(sub[keep])

    if not frames:
        return pd.DataFrame(columns=["date","Open","High","Low","Close","Volume","ticker"])
    out = pd.concat(frames, ignore_index=True)
    # tz-aware → naive への安全化
    if np.issubdtype(out["date"].dtype, np.datetime64):
        try:
            out["date"] = pd.to_datetime(out["date"]).dt.tz_localize(None)
        except Exception:
            out["date"] = pd.to_datetime(out["date"], utc=True).dt.tz_localize(None)
    else:
        out["date"] = pd.to_datetime(out["date"], errors="coerce")
    for c in ["Open","High","Low","Close","Volume"]:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")
    return out

# Download options shared by the bulk request and the per-ticker fallback.
_DL_KWARGS = dict(
    period="1y",
    interval="1d",
    group_by="ticker",
    threads=True,
    progress=False,
    auto_adjust=True,
)

# First attempt: one bulk request for every ticker.
prices = pd.DataFrame()
try:
    raw = yf.download(tickers, **_DL_KWARGS)
    prices = _flatten_multi(raw, tickers)
except Exception as exc:
    # Best effort: report the reason instead of discarding it, then fall back.
    print(f"[WARN] bulk yf.download failed ({type(exc).__name__}: {exc}); falling back to per-ticker download")

# Fallback: the bulk request failed or produced no rows, so fetch one by one.
if prices.empty:
    frames = []
    for t in tickers:
        try:
            r = yf.download(t, **_DL_KWARGS)
            f = _flatten_multi(r, [t])
        except Exception as exc:
            # Keep going with the remaining tickers, but make the failure visible.
            print(f"[WARN] {t}: download failed ({type(exc).__name__}: {exc})")
            continue
        if not f.empty:
            frames.append(f)
    prices = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Volume is optional; everything else must be present to continue.
need = {"date","Open","High","Low","Close","ticker"}
if prices.empty or not need.issubset(prices.columns):
    raise RuntimeError("No price data collected or required columns missing.")

# Surface partially missing data without aborting the run.
_no_rows = sorted(set(tickers) - set(prices["ticker"]))
if _no_rows:
    print(f"[WARN] no price rows for: {', '.join(_no_rows)}")

# ---- Save ----
# Create every output directory up front: OUT_META is written first, so
# creating only OUT_PRICES.parent would fail if the two paths ever diverge.
for _out_dir in (OUT_META.parent, OUT_PRICES.parent):
    _out_dir.mkdir(parents=True, exist_ok=True)
core.to_parquet(OUT_META, engine="pyarrow", index=False)
prices.to_parquet(OUT_PRICES, engine="pyarrow", index=False)

print(f"[OK] meta  saved: {OUT_META}   rows={len(core)}")
print(f"[OK] prices saved: {OUT_PRICES} rows={len(prices)}")

# ---- Update the manifest (skipped when NO_MANIFEST is set) ----
if not NO_MANIFEST:
    items = []
    for p in [OUT_META, OUT_PRICES]:
        stat = p.stat()
        # One entry per written file: name, size, content hash and mtime (UTC, ISO 8601).
        items.append({
            "key": p.name,
            "bytes": stat.st_size,
            "sha256": sha256_of(p),
            "mtime": pd.Timestamp(stat.st_mtime, unit="s", tz="UTC").isoformat(),
        })
    write_manifest_atomic(items, MANIFEST_PATH)
    print(f"[OK] manifest updated: {MANIFEST_PATH}")
else:
    print("[INFO] PIPELINE_NO_MANIFEST=1 → manifest 更新はスキップ")

# ---- Upload to S3 ----
# The manifest is uploaded only when it was refreshed above.
# NOTE(review): NO_S3 is imported but not referenced in this cell; presumably
# maybe_upload_files_s3 honours it internally — confirm in common_cfg.s3io.
_to_upload = [OUT_META, OUT_PRICES, MANIFEST_PATH] if not NO_MANIFEST else [OUT_META, OUT_PRICES]
maybe_upload_files_s3(_to_upload, bucket=DATA_BUCKET, prefix=PARQUET_PREFIX,
                      aws_region=AWS_REGION, aws_profile=AWS_PROFILE, dry_run=False)


[OK] meta  saved: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/core30_meta.parquet   rows=30
[OK] prices saved: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/core30_prices_1y_1d.parquet rows=7320
[OK] manifest updated: /Users/hiroyukiyamanaka/Desktop/python_stock/dash_plotly/data/parquet/manifest.json
[OK] uploaded: s3://dash-plotly/parquet/core30_meta.parquet
[OK] uploaded: s3://dash-plotly/parquet/core30_prices_1y_1d.parquet
[OK] uploaded: s3://dash-plotly/parquet/manifest.json
