In [3]:
# ==== CONFIG (edit) ====
# Ticker symbol to download; passed to pull_and_save() as `ticker`.
TICKER = "AAPL"
# Directory the raw CSV is written to; pull_and_save() creates it if missing.
RAW_DIR = "data/raw"

# ==== IMPORTS ====
import os, re, requests, pandas as pd
from pathlib import Path
from datetime import datetime
try:
    from dotenv import load_dotenv; load_dotenv()
except Exception:
    pass

# ==== FETCHERS ====
def fetch_alpha_vantage_daily(ticker: str) -> pd.DataFrame:
    """Download the full daily adjusted series for *ticker* from Alpha Vantage.

    The API key is read from the ``ALPHAVANTAGE_API_KEY`` environment
    variable.

    Args:
        ticker: Symbol sent to the API as the ``symbol`` parameter.

    Returns:
        A DataFrame with one row per entry of the ``"Time Series (Daily)"``
        payload. The payload keys become a ``Date`` column and the numbered
        API fields are renamed to ``Open``, ``High``, ``Low``, ``Close``,
        ``Adj Close`` and ``Volume``. No dtype conversion is performed here.

    Raises:
        RuntimeError: If the API key is missing, the API answered with a
            message (``Note`` / ``Information`` / ``Error Message``) instead
            of data, or the payload has no daily time series.
        requests.HTTPError: If the HTTP status indicates failure.
    """
    key = os.getenv("ALPHAVANTAGE_API_KEY")
    if not key:
        raise RuntimeError("No ALPHAVANTAGE_API_KEY in .env")

    response = requests.get(
        "https://www.alphavantage.co/query",
        params={
            "function": "TIME_SERIES_DAILY_ADJUSTED",
            "symbol": ticker,
            "outputsize": "full",
            "apikey": key,
            "datatype": "json",
        },
        timeout=30,
    )
    response.raise_for_status()

    data = response.json()
    if not isinstance(data, dict):
        # Keep the failure inside the function's RuntimeError contract instead
        # of letting a later .get() blow up with an AttributeError.
        raise RuntimeError(f"Unexpected AV response type: {type(data).__name__}")

    # The API reports throttling, premium-only endpoints and invalid calls as
    # a JSON object holding a message rather than as an HTTP error status.
    notice_keys = ("Note", "Information", "Error Message")
    if any(k in data for k in notice_keys):
        raise RuntimeError(next((data[k] for k in notice_keys if data.get(k)), None))

    series = data.get("Time Series (Daily)")
    if not series:
        raise RuntimeError(f"Unexpected AV response keys: {list(data)[:6]}")

    column_names = {
        "1. open": "Open",
        "2. high": "High",
        "3. low": "Low",
        "4. close": "Close",
        "5. adjusted close": "Adj Close",
        "6. volume": "Volume",
    }
    # The payload maps date -> {field: value}; transpose so dates become rows.
    frame = pd.DataFrame(series).T.rename(columns=column_names)
    return frame.reset_index().rename(columns={"index": "Date"})

def fetch_yfinance(ticker: str) -> pd.DataFrame:
    """Download the maximum available daily history for *ticker* via yfinance.

    Args:
        ticker: Symbol passed straight to ``yfinance.download``.

    Returns:
        The downloaded frame with its index moved into a regular column and
        with flat, string column labels. When the columns arrive as a
        MultiIndex and one level holds the price field names, only that level
        is kept; otherwise the levels are joined with a space
        (e.g. ``"Close AAPL"``).

    Raises:
        RuntimeError: If yfinance returns ``None`` or an empty frame.
    """
    # Imported lazily so the module can be loaded without yfinance installed.
    import yfinance as yf

    df = yf.download(
        ticker,
        period="max",
        auto_adjust=False,
        progress=False,
        group_by="column",   # request field-first column grouping
    )
    if df is None or df.empty:
        raise RuntimeError("yfinance returned no data")

    if isinstance(df.columns, pd.MultiIndex):
        # The field names may sit on either level depending on the yfinance
        # version/options, so probe every level rather than only the last.
        core_fields = {"Open", "High", "Low", "Close", "Volume"}
        field_labels = None
        for level in range(df.columns.nlevels):
            labels = df.columns.get_level_values(level)
            # Require uniqueness: with several tickers, dropping the ticker
            # level would leave duplicate column names.
            if core_fields.issubset(labels) and labels.is_unique:
                field_labels = labels
                break

        if field_labels is not None:
            df.columns = [str(label) for label in field_labels]
        else:
            # Generic flatten: join the non-empty level values with a space.
            df.columns = [
                " ".join(str(part) for part in tup if part not in (None, "")).strip()
                for tup in df.columns.to_list()
            ]
    else:
        df.columns = [str(c) for c in df.columns]

    return df.reset_index()

# ==== STANDARDIZE & VALIDATE (ultra-tolerant) ===
def standardize_and_validate(df: pd.DataFrame) -> pd.DataFrame:
    """Rename price columns to canonical names, coerce dtypes and validate.

    Columns are located by case-insensitive substring match, and the first
    matching column in column order wins. This tolerates labels such as
    ``"Close AAPL"``. The input frame is left untouched.

    Args:
        df: Raw price table; columns may be a flat Index or a MultiIndex.

    Returns:
        A new DataFrame sorted by ``Date`` ascending with a fresh integer
        index. Canonical columns (``Date``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Adj Close`` when present, ``Volume``) come first, followed
        by any remaining columns in their original order.

    Raises:
        ValueError: If a required column cannot be found, the frame is empty,
            or ``Date`` / ``Close`` contain missing values after coercion.
    """
    # Work on a copy: assigning to ``df.columns`` below would otherwise
    # rewrite the caller's column index in place.
    df = df.copy()

    # 1) force string column labels, flattening a MultiIndex if needed
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            " ".join(str(part) for part in tup if part not in (None, "")).strip()
            for tup in df.columns.to_list()
        ]
    else:
        df.columns = [str(c) for c in df.columns]

    # 2) tolerant matcher: first column containing every needle and no
    #    excluded fragment (all compared in lower case)
    def find_col(cols, *must_have, exclude=()):
        needles = [m.lower() for m in must_have]
        banned = [e.lower() for e in exclude]
        for c in cols:
            lc = c.lower()
            if all(n in lc for n in needles) and not any(b in lc for b in banned):
                return c
        return None

    cols = df.columns
    # Order matters: if one column is matched for two roles, the later
    # entry wins when the rename mapping is built.
    located = [
        (find_col(cols, "date") or find_col(cols, "timestamp") or find_col(cols, "datetime"), "Date"),
        (find_col(cols, "open"), "Open"),
        (find_col(cols, "high"), "High"),
        (find_col(cols, "low"), "Low"),
        # accept things like "Close AAPL", but never the adjusted close
        (find_col(cols, "close", exclude=["adj"]), "Close"),
        (find_col(cols, "adj", "close"), "Adj Close"),
        (find_col(cols, "volume") or find_col(cols, "vol"), "Volume"),
    ]

    # 3) rename to canonical
    df = df.rename(columns={found: canonical for found, canonical in located if found})

    # 4) dtypes (unparseable values become NaT / NaN and are checked below)
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    for c in ["Open", "High", "Low", "Close", "Adj Close"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    if "Volume" in df.columns:
        volume = pd.to_numeric(df["Volume"], errors="coerce")
        try:
            # Nullable integer keeps missing volumes without going to float.
            df["Volume"] = volume.astype("Int64")
        except Exception:
            # Best effort: keep the plain numeric series if the cast fails.
            df["Volume"] = volume

    # 5) validate
    required = ["Date", "Open", "High", "Low", "Close", "Volume"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print("DEBUG columns after rename:", list(df.columns))
        raise ValueError(f"Missing required columns: {missing}")
    if df.empty:
        raise ValueError("Empty DataFrame after fetch")
    na = df[required].isna().sum()
    # Only Date and Close are treated as critical; gaps elsewhere are allowed.
    if na.get("Date", 0) > 0 or na.get("Close", 0) > 0:
        raise ValueError(f"NAs in critical columns:\n{na}")

    # 6) order & sort
    ordered = [
        c for c in ["Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"]
        if c in df.columns
    ]
    extras = [c for c in df.columns if c not in ordered]
    return df[ordered + extras].sort_values("Date").reset_index(drop=True)


# ==== PIPELINE ====
def pull_and_save(ticker: str, raw_dir: str) -> str:
    """Fetch daily prices for *ticker*, standardize them and write a CSV.

    Alpha Vantage is tried first; on any exception the error is printed and
    yfinance is used instead. The file is named
    ``<TICKER>_<source>_<YYYYmmdd_HHMMSS>.csv`` inside *raw_dir*, which is
    created when it does not exist.

    Returns:
        The path of the written CSV as a string.
    """
    target_dir = Path(raw_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    source = "alphavantage"
    try:
        frame = fetch_alpha_vantage_daily(ticker)
    except Exception as err:
        # Any failure of the primary source triggers the fallback.
        print(f"Alpha Vantage failed → {err}\nFalling back to yfinance.")
        frame = fetch_yfinance(ticker)
        source = "yfinance"

    frame = standardize_and_validate(frame)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = target_dir / f"{ticker.upper()}_{source}_{stamp}.csv"
    frame.to_csv(destination, index=False)
    print(f"✅ Saved {len(frame):,} rows → {destination}")
    return str(destination)


In [4]:
# Run the pipeline for the configured ticker and keep the path of the CSV it wrote.
csv_path = pull_and_save(TICKER, RAW_DIR)
csv_path  # bare expression so the notebook displays the path


Alpha Vantage failed → Thank you for using Alpha Vantage! This is a premium endpoint. You may subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly unlock all premium endpoints
Falling back to yfinance.
✅ Saved 11,265 rows → data/raw/AAPL_yfinance_20250824_004133.csv


'data/raw/AAPL_yfinance_20250824_004133.csv'

STEP 2: Scrape a Small Table