In [1]:
import sys, pathlib, os
from datetime import datetime
ROOT = pathlib.Path.cwd().parent
sys.path.insert(0, str(ROOT))

from src.config import load_env, get
load_env()

RAW_DIR = ROOT / "data" / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

def ts(): return datetime.now().strftime("%Y%m%d-%H%M")
def save_csv(df, path):
    df.to_csv(path, index=False)
    print("Saved:", path)


In [3]:
import pandas as pd, requests, yfinance as yf

TICKER = "AAPL"
AV_KEY = get("ALPHAVANTAGE_API_KEY")

def ingest_api_finance(ticker: str) -> pd.DataFrame:
    if AV_KEY:
        url = "https://www.alphavantage.co/query"
        params = {"function":"TIME_SERIES_DAILY_ADJUSTED","symbol":ticker,"apikey":AV_KEY,"outputsize":"compact"}
        js = requests.get(url, params=params, timeout=30).json()
        ts_key = "Time Series (Daily)"
        assert ts_key in js, f"Unexpected payload keys: {list(js)[:5]}"
        recs = []
        for dt, row in js[ts_key].items():
            recs.append({
                "date": pd.to_datetime(dt),
                "open": float(row["1. open"]), "high": float(row["2. high"]),
                "low": float(row["3. low"]), "close": float(row["4. close"]),
                "adj_close": float(row.get("5. adjusted close", row["4. close"])),
                "volume": int(row["6. volume"]), "ticker": ticker
            })
        df = pd.DataFrame(recs).sort_values("date").reset_index(drop=True)
    else:
        df = yf.download(ticker, period="6mo", interval="1d", auto_adjust=False).reset_index()
        df = df.rename(columns={"Date":"date","Open":"open","High":"high","Low":"low",
                                "Close":"close","Adj Close":"adj_close","Volume":"volume"})
        df["ticker"] = ticker
    return df

df_api = ingest_api_finance(TICKER)
df_api.head()


[*********************100%***********************]  1 of 1 completed


Price,date,adj_close,close,high,low,open,volume,ticker
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 8_level_1
0,2025-02-18,243.873062,244.470001,245.179993,241.839996,244.149994,48822500,AAPL
1,2025-02-19,244.272079,244.869995,246.009995,243.160004,244.660004,32204200,AAPL
2,2025-02-20,245.229752,245.830002,246.779999,244.289993,244.940002,32316900,AAPL
3,2025-02-21,244.950424,245.550003,248.690002,245.220001,245.949997,53197400,AAPL
4,2025-02-24,246.496643,247.100006,248.860001,244.419998,244.929993,51326400,AAPL


In [7]:
import pandas as pd

num_cols = ["open","high","low","close","adj_close","volume"]

# 1) Coerce numeric-like columns
df_api[num_cols] = df_api[num_cols].apply(pd.to_numeric, errors="coerce")

# 2) Drop rows that are invalid after coercion (safer than filling)
req = ["date","open","high","low","close","adj_close","volume","ticker"]
df_api = df_api.dropna(subset=req)

# 3) Enforce dtypes (volume as int)
if not pd.api.types.is_integer_dtype(df_api["volume"]):
    df_api["volume"] = df_api["volume"].astype("int64")

# 4) Re-run validations
missing = [c for c in req if c not in df_api.columns]
assert not missing, f"Missing columns: {missing}"
assert pd.api.types.is_datetime64_any_dtype(df_api["date"]), "date must be datetime"
for c in ["open","high","low","close","adj_close"]:
    assert pd.api.types.is_numeric_dtype(df_api[c]), f"{c} must be numeric"
assert pd.api.types.is_integer_dtype(df_api["volume"]), "volume must be int"
assert df_api["ticker"].nunique() == 1, "expected a single ticker"
assert df_api[req].isna().sum().sum() == 0, "no NAs allowed in required columns"

# 5) Save
api_path = RAW_DIR / (f"api_alphavantage_{TICKER}_{ts()}.csv" if AV_KEY else f"api_yahoo_{TICKER}_{ts()}.csv")
save_csv(df_api, api_path)
print("Saved:", api_path)
print(df_api.dtypes)
print(df_api[num_cols].head())
print(df_api[num_cols].isna().sum())


KeyError: ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume', 'ticker']

In [None]:
from bs4 import BeautifulSoup
import lxml, requests

URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
html = requests.get(URL, timeout=30).text
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"id":"constituents"}) or soup.find("table", {"class":"wikitable"})
rows = []
for tr in table.select("tr")[1:]:
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    if len(tds) >= 2:
        rows.append({"symbol": tds[0], "security": tds[1]})
df_spx = pd.DataFrame(rows)
df_spx.head()


In [None]:
assert not df_spx.empty
for c in ["symbol","security"]:
    assert c in df_spx.columns and df_spx[c].dtype == "object"
assert df_spx[["symbol","security"]].isna().sum().sum() == 0

scrape_path = RAW_DIR / f"scrape_wikipedia_sp500_{ts()}.csv"
save_csv(df_spx, scrape_path)


### Sources & Parameters
- Finance API: Alpha Vantage (if key present) or yfinance for AAPL.
- Scrape: Wikipedia S&P 500 constituents.

**Validation**
- API: required columns, dtypes, no NAs, single ticker.
- Scrape: required text columns present, no NAs.

**Assumptions/Risks**
- Wiki table structure may change; API may throttle; adjusted-close semantics differ by source.
