# Stage 04 — Data Acquisition & Ingestion (Classroom Case, FIXED)

This notebook includes the patched `validate_df` that avoids the `TypeError: arg must be a list`.
Run cells from top to bottom.


In [1]:
# --- Setup & Utilities (English-only comments) ---
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Paths and timestamp
ROOT = Path.cwd()
DATA_RAW = ROOT / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)
STAMP = datetime.now().strftime("%Y%m%d-%H%M")

def _ensure_list(x):
    """Normalize a single string to a list, keep lists/tuples as-is, ignore None."""
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return list(x)
    return [x]

def validate_df(
    df: pd.DataFrame,
    required_cols=None,
    numeric_cols=None,
    date_cols=None,
    min_rows: int = 1
):
    """Minimal validation for required columns, types, NA counts, and row count."""
    assert isinstance(df, pd.DataFrame), "Input is not a DataFrame"
    assert len(df) >= min_rows, f"Not enough rows: {len(df)}"

    required_cols = _ensure_list(required_cols)
    numeric_cols  = _ensure_list(numeric_cols)
    date_cols     = _ensure_list(date_cols)

    if required_cols:
        missing = [c for c in required_cols if c not in df.columns]
        assert not missing, f"Missing columns: {missing}"

    for c in date_cols:
        if c in df.columns and not pd.api.types.is_datetime64_any_dtype(df[c]):
            df[c] = pd.to_datetime(df[c], errors="coerce", utc=False)

    for c in numeric_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    na_counts = df.isna().sum()
    if na_counts.any():
        print("NA counts (non-zero only):")
        print(na_counts[na_counts > 0].sort_values(ascending=False))
    return df


In [2]:
# --- Part A: API-like fetch via yfinance (no key required) ---
import yfinance as yf

TICKER = "AAPL"  # You may change this to any valid ticker

def fetch_market_data(ticker: str) -> pd.DataFrame:
    # Download last 6 months of daily data
    df = yf.download(ticker, period="6mo", interval="1d", auto_adjust=False, progress=False)
    df = df.reset_index().rename(columns=str.lower)  # 'Date' -> 'date', etc.
    # Ensure adjusted_close exists even if not provided
    if "adj close" in df.columns:
        df = df.rename(columns={"adj close": "adjusted_close"})
    elif "adjusted_close" not in df.columns:
        import numpy as np
        df["adjusted_close"] = np.nan

    cols = ["date", "open", "high", "low", "close", "adjusted_close", "volume"]
    df = df[cols]

    df = validate_df(
        df,
        required_cols=["date", "open", "high", "low", "close", "volume"],
        numeric_cols=["open", "high", "low", "close", "adjusted_close", "volume"],
        date_cols=["date"],
        min_rows=5
    ).sort_values("date").reset_index(drop=True)

    outpath = DATA_RAW / f"api_yfinance_{ticker.upper()}_{STAMP}.csv"
    df.to_csv(outpath, index=False)
    print(f"Saved: {outpath}")
    return df

api_df = fetch_market_data(TICKER)
api_df.head()


TypeError: arg must be a list, tuple, 1-d array, or Series

In [None]:
# --- Part B: Scrape a small public table (Wikipedia: DJIA constituents) ---
WIKI_URL = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"

def fetch_djia_table(url: str = WIKI_URL) -> pd.DataFrame:
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "lxml")

    candidate = None
    for tbl in soup.select("table.wikitable"):
        headers = [th.get_text(strip=True).lower() for th in tbl.select("tr th")]
        if any("symbol" in h for h in headers) and any(("company" in h) or ("constituent" in h) or ("name" in h) for h in headers):
            candidate = tbl
            break
    if candidate is None:
        candidate = soup.select_one("table.wikitable")

    rows = []
    headers = [th.get_text(strip=True) for th in candidate.select("tr th")]
    for tr in candidate.select("tr")[1:]:
        cells = [td.get_text(strip=True) for td in tr.select("td")]
        if not cells or len(cells) < 2:
            continue
        row = {}
        for i, val in enumerate(cells):
            col = headers[i] if i < len(headers) else f"col_{i}"
            row[col] = val
        rows.append(row)

    df = pd.DataFrame(rows)

    rename_map = {}
    for col in df.columns:
        low = col.lower()
        if "symbol" in low: rename_map[col] = "Symbol"
        if ("company" in low) or ("constituent" in low) or ("name" in low): rename_map[col] = "Company"
        if "weight" in low: rename_map[col] = "Weight"
    df = df.rename(columns=rename_map)

    keep = [c for c in ["Symbol", "Company", "Weight"] if c in df.columns]
    if not keep:
        keep = df.columns[:3]
    df = df[keep].copy()

    numeric_cols = [c for c in df.columns if c.lower() in {"weight"}]
    df = validate_df(df, required_cols=[keep[0], keep[1]], numeric_cols=numeric_cols, min_rows=10)

    outpath = DATA_RAW / f"scrape_wikipedia_djia_{STAMP}.csv"
    df.to_csv(outpath, index=False)
    print(f"Saved: {outpath}")
    return df

scrape_df = fetch_djia_table()
scrape_df.head()


## Documentation

**Sources**
- API-like source: `yfinance` for daily OHLCV data. No API key required.
- Scraped table: Wikipedia page "Dow Jones Industrial Average".

**Parameters**
- Market data: `period="6mo"`, `interval="1d"`, `auto_adjust=False`.
- Scraping: first `wikitable` that contains headers similar to "Symbol" and "Company".

**Validation**
- Market data must contain `date, open, high, low, close, volume`.
- Convert date to datetime and price/volume to numeric.
- Minimum row count: 5.
- Scraped table must contain at least two descriptive columns (`Symbol`, `Company`) and at least 10 rows.

**File Naming & Location**
- Save raw CSVs under `data/raw/` with a timestamp.
  - `api_yfinance_<TICKER>_<YYYYMMDD-HHMM>.csv`
  - `scrape_wikipedia_djia_<YYYYMMDD-HHMM>.csv`

**Assumptions & Risks**
- Wikipedia layout may change; parsing uses flexible selectors.
- Free data sources can have occasional delays or missing fields.
- Market holidays can create gaps in dates.
