In [2]:
# Robust minimal fetch & save (works even if column names vary)
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import yfinance as yf

ROOT = Path.cwd()
DATA_RAW = ROOT / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)
STAMP = datetime.now().strftime("%Y%m%d-%H%M")
TICKER = "AAPL"

# 1) Download
df = yf.download(TICKER, period="6mo", interval="1d", auto_adjust=False, progress=False)
if df is None or len(df) == 0:
    raise RuntimeError("Empty download from yfinance (check network/ticker).")

# 2) Normalize columns
#    - handle possible MultiIndex
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [c[0] for c in df.columns]
df = df.reset_index()
df.columns = [str(c).strip().lower() for c in df.columns]

# 3) Find/rename a date-like column to "date"
date_candidates = ["date", "datetime", "index"]
date_col = next((c for c in date_candidates if c in df.columns), df.columns[0])
df = df.rename(columns={date_col: "date"})

# 4) Map alternative names to standard ones
rename_map = {}
if "adj close" in df.columns: rename_map["adj close"] = "adjusted_close"
if "adj_close" in df.columns: rename_map["adj_close"] = "adjusted_close"
if "adjusted close" in df.columns: rename_map["adjusted close"] = "adjusted_close"
df = df.rename(columns=rename_map)

# 5) Ensure all required columns exist; create if missing
required = ["date","open","high","low","close","adjusted_close","volume"]
for col in required:
    if col not in df.columns:
        df[col] = np.nan

# 6) Keep only required columns (now guaranteed to exist)
df = df[required].copy()

# 7) Coerce types safely
df["date"] = pd.to_datetime(df["date"], errors="coerce")
for col in ["open","high","low","close","adjusted_close","volume"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 8) Save
outpath = DATA_RAW / f"api_yfinance_{TICKER}_{STAMP}.csv"
df.to_csv(str(outpath), index=False)
print("Saved:", outpath)

df.head()


Saved: c:\Users\24614\bootcamp_Mingxuan_Jiang\homework\homework4\stage04_case_starter\data\raw\api_yfinance_AAPL_20250816-1357.csv


Unnamed: 0,date,open,high,low,close,adjusted_close,volume
0,2025-02-18,244.149994,245.179993,241.839996,244.470001,243.873062,48822500
1,2025-02-19,244.660004,246.009995,243.160004,244.869995,244.272079,32204200
2,2025-02-20,244.940002,246.779999,244.289993,245.830002,245.229736,32316900
3,2025-02-21,245.949997,248.690002,245.220001,245.550003,244.950424,53197400
4,2025-02-24,244.929993,248.860001,244.419998,247.100006,246.496643,51326400


In [3]:
from pathlib import Path
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup

ROOT = Path.cwd()
DATA_RAW = ROOT / "data" / "raw"
DATA_RAW.mkdir(parents=True, exist_ok=True)
STAMP = datetime.now().strftime("%Y%m%d-%H%M")

url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
html = requests.get(url, timeout=30).text
soup = BeautifulSoup(html, "lxml")

candidate = None
for tbl in soup.select("table.wikitable"):
    headers = [th.get_text(strip=True).lower() for th in tbl.select("tr th")]
    if any("symbol" in h for h in headers) and any(("company" in h) or ("constituent" in h) or ("name" in h) for h in headers):
        candidate = tbl
        break
if candidate is None:
    candidate = soup.select_one("table.wikitable")

rows = []
headers = [th.get_text(strip=True) for th in candidate.select("tr th")]
for tr in candidate.select("tr")[1:]:
    cells = [td.get_text(strip=True) for td in tr.select("td")]
    if len(cells) < 2: 
        continue
    row = {}
    for i, val in enumerate(cells):
        col = headers[i] if i < len(headers) else f"col_{i}"
        row[col] = val
    rows.append(row)

df2 = pd.DataFrame(rows)
rename_map = {}
for col in df2.columns:
    low = col.lower()
    if "symbol" in low: rename_map[col] = "Symbol"
    if ("company" in low) or ("constituent" in low) or ("name" in low): rename_map[col] = "Company"
    if "weight" in low: rename_map[col] = "Weight"
df2 = df2.rename(columns=rename_map)

keep = [c for c in ["Symbol","Company","Weight"] if c in df2.columns]
if not keep:
    keep = df2.columns[:3]
df2 = df2[keep].copy()

outpath2 = DATA_RAW / f"scrape_wikipedia_djia_{STAMP}.csv"
df2.to_csv(str(outpath2), index=False)
print("Saved:", outpath2)

df2.head()


Saved: c:\Users\24614\bootcamp_Mingxuan_Jiang\homework\homework4\stage04_case_starter\data\raw\scrape_wikipedia_djia_20250816-1357.csv


Unnamed: 0,Symbol,Company
0,Conglomerate,NYSE
1,Financial services,NYSE
2,Biopharmaceutical,NASDAQ
3,Retailing,NASDAQ
4,Information technology,NASDAQ
