In [5]:
import os
import time
import pandas as pd
import numpy as np
import yfinance as yf
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# ================= CONFIG =================
OUTDIR = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data"
os.makedirs(OUTDIR, exist_ok=True)
SAVE_PATH = os.path.join(OUTDIR, "SP500_Consolidated.zip")
YEARS = 10
WORKERS = 10
# ==========================================

def get_sp500_tickers():
    """Fetch S&P500 tickers from Wikipedia using requests (avoids 403)."""
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    tables = pd.read_html(resp.text)
    tickers = tables[0]["Symbol"].tolist()
    mapping = {"BRK.B": "BRK-B", "BF.B": "BF-B"}
    tickers = [mapping.get(t, t) for t in tickers]
    return tickers

def fetch_one(ticker):
    """Fetch OHLCV + fundamentals for one ticker."""
    try:
        t = yf.Ticker(ticker)
        hist = t.history(period=f"{YEARS}y", auto_adjust=False)
        if hist.empty:
            return None

        hist = hist.rename(columns={
            "Open": "Open", "High": "High", "Low": "Low",
            "Close": "Close", "Volume": "Volume"
        })
        hist["Ticker"] = ticker

        # fundamentals
        info = t.info
        fundamentals = {
            "PE_TTM": info.get("trailingPE"),
            "PB_Ratio": info.get("priceToBook"),
            "PS_Ratio": info.get("priceToSalesTrailing12Months"),
            "EPS_TTM": info.get("trailingEps"),
            "DividendYield": info.get("dividendYield"),
            "ROE": info.get("returnOnEquity"),
            "SharesOutstanding": info.get("sharesOutstanding"),
        }
        for k, v in fundamentals.items():
            hist[k] = v

        # derived
        hist["MarketCap"] = hist["Close"] * hist["SharesOutstanding"]
        hist["TurnoverProxy"] = hist["Volume"] / hist["SharesOutstanding"]

        hist = hist.reset_index().rename(columns={"Date": "date"})
        hist = hist.set_index(["date", "Ticker"]).sort_index()
        return hist

    except Exception as e:
        print(f"Failed {ticker}: {e}")
        return None

def build_sp500_consolidated():
    tickers = get_sp500_tickers()
    print(f"Fetched {len(tickers)} tickers from Wikipedia.")

    results = []
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(fetch_one, t): t for t in tickers}
        for i, f in enumerate(as_completed(futures), 1):
            df = f.result()
            if df is not None:
                results.append(df)
            if i % 10 == 0:
                print(f"Completed {i}/{len(tickers)} tickers...")

    if not results:
        raise RuntimeError("No data fetched. Check yfinance connectivity or rate limits.")

    panel = pd.concat(results)
    panel = panel.groupby("Ticker").apply(lambda x: x.ffill().bfill()).reset_index(level=0, drop=True)
    panel = panel.sort_index()

    panel.to_csv(SAVE_PATH, compression="zip")
    print(f"\nSaved consolidated S&P500 dataset to:\n{SAVE_PATH}")
    return panel

# ================= RUN =================
if __name__ == "__main__":
    start = time.time()
    try:
        df = build_sp500_consolidated()
        print(f"Done in {(time.time()-start)/60:.2f} min")
    except Exception as e:
        print(f"Fatal error: {e}")


  tables = pd.read_html(resp.text)


Fetched 503 tickers from Wikipedia.
Completed 10/503 tickers...
Completed 20/503 tickers...
Completed 30/503 tickers...
Completed 40/503 tickers...
Completed 50/503 tickers...
Completed 60/503 tickers...
Completed 70/503 tickers...
Completed 80/503 tickers...
Completed 90/503 tickers...
Completed 100/503 tickers...
Completed 110/503 tickers...
Completed 120/503 tickers...
Completed 130/503 tickers...
Completed 140/503 tickers...
Completed 150/503 tickers...
Completed 160/503 tickers...
Completed 170/503 tickers...
Completed 180/503 tickers...
Completed 190/503 tickers...
Completed 200/503 tickers...
Completed 210/503 tickers...
Completed 220/503 tickers...
Completed 230/503 tickers...
Completed 240/503 tickers...
Completed 250/503 tickers...
Completed 260/503 tickers...
Completed 270/503 tickers...
Completed 280/503 tickers...
Completed 290/503 tickers...
Completed 300/503 tickers...
Completed 310/503 tickers...
Completed 320/503 tickers...
Completed 330/503 tickers...
Completed 340/50

  panel = pd.concat(results)



Saved consolidated S&P500 dataset to:
C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\SP500_Consolidated.zip
Done in 2.36 min


In [None]:
import zipfile
import pandas as pd

SAVE_PATH = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\SP500_Consolidated.zip"

# Look inside the ZIP to find the CSV name
with zipfile.ZipFile(SAVE_PATH, 'r') as z:
    print(z.namelist())  # list of files inside
    csv_name = z.namelist()[0]  # first (and probably only) file
    with z.open(csv_name) as f:
        df = pd.read_csv(f)

print("Shape of data:", df.shape)
print("Columns:", df.columns.tolist())  


['SP500_Consolidated']
Shape of data: (1231178, 19)
Columns: ['date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Dividends', 'Stock Splits', 'PE_TTM', 'PB_Ratio', 'PS_Ratio', 'EPS_TTM', 'DividendYield', 'ROE', 'SharesOutstanding', 'MarketCap', 'TurnoverProxy']
