In [2]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 01 — Data Preprocessing\n",
    "Fetch ~200 equities via `yfinance`, compute daily log returns, save CSVs under `data/`."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import os, json, yaml\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import yfinance as yf\n",
    "from pathlib import Path\n",
    "\n",
    "PROJ = Path(\"..\")\n",
    "DATA_RAW = PROJ/\"data\"/\"raw\"\n",
    "DATA_PROC = PROJ/\"data\"/\"processed\"\n",
    "DATA_RAW.mkdir(parents=True, exist_ok=True)\n",
    "DATA_PROC.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with open(PROJ/\"configs\"/\"params.yaml\", \"r\") as fh:\n",
    "    cfg = yaml.safe_load(fh)\n",
    "\n",
    "start = cfg['dates']['start']\n",
    "end = cfg['dates']['end']\n",
    "tickers = cfg['universe']['tickers']\n",
    "max_assets = cfg['windows']['max_assets']\n",
    "\n",
    "if not tickers:\n",
    "    # Fallback universe: liquid S&P 500 subset via yfinance's ticker list\n",
    "    # We use a static popular subset to avoid scraping complexities.\n",
    "    fallback = [\n",
    "        'AAPL','MSFT','NVDA','AMZN','META','GOOGL','BRK-B','LLY','JPM','XOM','UNH','AVGO','TSLA','V','WMT','JNJ','PG','MA','HD','MRK',\n",
    "        'PEP','CVX','ABBV','COST','BAC','KO','ADBE','ORCL','NFLX','ACN','TMO','CSCO','MCD','LIN','CRM','ABT','AMD','WFC','INTU','DHR',\n",
    "        'TXN','PM','AMAT','IBM','UNP','MS','HON','CAT','GE','AMGN','BKNG','LMT','QCOM','BA','SBUX','GS','LOW','BLK','RTX','SPGI',\n",
    "        'INTC','GILD','ISRG','MDT','ADP','SYK','DE','CB','NOW','ELV','PLD','TJX','ZTS','CME','SCHW','SO','MO','MU','PNC','CI','PGR',\n",
    "        'MMC','BK','USB','NKE','BDX','REGN','BSX','TGT','DUK','ICE','ADI','EQIX','HCA','ETN','CL','MDLZ','AON','APH','GD','FDX',\n",
    "        'EOG','C','LRCX','PH','SHW','ALL','AEP','FISV','KLAC','MCO','MAR','ITW','CSX','EW','AIG','AFL','TRV','COP','SLB','OXY',\n",
    "        'PFE','CVS','KMB','PSA','HPQ','ORLY','AZO','ALGN','CMG','MNST','ROST','CTAS','SNPS','CDNS','NXPI','MSI','ADM','TEL','PRU',\n",
    "        'HUM','A','TT','KMI','SRE','D','BMY','ILMN','TFC','AMP','HLT','RSG','DLR','VLO','KHC','VZ','T','APA','WBA','EA','ATVI',\n",
    "        'PAYX','PCAR','KDP','EBAY','WELL','NEM','LULU','NOC','AVB','ED','ECL','CPRT','MCHP','XEL','DXCM','ROP','STZ','ANSS','RMD',\n",
    "        'WBD','ODFL','AEE','EXC','PEG','DTE','HES','MTB','WEC','LUV','PPG','PPD','UPS','KR','GIS','HSY','MKC','DG','DLTR','LEN'\n",
    "    ]\n",
    "    tickers = fallback[:max_assets]\n",
    "\n",
    "print(f\"Universe size: {len(tickers)}\")\n",
    "px = yf.download(tickers, start=start, end=end, auto_adjust=True)['Close']\n",
    "px = px.dropna(how='all').dropna(axis=1, how='any')\n",
    "px.to_csv(DATA_RAW/\"prices.csv\")\n",
    "rets = np.log(px).diff().dropna()\n",
    "rets.to_csv(DATA_PROC/\"returns_log.csv\")\n",
    "print(px.shape, rets.shape)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {"name": "python", "version": "3.x"}
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# 01 — Data Preprocessing\n',
    'Fetch ~200 equities via `yfinance`, compute daily log returns, save CSVs under `data/`.']},
  {'cell_type': 'code',
   'metadata': {},
   'source': ['import os, json, yaml\n',
    'import numpy as np\n',
    'import pandas as pd\n',
    'import yfinance as yf\n',
    'from pathlib import Path\n',
    '\n',
    'PROJ = Path("..")\n',
    'DATA_RAW = PROJ/"data"/"raw"\n',
    'DATA_PROC = PROJ/"data"/"processed"\n',
    'DATA_RAW.mkdir(parents=True, exist_ok=True)\n',
    'DATA_PROC.mkdir(parents=True, exist_ok=True)\n',
    '\n',
    'with open(PROJ/"configs"/"params.yaml", "r") as fh:\n',
    '    cfg = yaml.safe_load(fh)\n',
    '\n',
    "start = cfg['dates']['start']\n",
    "end = cfg['dates']['end']\n",
    "tickers = cfg['universe']['tickers']\n",
    "max_assets = cfg['windows']['max_assets']\n",
    '\n',
    'if not tickers:\n',
    "    # Fallback universe: liquid S&P 500