# 01 — Raw price data (WTI / XLE / ICLN)

Data source: Yahoo Finance daily prices, 2018–2024 (Adj Close when available; otherwise Close).

Oil proxy: **CL=F** (front-month WTI futures). This is a futures series (roll effects, term structure), not a spot price.

Output: a single aligned price panel saved for Notebook 02 (returns + features).

In [1]:
# Imports

import yfinance as yf
import pandas as pd
from pathlib import Path

# Render floats with six decimal places in pandas output
pd.set_option("display.float_format", "{:.6f}".format)

In [2]:
# Project settings: sample window, tickers, and output location

# yfinance treats `end` as exclusive, so 2025-01-01 keeps 2024-12-31 in the sample
start_date, end_date_exclusive = "2018-01-01", "2025-01-01"

tickers = [
    "CL=F",
    "XLE",
    "ICLN",
]

# Create the data folder up front so the final save cannot fail on a missing directory
data_dir = Path("../data")
data_dir.mkdir(parents=True, exist_ok=True)

prices_path = data_dir.joinpath("prices_2018_2024.parquet")

In [3]:
# Helper

def download_adj_close(ticker: str) -> pd.Series:
    """Download one ticker's daily price history from Yahoo Finance as a Series.

    Uses the "Adj Close" field when the download contains it and falls back to
    "Close" otherwise. The date window is read from the notebook-level
    ``start_date`` and ``end_date_exclusive`` settings.

    Parameters
    ----------
    ticker : str
        Yahoo Finance symbol to download.

    Returns
    -------
    pd.Series
        The selected price field, named after ``ticker``, with its index
        passed through ``pd.to_datetime``.

    Raises
    ------
    ValueError
        If the download is empty, if neither "Adj Close" nor "Close" is
        present, or if the selected field does not map to exactly one column.
    """
    df = yf.download(
        ticker,
        start=start_date,
        end=end_date_exclusive,
        auto_adjust=False,
        progress=False,
    )

    if df.empty:
        raise ValueError(f"No data returned for {ticker}")

    # yfinance can hand back two-level columns (price field, ticker) even for a
    # single symbol, so look the field name up on the first level in that case.
    if isinstance(df.columns, pd.MultiIndex):
        fields = df.columns.get_level_values(0)
    else:
        fields = df.columns

    if "Adj Close" in fields:
        col = "Adj Close"
    elif "Close" in fields:
        col = "Close"
    else:
        raise ValueError(f"No usable price column for {ticker}")

    s = df[col]

    # With two-level columns, df[col] is a one-column DataFrame rather than a
    # Series; collapse it so the declared return type actually holds and the
    # ticker name ends up as the Series name (not as a stray attribute).
    if isinstance(s, pd.DataFrame):
        if s.shape[1] != 1:
            raise ValueError(f"No usable price column for {ticker}")
        s = s.iloc[:, 0]

    s = s.copy()
    s.name = ticker
    s.index = pd.to_datetime(s.index)

    return s

In [4]:
# Download every ticker, align on the shared date index, keep complete rows only
series = []
for symbol in tickers:
    print(symbol)
    series += [download_adj_close(symbol)]

# Column-wise concat takes the union of dates; sort so the panel is chronological
prices_aligned = pd.concat(series, axis=1).sort_index()

# Drop any date on which at least one instrument has no price
prices = prices_aligned.dropna(axis=0, how="any")

n_before, n_after = len(prices_aligned), len(prices)

display(prices.head())
print(f"Rows kept: {n_after} (dropped {n_before - n_after})")

CL=F
XLE
ICLN


Ticker,CL=F,XLE,ICLN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-02,60.369999,25.915483,8.423616
2018-01-03,61.630001,26.303602,8.405807
2018-01-04,62.009998,26.462381,8.343476
2018-01-05,61.439999,26.451788,8.37019
2018-01-08,61.73,26.610567,8.49485


Rows kept: 1761 (dropped 0)


In [5]:
# Data checks: the panel must contain no missing values; then show the date span

assert prices.notna().all().all()

first_date, last_date = prices.index.min(), prices.index.max()
(first_date, last_date)

(Timestamp('2018-01-02 00:00:00'), Timestamp('2024-12-31 00:00:00'))

In [6]:
# Save the aligned price panel for Notebook 02

prices.to_parquet(path=prices_path)