# Real Market Data Collection & EDA

This notebook downloads real daily price data for Bitcoin (BTC-USD) and the S&P 500 (^GSPC) using Yahoo Finance via the `yfinance` Python package.

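The data is saved to a CSV file (`data/raw/synthetic_btc_sp500.csv`; despite `synthetic` in the file name, the contents are the downloaded market prices) and then used for exploratory data analysis (EDA), including summary statistics of daily returns and the correlation between Bitcoin and S&P 500 returns.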

In [13]:
# Install yfinance if not already installed
import sys
!{sys.executable} -m pip install yfinance --quiet

import yfinance as yf
import pandas as pd
from pathlib import Path

# Download window: a fixed start day, running up to the day the notebook is executed.
# Both bounds are kept as ISO "YYYY-MM-DD" strings.
start_date = "2020-01-01"
end_date = pd.Timestamp.today().date().isoformat()

def download_close(ticker: str, out_col: str) -> pd.DataFrame:
    """Download closing prices for one ticker as a single-column DataFrame.

    The download window is read from the module-level ``start_date`` and
    ``end_date`` variables.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download`` (e.g. ``"^GSPC"``).
    out_col : str
        Name given to the resulting close-price column.

    Returns
    -------
    pd.DataFrame
        One column named ``out_col``, carrying the index returned by
        ``yf.download``.

    Raises
    ------
    ValueError
        If the download returns no rows.
    """
    data = yf.download(ticker, start=start_date, end=end_date, progress=False)

    # Fail loudly on an empty result. Without this check an empty frame flows
    # through the rest of the cell and a previously saved dataset is
    # overwritten with a header-only CSV before any error surfaces.
    if data is None or data.empty:
        raise ValueError(
            f"No price data returned for {ticker!r} "
            f"({start_date} to {end_date})"
        )

    # With MultiIndex columns, selecting "Close" yields a DataFrame keyed by
    # the second column level; with flat columns it yields a Series.
    close = data["Close"]
    if isinstance(close, pd.DataFrame):
        # Prefer the column matching the requested ticker; otherwise fall back
        # to the first (normally the only) column.
        close = close[ticker] if ticker in close.columns else close.iloc[:, 0]

    return close.rename(out_col).to_frame()

# Fetch each asset's closing prices as a one-column frame.
sp500 = download_close("^GSPC", "SP500_Close")
btc = download_close("BTC-USD", "BTC_Close")

# Line the two series up on their date index. Dropping NaNs keeps only the
# dates on which both assets have a close.
df = pd.concat([sp500, btc], axis=1)
df = df.dropna().reset_index()

# After reset_index the former index is the first column; make sure it is
# called "Date" and stored as ISO "YYYY-MM-DD" strings.
if "Date" not in df.columns:
    df = df.rename(columns={df.columns[0]: "Date"})
df = df.assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]).dt.strftime("%Y-%m-%d")
)

# Simple returns between consecutive retained rows (not necessarily
# consecutive calendar days, since unmatched dates were dropped above).
# The first row has no predecessor, so its NaN returns are dropped.
df = (
    df.assign(
        SP500_Return=lambda frame: frame["SP500_Close"].pct_change(),
        BTC_Return=lambda frame: frame["BTC_Close"].pct_change(),
    )
    .dropna()
    .reset_index(drop=True)
)

# NOTE(review): the file name says "synthetic" although the data is downloaded;
# the path is kept because the analysis cell reads this exact location.
csv_path = Path("data/raw/synthetic_btc_sp500.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

print(f"Saved real dataset to {csv_path}")
print(f"Rows: {len(df)}")
print(f"Date range: {df['Date'].iloc[0]} to {df['Date'].iloc[-1]}")


Saved real dataset to data/raw/synthetic_btc_sp500.csv
Rows: 1513
Date range: 2020-01-03 to 2026-01-09


In [15]:
import pandas as pd
import math
from statistics import mean

# Load the dataset written by the collection cell.
df = pd.read_csv('data/raw/synthetic_btc_sp500.csv')

sp_rets = df['SP500_Return']
btc_rets = df['BTC_Return']
mean_sp = sp_rets.mean()
mean_btc = btc_rets.mean()

# Pearson correlation = cov / sqrt(var_sp * var_btc). The covariance and the
# variances must use the same normalisation: Series.cov and Series.var both
# default to the sample estimator (N - 1). Averaging the cross-products with
# .mean() (divides by N) would shrink the correlation by a factor (N - 1) / N.
cov = sp_rets.cov(btc_rets)
var_sp = sp_rets.var()
var_btc = btc_rets.var()
corr = cov / math.sqrt(var_sp * var_btc)

# One formatted line per statistic; printed below in this order.
summary = [
    f'Records: {len(df)} trading days',
    f'Date range: {df["Date"].iloc[0]} to {df["Date"].iloc[-1]}',
    f'S&P 500 close range: {df["SP500_Close"].min():.2f} to {df["SP500_Close"].max():.2f}',
    f'Bitcoin close range: {df["BTC_Close"].min():.2f} to {df["BTC_Close"].max():.2f}',
    f'Mean daily return — S&P 500: {mean_sp:.4f}',
    f'Mean daily return — Bitcoin: {mean_btc:.4f}',
    f'Std daily return — S&P 500: {sp_rets.std():.4f}',
    f'Std daily return — Bitcoin: {btc_rets.std():.4f}',
    f'Return correlation (BTC vs S&P 500): {corr:.3f}',
]

print('Summary statistics and correlation:')
for line in summary:
    print(line)

Summary statistics and correlation:
Records: 1513 trading days
Date range: 2020-01-03 to 2026-01-09
S&P 500 close range: 2237.40 to 6966.28
Bitcoin close range: 4970.79 to 124752.53
Mean daily return — S&P 500: 0.0006
Mean daily return — Bitcoin: 0.0025
Std daily return — S&P 500: 0.0132
Std daily return — Bitcoin: 0.0386
Return correlation (BTC vs S&P 500): 0.375
