# Real Market Data Collection & EDA

This notebook downloads real daily price data for Bitcoin (BTC-USD) and the S&P 500 (^GSPC) using Yahoo Finance via the `yfinance` Python package.

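The two closing-price series are merged on date, daily returns are computed, and the result is saved to `data/raw/synthetic_btc_sp500.csv` (despite the `synthetic` in its name, this file holds the downloaded market data). The saved file is then used for exploratory data analysis (EDA), including summary statistics and the correlation between daily returns.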

In [None]:
# Install yfinance if not already installed
import sys
!{sys.executable} -m pip install yfinance --quiet

import yfinance as yf
import pandas as pd
from pathlib import Path

# Download daily closing prices for the S&P 500 (^GSPC) and Bitcoin (BTC-USD).
start_date = '2020-01-01'
end_date = pd.Timestamp.today().strftime('%Y-%m-%d')


def _download_close(ticker, column_name, start, end):
    """Download daily prices for `ticker` and return its close as a one-column frame.

    The returned DataFrame keeps the date index from yfinance and has a single
    column named `column_name`.

    Raises:
        RuntimeError: if yfinance returns no rows for the ticker.
    """
    prices = yf.download(ticker, start=start, end=end)
    if prices.empty:
        raise RuntimeError(f'No price data returned for {ticker}')
    close = prices['Close']
    # Recent yfinance releases return MultiIndex (Price, Ticker) columns even
    # for a single ticker, in which case prices['Close'] is a one-column
    # DataFrame instead of a Series. Collapse it so both layouts are handled.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    return close.rename(column_name).to_frame()


sp500 = _download_close('^GSPC', 'SP500_Close', start_date, end_date)
btc = _download_close('BTC-USD', 'BTC_Close', start_date, end_date)

# Inner-merge on the date index: only dates present in both series are kept.
df = pd.merge(sp500, btc, left_index=True, right_index=True)
if df.empty:
    raise RuntimeError('S&P 500 and BTC price series have no dates in common')

# Name the index explicitly so reset_index() always yields a 'Date' column.
df.index.name = 'Date'
df = df.reset_index()
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Simple daily returns, computed between consecutive rows of the merged frame.
df['SP500_Return'] = df['SP500_Close'].pct_change()
df['BTC_Return'] = df['BTC_Close'].pct_change()
# The first row has no previous close, so its returns are NaN and it is dropped.
df = df.dropna().reset_index(drop=True)

# The file name says "synthetic" although the data is downloaded; the path is
# kept unchanged because the analysis cell below reads this exact location.
csv_path = Path('data/raw/synthetic_btc_sp500.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

print(f'Saved real dataset to {csv_path}')
print(f'Rows: {len(df)}')
# Double-quoted f-string: reusing the outer quote inside {} is a SyntaxError before Python 3.12.
print(f"Date range: {df['Date'].iloc[0]} to {df['Date'].iloc[-1]}")

Saved synthetic dataset to data/raw/synthetic_btc_sp500.csv
Rows: 1303
Date range: 2020-01-01 to 2024-12-27


In [None]:
import csv, math
from statistics import mean

# Re-read the CSV with the standard library so the statistics below are
# computed from exactly what is stored on disk.
with open('data/raw/synthetic_btc_sp500.csv', newline='', encoding='utf-8') as f:
    reader = list(csv.DictReader(f))

# Sample variance and covariance divide by (n - 1), so two rows are the minimum.
if len(reader) < 2:
    raise ValueError('Need at least two rows of returns to compute summary statistics')

# Parse each numeric column once instead of re-parsing it per statistic.
sp_rets = [float(r['SP500_Return']) for r in reader]
btc_rets = [float(r['BTC_Return']) for r in reader]
sp_closes = [float(r['SP500_Close']) for r in reader]
btc_closes = [float(r['BTC_Close']) for r in reader]

mean_sp = mean(sp_rets)
mean_btc = mean(btc_rets)

# Sample (n - 1) covariance and variances of the daily returns.
dof = len(reader) - 1
cov = sum((a - mean_sp) * (b - mean_btc) for a, b in zip(sp_rets, btc_rets)) / dof
var_sp = sum((a - mean_sp) ** 2 for a in sp_rets) / dof
var_btc = sum((b - mean_btc) ** 2 for b in btc_rets) / dof

# Pearson correlation; undefined (reported as nan) if either series is constant.
denom = math.sqrt(var_sp * var_btc)
corr = cov / denom if denom else float('nan')

summary = [
    f'Records: {len(reader)} trading days',
    # Double-quoted f-string: reusing the outer quote inside {} is a SyntaxError before Python 3.12.
    f"Date range: {reader[0]['Date']} to {reader[-1]['Date']}",
    f'S&P 500 close range: {min(sp_closes):.2f} to {max(sp_closes):.2f}',
    f'Bitcoin close range: {min(btc_closes):.2f} to {max(btc_closes):.2f}',
    f'Mean daily return — S&P 500: {mean_sp:.4f}',
    f'Mean daily return — Bitcoin: {mean_btc:.4f}',
    f'Std daily return — S&P 500: {math.sqrt(var_sp):.4f}',
    f'Std daily return — Bitcoin: {math.sqrt(var_btc):.4f}',
    f'Return correlation (BTC vs S&P 500): {corr:.3f}',
]

print('Summary statistics and correlation:')
for line in summary:
    print(line)

Summary statistics and correlation:


Records: 1303 trading days
Date range: 2020-01-01 to 2024-12-27
S&P 500 close range: 2634.70 to 4620.93
Bitcoin close range: 2100.86 to 14875.09
Mean daily return — S&P 500: 0.0000
Mean daily return — Bitcoin: 0.0009
Std daily return — S&P 500: 0.0121
Std daily return — Bitcoin: 0.0333
Return correlation (BTC vs S&P 500): 0.101
