# Synthetic Data Collection & EDAThis notebook generates a **synthetic** daily price dataset for Bitcoin and the S&P 500.Network egress is disabled in the execution environment, so live market data cannot be fetched yet.The synthetic data preserves a modest positive correlation to prototype the workflow until real data can be pulled.

In [1]:
import csv
from datetime import datetime, timedelta
import math, random
from pathlib import Path

random.seed(42)
start_date = datetime(2020, 1, 1)
num_days = 5 * 365
rho = 0.35
mu_sp, sigma_sp = 0.0003, 0.012
mu_btc, sigma_btc = 0.0008, 0.035
sp_level = 3200.0
btc_level = 7200.0
rows = []
prev_sp = sp_level
prev_btc = btc_level
for i in range(num_days):
    date = start_date + timedelta(days=i)
    if date.weekday() >= 5:
        continue
    sp_ret = random.gauss(mu_sp, sigma_sp)
    btc_shock = random.gauss(mu_btc, sigma_btc)
    btc_ret = rho * sp_ret + math.sqrt(1 - rho ** 2) * btc_shock
    prev_sp *= 1 + sp_ret
    prev_btc *= 1 + btc_ret
    rows.append({
        'Date': date.strftime('%Y-%m-%d'),
        'SP500_Close': round(prev_sp, 2),
        'BTC_Close': round(prev_btc, 2),
        'SP500_Return': sp_ret,
        'BTC_Return': btc_ret,
    })

csv_path = Path('data/raw/synthetic_btc_sp500.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
with csv_path.open('w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)

print(f'Saved synthetic dataset to {csv_path}')
print(f'Rows: {len(rows)}')
print(f'Date range: {rows[0]['Date']} to {rows[-1]['Date']}')

Saved synthetic dataset to data/raw/synthetic_btc_sp500.csv
Rows: 1303
Date range: 2020-01-01 to 2024-12-27


In [2]:
import csv, math
from statistics import mean

with open('data/raw/synthetic_btc_sp500.csv', newline='') as f:
    reader = list(csv.DictReader(f))

sp_rets = [float(r['SP500_Return']) for r in reader]
btc_rets = [float(r['BTC_Return']) for r in reader]
mean_sp = mean(sp_rets)
mean_btc = mean(btc_rets)
cov = sum((a - mean_sp) * (b - mean_btc) for a, b in zip(sp_rets, btc_rets)) / (len(sp_rets) - 1)
var_sp = sum((a - mean_sp) ** 2 for a in sp_rets) / (len(sp_rets) - 1)
var_btc = sum((b - mean_btc) ** 2 for b in btc_rets) / (len(btc_rets) - 1)
corr = cov / math.sqrt(var_sp * var_btc)

summary = [
    f'Records: {len(reader)} trading days',
    f'Date range: {reader[0]['Date']} to {reader[-1]['Date']}',
    f'S&P 500 close range: {min(float(r['SP500_Close']) for r in reader):.2f} to {max(float(r['SP500_Close']) for r in reader):.2f}',
    f'Bitcoin close range: {min(float(r['BTC_Close']) for r in reader):.2f} to {max(float(r['BTC_Close']) for r in reader):.2f}',
    f'Mean daily return — S&P 500: {mean_sp:.4f}',
    f'Mean daily return — Bitcoin: {mean_btc:.4f}',
    f'Std daily return — S&P 500: {math.sqrt(var_sp):.4f}',
    f'Std daily return — Bitcoin: {math.sqrt(var_btc):.4f}',
    f'Return correlation (BTC vs S&P 500): {corr:.3f}',
]

print('Summary statistics and correlation:')
for line in summary:
    print(line)

Summary statistics and correlation:


Records: 1303 trading days
Date range: 2020-01-01 to 2024-12-27
S&P 500 close range: 2634.70 to 4620.93
Bitcoin close range: 2100.86 to 14875.09
Mean daily return — S&P 500: 0.0000
Mean daily return — Bitcoin: 0.0009
Std daily return — S&P 500: 0.0121
Std daily return — Bitcoin: 0.0333
Return correlation (BTC vs S&P 500): 0.101
