In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
import json
from datetime import datetime

# Prepend the parent of the current working directory to the import path,
# presumably so the project-local `src` package imported below resolves.
# NOTE(review): this depends on where the kernel was started (Path.cwd()),
# not on this notebook's location -- confirm the intended launch directory.
sys.path.insert(0, str(Path.cwd().parent))
from src.data_preprocessing import download_stocks, train_test_split_timeseries

In [7]:
# Ticker symbols passed to download_stocks; also stored alongside the saved
# split and in metadata.json.
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA']
# Date range passed to download_stocks.
# NOTE(review): whether END_DATE is inclusive is decided inside
# download_stocks, which is not visible here -- confirm.
START_DATE = '2020-11-24'
END_DATE = '2025-11-24'
# Directory (relative to the working directory) that receives the pickles and
# metadata.json written by the final cell. Created up front, including missing
# parents; no error if it already exists.
DATA_DIR = Path('../data/processed')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Sizes handed to train_test_split_timeseries. 1040 + 214 = 1254, which equals
# the observation count printed by the recorded download run.
# NOTE(review): presumably these must be re-derived if the date range or the
# downloaded data changes -- confirm how the helper handles a mismatch.
TRAIN_SIZE = 1040
TEST_SIZE = 214

In [9]:
# Fetch the data for the configured tickers and date range, then report
# the dimensions of what came back (rows x columns of `log_returns`).
print("Downloading stock data...")
log_returns = download_stocks(TICKERS, START_DATE, END_DATE)
n_observations = log_returns.shape[0]
n_stocks = log_returns.shape[1]
print(f"Downloaded {n_observations} observations for {n_stocks} stocks")

Downloading stock data...


  prices = yf.download(tickers, start=start_date, end=end_date,


Downloaded 1254 observations for 7 stocks


In [15]:
# Data-quality gate: stop the notebook if the downloaded frame has any NaN.
print("\nDATA QUALITY")

# NaN count per configured ticker. The earlier version computed this inside a
# loop and discarded it; here it feeds the assertion message so a failure says
# which tickers are affected. Selecting by TICKERS also keeps the loop's
# implicit check that every configured ticker is a column (KeyError if not).
missing_by_ticker = log_returns[TICKERS].isna().sum()
tickers_with_gaps = missing_by_ticker[missing_by_ticker > 0].to_dict()

# The gate itself still covers every column of log_returns, not only TICKERS.
total_missing = log_returns.isna().sum().sum()
assert total_missing == 0, (
    f"Missing values found! ({total_missing} total; per ticker: {tickers_with_gaps})"
)
print("No missing values")


DATA QUALITY
No missing values


In [17]:
# Split the series into train and test parts using the configured sizes,
# then report how many observations each part actually holds.
train_data, test_data = train_test_split_timeseries(log_returns, TRAIN_SIZE, TEST_SIZE)
n_train_obs = train_data.shape[0]
n_test_obs = test_data.shape[0]
print(f"\nTrain: {n_train_obs} obs | Test: {n_test_obs} obs")



Train: 1040 obs | Test: 214 obs


In [19]:
# Persist the processed data to DATA_DIR as three artifacts:
#   - the full log-returns object (pickle)
#   - the train/test split plus the settings that produced it (pickle)
#   - a small human-readable metadata record (JSON)
# Each output path is defined once so the files written and the summary
# printed at the end cannot drift apart.
full_returns_path = DATA_DIR / 'log_returns_full.pkl'
split_path = DATA_DIR / 'train_test_split.pkl'
metadata_path = DATA_DIR / 'metadata.json'

# Pickle files can execute arbitrary code when loaded; downstream notebooks
# should only unpickle these from a trusted location.
with open(full_returns_path, 'wb') as f:
    pickle.dump(log_returns, f)

# Save train/test split
split_data = {
    'train': train_data,
    'test': test_data,
    'tickers': TICKERS,
    'train_size': TRAIN_SIZE,
    'test_size': TEST_SIZE
}
with open(split_path, 'wb') as f:
    pickle.dump(split_data, f)

# Save metadata
metadata = {
    'tickers': TICKERS,
    'start_date': START_DATE,
    'end_date': END_DATE,
    'total_obs': len(log_returns),
    'train_size': TRAIN_SIZE,
    'test_size': TEST_SIZE,
    # Naive local timestamp (datetime.now() without a tz), in ISO 8601 form.
    'preprocessing_date': datetime.now().isoformat()
}
# Explicit encoding so the file does not depend on the platform default.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print(f"\n Data saved to {DATA_DIR}")
for saved_path in (full_returns_path, split_path, metadata_path):
    print(f"  - {saved_path.name}")


 Data saved to ..\data\processed
  - log_returns_full.pkl
  - train_test_split.pkl
  - metadata.json
