In [2]:
# Import what we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

print("âœ… Libraries imported successfully!")

✅ Libraries imported successfully!


In [7]:
# Load all three months of SPY options data.
# Each monthly file is read once; the frames stay bound to jan / feb / mar so
# any other cell that refers to them by name keeps working.
monthly_paths = (
    '../data/spy_eod_202301.txt',
    '../data/spy_eod_202302.txt',
    '../data/spy_eod_202303.txt',
)
jan, feb, mar = (pd.read_csv(path) for path in monthly_paths)

# Combine them into one big dataset; ignore_index=True gives the result a
# fresh 0..N-1 index instead of repeating each file's own row numbers.
df = pd.concat([jan, feb, mar], ignore_index=True)

print("✅ Data loaded successfully!")
print(f"Total rows: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")
print("\nColumn names:")
# Headers are printed exactly as read from the files (no cleaning yet).
for col in df.columns:
    print(f"  - {col}")

✅ Data loaded successfully!
Total rows: 245,695
Number of columns: 33

Column names:
  - [QUOTE_UNIXTIME]
  -  [QUOTE_READTIME]
  -  [QUOTE_DATE]
  -  [QUOTE_TIME_HOURS]
  -  [UNDERLYING_LAST]
  -  [EXPIRE_DATE]
  -  [EXPIRE_UNIX]
  -  [DTE]
  -  [C_DELTA]
  -  [C_GAMMA]
  -  [C_VEGA]
  -  [C_THETA]
  -  [C_RHO]
  -  [C_IV]
  -  [C_VOLUME]
  -  [C_LAST]
  -  [C_SIZE]
  -  [C_BID]
  -  [C_ASK]
  -  [STRIKE]
  -  [P_BID]
  -  [P_ASK]
  -  [P_SIZE]
  -  [P_LAST]
  -  [P_DELTA]
  -  [P_GAMMA]
  -  [P_VEGA]
  -  [P_THETA]
  -  [P_RHO]
  -  [P_IV]
  -  [P_VOLUME]
  -  [STRIKE_DISTANCE]
  -  [STRIKE_DISTANCE_PCT]


In [8]:
# Remove spaces AND brackets from column names.
# Surrounding whitespace is stripped first, then every literal '[' and ']' is
# removed (regex=False so the brackets are not treated as a character class).
# Re-running this cell is harmless: already-clean names pass through unchanged.
cleaned_names = df.columns.str.strip()
for bracket in ('[', ']'):
    cleaned_names = cleaned_names.str.replace(bracket, '', regex=False)
df.columns = cleaned_names

print("✅ Column names cleaned!")
print("\nNew column names (first 10):")
for col in df.columns[:10]:
    print(f"  - {col}")

# Keep only rows where we have actual call prices (C_LAST strictly positive).
# .copy() makes `calls` an independent frame rather than a view of `df`.
has_call_price = df['C_LAST'] > 0
calls = df[has_call_price].copy()

print("\n✅ Data filtered!")
print(f"Total call options with prices: {len(calls):,}")

✅ Column names cleaned!

New column names (first 10):
  - QUOTE_UNIXTIME
  - QUOTE_READTIME
  - QUOTE_DATE
  - QUOTE_TIME_HOURS
  - UNDERLYING_LAST
  - EXPIRE_DATE
  - EXPIRE_UNIX
  - DTE
  - C_DELTA
  - C_GAMMA

✅ Data filtered!
Total call options with prices: 190,657


In [9]:
# Features (X) - the inputs to predict option price
feature_cols = [
    'UNDERLYING_LAST',   # SPY price
    'STRIKE',            # Strike price
    'DTE',               # Days to expiration
    'C_IV',              # Implied volatility
    'C_DELTA',           # Delta
    'C_GAMMA',           # Gamma
    'C_VEGA',            # Vega
    'C_THETA'            # Theta
]

# Target (y) - what we want to predict
target_col = 'C_LAST'  # Call option price

# Create X and y as independent copies so later edits never touch `calls`
X = calls[feature_cols].copy()
y = calls[target_col].copy()

print("✅ Features selected!")
print(f"X shape: {X.shape} (rows, features)")
print(f"y shape: {y.shape} (rows)")
print("\nChecking for missing values...")
# Per-column count of null entries in the feature matrix only (y is not checked here)
print(X.isnull().sum())

✅ Features selected!
X shape: (190657, 8) (rows, features)
y shape: (190657,) (rows)

Checking for missing values...
UNDERLYING_LAST    0
STRIKE             0
DTE                0
C_IV               0
C_DELTA            0
C_GAMMA            0
C_VEGA             0
C_THETA            0
dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# Split: 80% train, 20% test. random_state=42 pins the shuffle so the same
# rows land in each set on every run.
# NOTE(review): the split is a random shuffle over all rows, so quotes from the
# same QUOTE_DATE can appear in both train and test - confirm this is
# acceptable for the later model comparison.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("✅ Data split complete!")
print(f"\nTraining set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")
print("\nPrice statistics (training set):")
# One formatted line per summary statistic of the training target
price_stats = (
    ("Min", y_train.min()),
    ("Max", y_train.max()),
    ("Mean", y_train.mean()),
    ("Median", y_train.median()),
)
for label, value in price_stats:
    print(f"  {label}: ${value:.2f}")

✅ Data split complete!

Training set: 152,525 samples
Test set: 38,132 samples

Price statistics (training set):
  Min: $0.01
  Max: $291.19
  Mean: $36.45
  Median: $12.40


In [None]:
import pickle

# Save everything to a file: the four splits are pickled together as one
# tuple, in the order (X_train, X_test, y_train, y_test).
# The path is relative to the notebook's working directory.
with open('../data/prepared_data.pkl', 'wb') as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

print("✅ Data saved to 'data/prepared_data.pkl'")
print("\n" + "="*60)
print("🎉 DATA PREPARATION COMPLETE!")
print("="*60)
print("\nYou now have:")
print(f"  • {len(X_train):,} training samples")
print(f"  • {len(X_test):,} test samples")
# Feature count is read from the data so the summary stays correct if the
# feature list is edited.
print(f"  • {X_train.shape[1]} features per sample")
print("  • Ready for ML models!")
print("\nNext steps:")
print("  1. Build Random Forest model")
print("  2. Build XGBoost model")
print("  3. Build Neural Network model")
print("  4. Compare all models to Black-Scholes")