# S&P 500 Data Exploration

This notebook explores the S&P 500 monthly dataset and prepares it for portfolio optimization.

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data_loader import (
    load_sp500_data,
    prepare_returns_data,
    filter_valid_assets,
    create_asset_universe,
    print_data_summary
)
from src.features import extract_all_features, prepare_feature_tensor

## 1. Load Data

In [None]:
# Load the raw S&P 500 monthly dataset through the project's loader.
# Update the path if your file is in a different location.
# NOTE(review): the import cell appends '../' to sys.path, which suggests this
# notebook may run from a subdirectory -- confirm that 'data/' resolves from
# the notebook's working directory.
df = load_sp500_data('data/sp500_monthly.csv')

# Structural overview: dimensions first, then the column names.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("\nFirst few rows:")
# Must stay the last expression of the cell so the notebook renders it.
df.head()

## 2. Prepare Returns Data

In [None]:
# Convert the loaded dataset into a wide returns table (dates x assets).
returns = prepare_returns_data(df)

# Report the table size and the span covered by its index.
print(f"Returns shape: {returns.shape}")
print(f"Date range: {returns.index.min()} to {returns.index.max()}")
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("\nFirst few dates and assets:")
# Preview the top-left 5x5 corner; kept as the cell's last expression so the
# notebook renders it.
returns.iloc[:5, :5]

## 3. Filter Valid Assets

In [None]:
# Restrict the universe to assets with sufficient data.
#   min_periods=24       -> at least 2 years of monthly observations
#   max_missing_pct=0.2  -> at most 20% missing values
returns_filtered = filter_valid_assets(returns, min_periods=24, max_missing_pct=0.2)

# Print the project's standard summary for the filtered table.
print_data_summary(returns_filtered)

## 4. Data Quality Checks

In [None]:
# Flag potential data errors: any monthly return whose absolute value exceeds
# 50%. Taking abs() means large losses are counted as well as large gains.
extreme_returns = returns_filtered.abs() > 0.5
# Sum per column, then across columns, to get the total number of flagged cells.
print(f"Extreme returns (>50%): {extreme_returns.sum().sum()} instances")

# Summary statistics, transposed so each row is one column of the returns
# table; only the first 10 rows are printed.
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("\nReturn statistics:")
print(returns_filtered.describe().T.head(10))

## 5. Extract Features

In [None]:
# Compute the feature set used for portfolio optimization from the filtered
# returns. The result is iterated as a mapping of feature name -> table.
features = extract_all_features(returns_filtered)

print("Available features:")
for name, feat_df in features.items():
    # Total count of missing cells across the whole feature table.
    n_missing = feat_df.isnull().sum().sum()
    print(f"  {name}: shape {feat_df.shape}, missing: {n_missing}")

## 6. Prepare Feature Tensor

In [None]:
# Build the feature tensor for PyTorch from the filtered returns. The helper
# returns the tensor together with its date labels and asset labels.
feature_tensor, date_index, asset_index = prepare_feature_tensor(returns_filtered)

# Report the tensor dimensions, one axis per line.
tensor_shape = feature_tensor.shape
print(f"Feature tensor shape: {tensor_shape}")
print(f"  - Time periods: {tensor_shape[0]}")
print(f"  - Assets: {tensor_shape[1]}")
print(f"  - Features: {tensor_shape[2]}")

# Span of the date labels, then a sample of the asset labels.
first_date = date_index.min()
last_date = date_index.max()
print(f"\nDate range: {first_date} to {last_date}")

leading_assets = asset_index[:10].tolist()
print(f"\nFirst 10 assets: {leading_assets}")

## 7. Visualizations

In [None]:
# For each row (date) of the filtered returns, count the assets that have a
# non-missing value, then plot that count over time.
n_assets_over_time = returns_filtered.notna().sum(axis=1)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(n_assets_over_time.index, n_assets_over_time.values)
ax.set_title('Number of Assets in Universe Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Assets')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Average the returns across assets for each row (date) and plot the series
# over time. The pandas mean skips missing values by default.
avg_returns = returns_filtered.mean(axis=1)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(avg_returns.index, avg_returns.values)
ax.set_title('Average Monthly Return Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Return')
ax.grid(True)
fig.tight_layout()
plt.show()

## 8. Save Processed Data

In [None]:
# Save processed returns and features for use in optimization.
# Writing is opt-in: with the flag left at False this cell only prints the
# completion message, exactly as before. Set SAVE_PROCESSED_DATA = True to
# write the four files below (existing files at those paths are overwritten).
SAVE_PROCESSED_DATA = False

if SAVE_PROCESSED_DATA:
    returns_filtered.to_csv('data/processed_returns.csv')
    np.save('data/feature_tensor.npy', feature_tensor)
    # The label sequences are wrapped in a Series so they can be written as CSV.
    pd.Series(date_index).to_csv('data/date_index.csv')
    pd.Series(asset_index).to_csv('data/asset_index.csv')

print("Data exploration complete!")