## S&P 500 Daily Return Prediction Using Machine Learning

#### Goal: Predict next-day S&P 500 returns using multi-source market, macroeconomic, interest rate, volatility, sentiment, and momentum features.

##  Business Problem

Accurately predicting daily S&P 500 returns is valuable for:
- Portfolio Optimization
- Risk Management
- Algorithmic Trading Strategy Design

In this project, we use historical market, macro, sentiment, and volatility-based indicators to predict next-day index returns using machine learning.


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the training and test splits from the working directory.
train_df, test_df = map(pd.read_csv, ("train.csv", "test.csv"))

# Quick sanity check on the dimensions of both frames.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Last expression: rich display of the first training rows.
train_df.head()


Train shape: (9021, 98)
Test shape: (10, 99)


Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
0,0,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.002421,0.000301,-0.003038
1,1,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.008495,0.000303,-0.009114
2,2,0,0,0,1,0,0,0,0,1,...,,,,,,,,-0.009624,0.000301,-0.010243
3,3,0,0,0,1,0,0,0,0,0,...,,,,,,,,0.004662,0.000299,0.004046
4,4,0,0,0,1,0,0,0,0,0,...,,,,,,,,-0.011686,0.000299,-0.012301


#### Target & Feature Separation

In [4]:
# Column the models are trained to predict.
target = "forward_returns"

# Feature matrix: every column except the row identifier, the target itself,
# and the two other return-related columns that sit next to the target.
X = train_df.drop(
    columns=["date_id", target, "risk_free_rate", "market_forward_excess_returns"]
)

# Target vector, sharing train_df's index with X.
y = train_df.loc[:, target]


In [16]:
# ============================
# Add Lag Features
# ============================
# Append 1- and 2-step lagged copies of selected feature columns to X.
# NOTE: these are lags of *feature* columns, not of the target return.

lag_features = ["M1", "M2", "V1", "V2", "MOM1", "MOM2"]
lag_steps = (1, 2)

# Names that are not columns of train_df are skipped; report them instead of
# skipping silently so a typo or schema change is visible in the output.
missing_lag_sources = [col for col in lag_features if col not in train_df.columns]
if missing_lag_sources:
    print("Skipping lag features for missing columns:", missing_lag_sources)

for col in lag_features:
    if col in train_df.columns:
        for step in lag_steps:
            # Assignment aligns on the index, so this also works when X has
            # already been trimmed by a previous run of this cell.
            X[f"{col}_lag{step}"] = train_df[col].shift(step)

# Drop the leading rows whose lag values are undefined (shift creates NaN at
# the top). Selecting by train_df's index labels, rather than slicing X and y
# positionally, keeps this cell idempotent: re-running it does not trim two
# more rows each time.
rows_with_full_lags = train_df.index[max(lag_steps):]
X = X.loc[rows_with_full_lags].copy()
y = y.loc[rows_with_full_lags]

print(" Lag features added")
print("New shape:", X.shape)

 Lag features added
New shape: (9019, 94)


In [29]:
# ============================
# Create Lag Features for TEST Data
# ============================
# Mirror the training lag columns on the test frame (1- and 2-step shifts of
# the same source columns, skipped when a column is absent).

for col in ["M1", "M2", "V1", "V2", "MOM1", "MOM2"]:
    if col in test_df.columns:
        for step in (1, 2):
            test_df[f"{col}_lag{step}"] = test_df[col].shift(step)

# Deliberately no imputation here. Filling the whole frame with
# test_df.median() would impute every feature from test-set statistics and
# turn the later `fillna(X.median())` in the prediction cell into a no-op,
# so train and test would be preprocessed differently. The NaNs (including
# the ones shift() creates in the first rows) are left in place and are
# filled with the training medians when X_test is built.

print(" Lag features added to test data")


 Lag features added to test data


#### Handling Missing Values

In [30]:
# Fraction of missing entries per feature column.
missing_ratio = X.isna().mean()

# Keep only the columns with less than 40% missing values.
valid_features = missing_ratio.loc[missing_ratio.lt(0.40)].index
X = X.loc[:, valid_features]

# Impute the remaining gaps with each column's median.
# NOTE(review): the medians are computed over all rows of X, including the
# rows that are later held out for validation.
X = X.fillna(X.median())

print("Final number of usable features:", X.shape[1])


Final number of usable features: 92


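##### The dataset has many missing values

##### Median imputation of the remaining gaps prevents model crashes on NaN inputs

##### Dropping columns with 40% or more missing values keeps only reliable columns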

In [31]:
# Chronological hold-out: with shuffle=False the first 80% of rows become the
# training set and the final 20% the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)


#### Training / Validation Split

### Feature Scaling

In [32]:
# Learn the per-feature mean and standard deviation from the training rows only.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both splits.
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


In [33]:
# Linear Regression (baseline)
# Fit ordinary least squares on the scaled training features.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Score on the held-out validation rows; RMSE is the square root of the MSE.
lr_preds = lr_model.predict(X_valid_scaled)
lr_rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=lr_preds))

print("Linear Regression RMSE:", lr_rmse)

Linear Regression RMSE: 0.011653394019084978


In [34]:
# Random Forest
# 300 depth-limited trees, fixed seed, all CPU cores.
rf_model = RandomForestRegressor(n_estimators=300, max_depth=6, random_state=42, n_jobs=-1)
rf_model = rf_model.fit(X_train_scaled, y_train)

# Validation RMSE on the held-out rows.
rf_preds = rf_model.predict(X_valid_scaled)
rf_rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=rf_preds))

print("Random Forest RMSE:", rf_rmse)

Random Forest RMSE: 0.011234823323770793


In [35]:
# XGBoost
# Gradient-boosted trees with row and column subsampling and a fixed seed.
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
xgb_model.fit(X_train_scaled, y_train)

# Validation RMSE on the held-out rows.
xgb_preds = xgb_model.predict(X_valid_scaled)
xgb_rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=xgb_preds))

print("XGBoost RMSE:", xgb_rmse)

XGBoost RMSE: 0.01219760731510223


### Walk-Forward Time Series Validation (MODEL SELECTION)

In [36]:
def walk_forward_validation(model, X, y, n_splits=5):
    """Score ``model`` with walk-forward (time-ordered) cross-validation.

    The rows are split with ``TimeSeriesSplit(n_splits=n_splits)``. For every
    fold the model is fitted on the fold's training rows and evaluated on the
    fold's validation rows; the per-fold RMSE and the average are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so it ends up fitted on the last fold's
        training rows.
    X : numpy.ndarray or pandas.DataFrame
        Feature rows in chronological order.
    y : numpy.ndarray or pandas.Series
        Targets, positionally aligned with ``X``.
    n_splits : int, default 5
        Number of folds passed to ``TimeSeriesSplit``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """

    def _rows(data, idx):
        # The split yields *positional* indices. pandas objects need .iloc
        # for that (plain [] on a DataFrame would select columns), while
        # NumPy arrays are indexed directly.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    tscv = TimeSeriesSplit(n_splits=n_splits)
    fold_rmse = []

    for fold, (train_index, val_index) in enumerate(tscv.split(X), start=1):
        model.fit(_rows(X, train_index), _rows(y, train_index))
        preds = model.predict(_rows(X, val_index))

        rmse = np.sqrt(mean_squared_error(_rows(y, val_index), preds))
        fold_rmse.append(rmse)

        print(f"Fold {fold} RMSE: {rmse:.6f}")

    mean_rmse = np.mean(fold_rmse)
    print(" Average Walk-Forward RMSE:", mean_rmse)
    return mean_rmse


### Apply Walk-Forward to All Models

In [37]:
# Walk-forward RMSE for each candidate, all evaluated on the scaled training split.
# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training split, so each fold's validation rows contributed to the scaling.
lr_wf_rmse = walk_forward_validation(
    model=LinearRegression(),
    X=X_train_scaled,
    y=y_train,
)

rf_wf_rmse = walk_forward_validation(
    model=RandomForestRegressor(
        n_estimators=300,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
    ),
    X=X_train_scaled,
    y=y_train,
)

xgb_wf_rmse = walk_forward_validation(
    model=XGBRegressor(
        objective="reg:squarederror",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    X=X_train_scaled,
    y=y_train,
)


Fold 1 RMSE: 0.589234
Fold 2 RMSE: 0.055614
Fold 3 RMSE: 0.020992
Fold 4 RMSE: 0.187454
Fold 5 RMSE: 0.015743
 Average Walk-Forward RMSE: 0.1738073220302471
Fold 1 RMSE: 0.011439
Fold 2 RMSE: 0.013347
Fold 3 RMSE: 0.011425
Fold 4 RMSE: 0.011587
Fold 5 RMSE: 0.007779
 Average Walk-Forward RMSE: 0.011115237710339274
Fold 1 RMSE: 0.010817
Fold 2 RMSE: 0.014034
Fold 3 RMSE: 0.012076
Fold 4 RMSE: 0.012412
Fold 5 RMSE: 0.009590
 Average Walk-Forward RMSE: 0.01178572623764568


### Select Best Model + Train Final Model

In [38]:
# Average walk-forward RMSE per candidate (lower is better).
wf_scores = dict(
    LinearRegression=lr_wf_rmse,
    RandomForest=rf_wf_rmse,
    XGBoost=xgb_wf_rmse,
)

best_model_name = min(wf_scores, key=wf_scores.get)
print("Best model:", best_model_name)


def _build_xgb():
    # Fallback builder: used for any name other than the two listed below.
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=400,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )


# Map each explicitly handled winner to a fresh, unfitted estimator.
_builders = {
    "LinearRegression": LinearRegression,
    "RandomForest": lambda: RandomForestRegressor(
        n_estimators=300, max_depth=6, random_state=42, n_jobs=-1
    ),
}

final_model = _builders.get(best_model_name, _build_xgb)()

# Refit the winner on the scaled training split; the fitted estimator is the
# cell's displayed output.
final_model.fit(X_train_scaled, y_train)


Best model: RandomForest


0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Prepare Test Data + Predict + Save Kaggle Submission

In [41]:
# Select the training feature columns from the test frame and fill any
# remaining gaps with the medians of the training feature matrix.
X_test = test_df.loc[:, X.columns].fillna(X.median())

# Reuse the scaler fitted on the training split, then predict.
X_test_scaled = scaler.transform(X_test)
test_predictions = final_model.predict(X_test_scaled)

# Attach the predictions to the test frame and write the two-column submission.
test_df["predicted_forward_returns"] = test_predictions
submission = test_df.loc[:, ["date_id", "predicted_forward_returns"]]
submission.to_csv("submission.csv", index=False)

print(" submission.csv saved successfully!")


 submission.csv saved successfully!
