# 1. Preprocess Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

def _read_price_history(csv_path):
    """Load one ticker's CSV, parsing the 'Dt' column as datetimes."""
    return pd.read_csv(csv_path, parse_dates=['Dt'])


# One price-history frame per ticker, also collected by symbol in data_sources.
spy = _read_price_history('SPY.csv')
xlk = _read_price_history('XLK.csv')
xlb = _read_price_history('XLB.csv')
xlf = _read_price_history('XLF.csv')
data_sources = {'SPY': spy, 'XLK': xlk, 'XLB': xlb, 'XLF': xlf}

# Look-back lengths (in rows; presumably trading days) mapped to the labels
# used in column names. `lags` keeps the same order as the mapping.
lag_tags = {5: '1W', 10: '2W', 21: '1M', 42: '2M'}
lags = list(lag_tags)

def compute_returns(df, lags, price_col='Close'):
    """Build a frame of trailing percentage returns, one column per look-back.

    Args:
        df: DataFrame holding a price column.
        lags: Iterable of look-back lengths, in rows, each passed to
            ``Series.pct_change``.
        price_col: Name of the price column to use. Defaults to ``'Close'``,
            which keeps existing two-argument calls unchanged.

    Returns:
        DataFrame indexed like ``df`` with one ``return_<lag>`` column per
        entry in ``lags``. The first ``lag`` rows of each column are NaN
        because no earlier price exists to compare against.

    Raises:
        KeyError: If ``price_col`` is not a column of ``df``.
    """
    prices = df[price_col]
    return pd.DataFrame({f'return_{lag}': prices.pct_change(lag) for lag in lags})

def _ticker_feature_block(ticker, df):
    """Return ``df``'s trailing returns with ticker-prefixed column names.

    ``df`` is modified in place: 'Dt' becomes its index and rows containing
    any NaN are dropped before the returns are computed.
    """
    df.set_index('Dt', inplace=True)
    df.dropna(inplace=True)
    block = compute_returns(df, lags)
    block.columns = [f'{ticker}_return_{lag_tags[lag]}' for lag in lags]
    return block


features = [_ticker_feature_block(ticker, df) for ticker, df in data_sources.items()]

# Align the per-ticker blocks on their date index and keep only the dates on
# which every ticker has every look-back return available.
merged_df = pd.concat(features, axis=1).dropna()


# 2. Choose Lagged Returns of SPY and Other 3 Indices

In [None]:
# Walk-forward SPY momentum signal.
#
# Every `refit_step` rows a Ridge model is refit on `window_size` rows and asked
# for the SPY return over the *next* `horizon` rows. The target is the forward
# 1-week return (the trailing 1W return shifted back by `horizon` rows), so every
# feature -- the trailing returns observed on the signal date, including SPY's
# own -- is known before the predicted period begins. Regressing the trailing 1W
# return on the same-date 2W/1M/2M returns, which overlap it, would leak the
# answer into the features.
window_size = 100
horizon = 5        # rows spanned by the '1W' return; must match its look-back
refit_step = 21

# NOTE(review): assumes consecutive rows of merged_df are consecutive SPY
# observations, so shifting by `horizon` rows lines up with the 5-row look-back.
forward_target = merged_df['SPY_return_1W'].shift(-horizon)
prediction_results = []

# The signal row sits `horizon - 1` rows past the end of the training window:
# the last training label is a return ending `horizon` rows after its feature
# row, i.e. exactly on the signal date, so no label uses prices from the future.
stop = len(merged_df) - window_size - horizon + 1
for start in range(0, stop, refit_step):
    train_end = start + window_size
    signal_pos = train_end + horizon - 1

    X_train = merged_df.iloc[start:train_end]
    y_train = forward_target.iloc[start:train_end]
    X_test = merged_df.iloc[signal_pos:signal_pos + 1]

    # Scaling lives inside the pipeline so it is fitted on the training rows only.
    model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
    model.fit(X_train, y_train)

    prediction = model.predict(X_test)[0]
    signal = 'Long' if prediction > 0 else 'Short'
    # 'Date' is the date the signal is generated; the forecast covers the
    # following `horizon` rows.
    prediction_results.append({'Date': X_test.index[0], 'Prediction': prediction, 'Signal': signal})

# Explicit columns keep the CSV header intact even when there is too little
# data to produce a single prediction.
signals_df = pd.DataFrame(prediction_results, columns=['Date', 'Prediction', 'Signal'])
signals_df.to_csv('momentum_signals.csv', index=False)


# 3. Calibrate Model

In [None]:
# Give each frame's close column a ticker-specific name so the frames can be
# joined without the close columns colliding. The renames modify spy, xlb and
# xlf in place.
for frame, close_name in ((spy, 'Close_SPY'), (xlb, 'Close_XLB'), (xlf, 'Close_XLF')):
    frame.rename(columns={'Close': close_name}, inplace=True)

# Inner joins on 'Dt' keep only the dates present in all three frames.
merged_data = spy.merge(xlb, on='Dt', how='inner')
merged_data = merged_data.merge(xlf, on='Dt', how='inner')

# One-row percentage change of each close.
for return_name, close_name in (
    ('SPY_return', 'Close_SPY'),
    ('XLB_return', 'Close_XLB'),
    ('XLF_return', 'Close_XLF'),
):
    merged_data[return_name] = merged_data[close_name].pct_change()

# Lagged copies of each return series, stored as '<ticker>_<tag>' columns
# (for example 'SPY_1W' is SPY_return shifted down by 5 rows).
lagged_tickers = ('SPY', 'XLB', 'XLF')
for lag in lags:
    tag = lag_tags[lag]
    for ticker in lagged_tickers:
        merged_data[f'{ticker}_{tag}'] = merged_data[f'{ticker}_return'].shift(lag)

# Drop the leading rows that lack a full set of returns and lags.
merged_data = merged_data.dropna()


# 4. Predict the Next Day's Return and Make Short/Long Decisions

In [None]:
# Walk-forward backtest: refit each model on `train_size` rows, trade the sign
# of its prediction over the next `step_size` rows, then roll forward.

# Features are the lagged-return columns only ('SPY_1W', 'XLB_2M', ...). They
# hold returns from earlier rows, so they are known before the day being
# predicted. The same-day 'XLB_return' / 'XLF_return' columns must not be used:
# they are realised together with the SPY return the model is trying to predict.
lag_suffixes = tuple(f'_{tag}' for tag in lag_tags.values())
feature_columns = [col for col in merged_data.columns if col.endswith(lag_suffixes)]
if not feature_columns:
    raise ValueError('merged_data has no lagged-return columns to use as features')

X = merged_data[feature_columns]
y = merged_data['SPY_return']

models = {
    "OLS": LinearRegression(),
    'Ridge': Ridge(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.5),
    'Lasso': Lasso(alpha=0.01),
    'KNN': KNeighborsRegressor(n_neighbors=1),
}

train_size = 100
step_size = 20
test_size = len(merged_data) - train_size

# Per-model list of out-of-sample strategy returns, one Series per window.
window_returns = {name: [] for name in models}

for start in range(0, test_size, step_size):
    end = start + train_size
    X_train, y_train = X.iloc[start:end], y.iloc[start:end]
    X_test, y_test = X.iloc[end:end + step_size], y.iloc[end:end + step_size]

    if y_test.isna().any():
        continue

    # Fit the scaler on the training window only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        # Long (+1) when the predicted return is positive, short (-1) otherwise.
        signals = np.where(y_pred > 0, 1, -1)
        window_returns[name].append(y_test * signals)

# Compound once over the whole out-of-sample period. Compounding inside each
# window would restart the curve at 1 every `step_size` rows, so the last value
# would describe only the final window rather than the full backtest.
cumulative_returns = {}
for name, chunks in window_returns.items():
    if chunks:
        cumulative_returns[name] = (1 + pd.concat(chunks)).cumprod()
    else:
        cumulative_returns[name] = pd.Series(dtype='float64')

for name in models:
    curve = cumulative_returns[name]
    final_return = curve.iloc[-1] if not curve.empty else float('nan')
    print(f"{name} final cumulative return: {final_return}")


# 5. Evaluate 5 Learning Algorithms

In [None]:
# Draw one cumulative-return curve per model on a shared axis. The x-axis is
# the position within each curve rather than a calendar date, because only the
# curve values are plotted.
fig, ax = plt.subplots(figsize=(10, 6))
for name in models:
    curve = cumulative_returns[name]
    if curve.empty:
        continue  # nothing to draw for a model that produced no returns
    ax.plot(curve.values, label=name)

ax.set_title('Cumulative Returns of Different Models (Rolling Calibration)')
ax.set_xlabel('Time')
ax.set_ylabel('Cumulative Return')
ax.legend()
plt.show()


# 6. Conclusion


In this analysis, ElasticNet produced the highest cumulative return at 1.208, likely benefiting from its balanced regularization approach. OLS and Ridge Regression followed closely, both at 1.205, which suggests that a simple linear model effectively captures the momentum patterns in the data. On the other hand, Lasso had a lower cumulative return of 1.056, possibly due to its stronger L1 regularization, which may have overly penalized important signals. KNN showed a moderate return of 1.164, indicating that while non-linear models can be useful, further tuning of hyperparameters could improve performance. Overall, linear models—particularly the lightly regularized ones such as ElasticNet and Ridge Regression—seem to be the most effective for short-term momentum-based trading strategies in this case.