In [None]:
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Import your evaluation framework
# (Copy the portfolio_score, time_series_cv and plot_cv_results functions here or import them;
#  the cross-validation cells below call time_series_cv and plot_cv_results)

## Load & Preprocess Data

In [None]:
# Location of the competition data, relative to the notebook's working directory.
DATA_PATH = Path('./hull-tactical-market-prediction')
train_df = pd.read_csv(DATA_PATH / 'train.csv')

print(f"Original shape: {train_df.shape}")
print(f"Date range: {train_df['date_id'].min()} - {train_df['date_id'].max()}")

# Feature columns are identified by a case-sensitive name prefix, one prefix
# per feature family. Note that 'MOM' is itself covered by 'M'.
FEATURE_PREFIXES = ('M', 'E', 'I', 'P', 'V', 'S', 'MOM', 'D')
feature_cols = [c for c in train_df.columns if c.startswith(FEATURE_PREFIXES)]
target_col = 'market_forward_excess_returns'  # Normalized target

# Count each column under its LONGEST matching prefix so that MOM* columns
# are reported as MOM* only, instead of also inflating the M* count.
prefixes_longest_first = sorted(FEATURE_PREFIXES, key=len, reverse=True)
group_counts = dict.fromkeys(FEATURE_PREFIXES, 0)
for col in feature_cols:
    # Every entry of feature_cols matches at least one prefix by construction.
    group = next(p for p in prefixes_longest_first if col.startswith(p))
    group_counts[group] += 1

print("\nFeature groups:")
for prefix, n_cols in group_counts.items():
    print(f"  {prefix}*: {n_cols} features")

print(f"\nTarget: {target_col}")

In [None]:
# Per-row missingness: how many feature values are absent on each date,
# both as a raw count and as a percentage of all feature columns.
n_features = len(feature_cols)
missing_per_row = train_df[feature_cols].isna().sum(axis=1)
train_df['missing_count'] = missing_per_row
train_df['missing_pct'] = missing_per_row / n_features * 100

# Scatter of missingness against date_id, with a reference line at 50%.
fig, ax = plt.subplots(figsize=(14, 4))
ax.scatter(train_df['date_id'], train_df['missing_pct'], s=1, alpha=0.5)
ax.axhline(50, color='r', linestyle='--', label='50% missing')
ax.set_xlabel('date_id')
ax.set_ylabel('% Missing Features')
ax.set_title('Missingness Over Time')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

# Drop every row dated before the cutoff and renumber the index from zero.
CUTOFF_DATE = 1000  # Trim early sparse data
print(f"\nTrimming dates < {CUTOFF_DATE}")
keep_rows = train_df['date_id'] >= CUTOFF_DATE
train_df = train_df.loc[keep_rows].reset_index(drop=True)
print(f"New shape: {train_df.shape}")

## Model 1: ElasticNet Baseline

In [None]:
SIGNAL_MULTIPLIER = 400.0  # Tune this!

def convert_return_to_signal(predicted_return, multiplier=SIGNAL_MULTIPLIER):
    """
    Turn a predicted return into an allocation signal bounded to [0, 2].

    The signal is a linear function of the prediction, centred on 1.0:
    a prediction of 0 yields 1.0 (neutral), positive predictions push the
    signal above 1 and negative predictions push it below 1. Anything
    outside the [0, 2] band is clipped to the nearest bound.

    Args:
        predicted_return: Scalar or array-like of predicted returns.
        multiplier: Slope of the linear mapping.

    Returns:
        The clipped signal, with the same shape as the input.
    """
    neutral = 1.0
    lower_bound, upper_bound = 0.0, 2.0
    raw_signal = neutral + multiplier * predicted_return
    return np.clip(raw_signal, lower_bound, upper_bound)

class ElasticNetModel:
    """ElasticNet regression on the feature columns, mapped to allocation signals.

    The model predicts 'market_forward_excess_returns' from every column whose
    name starts with one of the feature prefixes, then converts the predicted
    returns to signals with convert_return_to_signal.

    Args:
        signal_multiplier: Multiplier passed to convert_return_to_signal.
    """

    def __init__(self, signal_multiplier=400.0):
        self.signal_multiplier = signal_multiplier
        self.model = None          # fitted sklearn Pipeline, set by fit()
        self.feature_cols = None   # feature column names, set by fit()

    def fit(self, train_fold):
        """Fit the impute -> scale -> ElasticNetCV pipeline on train_fold.

        Args:
            train_fold: DataFrame containing the feature columns and the
                'market_forward_excess_returns' target. Rows with a missing
                target are dropped before fitting.
        """
        # The inner CV below is time-ordered, so make sure the rows are too.
        if 'date_id' in train_fold.columns:
            train_fold = train_fold.sort_values('date_id', kind='mergesort')

        self.feature_cols = [c for c in train_fold.columns
                             if c.startswith(('M', 'E', 'I', 'P', 'V', 'S', 'MOM', 'D'))]

        y = train_fold['market_forward_excess_returns']

        # Drop rows with NaN target
        mask = y.notna()
        X = train_fold.loc[mask, self.feature_cols]
        y = y[mask]

        # Pipeline: impute → scale → ElasticNet
        self.model = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('regressor', ElasticNetCV(
                l1_ratio=0.5,
                alphas=np.logspace(-4, 2, 50),
                max_iter=100000,
                # Forward-chaining splits: alpha is always validated on rows
                # that come AFTER the rows it was fitted on. A plain integer
                # here would use K-fold, which validates on earlier data
                # using a fit on later data (look-ahead leakage).
                cv=TimeSeriesSplit(n_splits=3)
            ))
        ])

        self.model.fit(X, y)

    def predict(self, test_fold):
        """Predict allocation signals for test_fold.

        Args:
            test_fold: DataFrame containing the feature columns seen in fit().

        Returns:
            Array of signals produced by convert_return_to_signal.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.model is None or self.feature_cols is None:
            raise RuntimeError("ElasticNetModel.predict() called before fit()")

        X_test = test_fold[self.feature_cols]
        predicted_returns = self.model.predict(X_test)
        signals = convert_return_to_signal(predicted_returns, self.signal_multiplier)
        return signals

In [None]:
# Cross-validate ElasticNet
def elasticnet_predict_wrapper(train_fold, test_fold):
    """Train a fresh ElasticNetModel on train_fold and return its signals for test_fold."""
    enet = ElasticNetModel(signal_multiplier=400.0)
    enet.fit(train_fold)
    fold_signals = enet.predict(test_fold)
    return fold_signals

# results_enet = time_series_cv(elasticnet_predict_wrapper, train_df, n_splits=5, test_size=180)
# plot_cv_results(results_enet)

## Model 2: LightGBM

In [None]:
class LGBMModel:
    """LightGBM regression on the feature columns, mapped to allocation signals.

    The model predicts 'market_forward_excess_returns' from every column whose
    name starts with one of the feature prefixes, then converts the predicted
    returns to signals with convert_return_to_signal.

    Args:
        signal_multiplier: Multiplier passed to convert_return_to_signal.
        lgb_params: LightGBM parameter dict; a default regression
            configuration is used when omitted (or empty).
        valid_fraction: Share of the most recent training rows held out as the
            early-stopping validation set. Must lie strictly between 0 and 1.

    Raises:
        ValueError: If valid_fraction is not in the open interval (0, 1).
    """

    def __init__(self, signal_multiplier=400.0, lgb_params=None, valid_fraction=0.2):
        if not 0.0 < valid_fraction < 1.0:
            raise ValueError(
                f"valid_fraction must be in (0, 1), got {valid_fraction!r}"
            )
        self.signal_multiplier = signal_multiplier
        self.valid_fraction = valid_fraction
        self.lgb_params = lgb_params or {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 63,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1
        }
        self.model = None          # fitted lgb.Booster, set by fit()
        self.feature_cols = None   # feature column names, set by fit()

    def fit(self, train_fold):
        """Train the booster, early-stopping on a chronological hold-out.

        Args:
            train_fold: DataFrame containing the feature columns and the
                'market_forward_excess_returns' target. Rows with a missing
                target are dropped before fitting.
        """
        # Order rows by time so the tail slice below is the most recent data.
        if 'date_id' in train_fold.columns:
            train_fold = train_fold.sort_values('date_id', kind='mergesort')

        self.feature_cols = [c for c in train_fold.columns
                             if c.startswith(('M', 'E', 'I', 'P', 'V', 'S', 'MOM', 'D'))]

        y = train_fold['market_forward_excess_returns']
        mask = y.notna()
        # Missing features are encoded with a -999 sentinel (kept from the
        # original pipeline; LightGBM can also consume NaN directly).
        X = train_fold.loc[mask, self.feature_cols].fillna(-999)
        y = y[mask]

        n_valid = int(len(X) * self.valid_fraction)
        if n_valid == 0:
            # Too few rows to carve out a validation set: train for the full
            # number of rounds without early stopping.
            train_set = lgb.Dataset(X, label=y)
            self.model = lgb.train(
                self.lgb_params,
                train_set,
                num_boost_round=3000,
                callbacks=[lgb.log_evaluation(0)]
            )
            return

        # Early stopping must watch data the booster is NOT trained on;
        # monitoring the training set itself cannot reveal overfitting.
        X_train, X_valid = X.iloc[:-n_valid], X.iloc[-n_valid:]
        y_train, y_valid = y.iloc[:-n_valid], y.iloc[-n_valid:]

        train_set = lgb.Dataset(X_train, label=y_train)
        valid_set = lgb.Dataset(X_valid, label=y_valid, reference=train_set)
        self.model = lgb.train(
            self.lgb_params,
            train_set,
            num_boost_round=3000,
            valid_sets=[valid_set],
            callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)]
        )

    def predict(self, test_fold):
        """Predict allocation signals for test_fold.

        Args:
            test_fold: DataFrame containing the feature columns seen in fit().

        Returns:
            Array of signals produced by convert_return_to_signal.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.model is None or self.feature_cols is None:
            raise RuntimeError("LGBMModel.predict() called before fit()")

        # Same sentinel encoding as in fit().
        X_test = test_fold[self.feature_cols].fillna(-999)
        predicted_returns = self.model.predict(X_test)
        signals = convert_return_to_signal(predicted_returns, self.signal_multiplier)
        return signals

In [None]:
# Cross-validate LightGBM
def lgbm_predict_wrapper(train_fold, test_fold):
    """Train a fresh LGBMModel on train_fold and return its signals for test_fold."""
    booster_model = LGBMModel(signal_multiplier=400.0)
    booster_model.fit(train_fold)
    fold_signals = booster_model.predict(test_fold)
    return fold_signals

# results_lgbm = time_series_cv(lgbm_predict_wrapper, train_df, n_splits=5, test_size=180)
# plot_cv_results(results_lgbm)

## Model 3: Simple Ensemble

In [None]:
class EnsembleModel:
    """Weighted blend of the ElasticNetModel and LGBMModel signals.

    Args:
        weights: Two-element sequence [ElasticNet weight, LGBM weight].
            Defaults to [0.5, 0.5] when omitted or empty. The weights are
            used as given (they are not normalised).

    Raises:
        ValueError: If weights does not contain exactly two values.
    """

    def __init__(self, weights=None):
        # Explicit None/empty check: `weights or default` raises on NumPy
        # arrays because their truth value is ambiguous.
        if weights is None or len(weights) == 0:
            weights = [0.5, 0.5]  # [ElasticNet, LGBM]
        if len(weights) != 2:
            raise ValueError(
                f"weights must contain exactly 2 values [ElasticNet, LGBM], got {len(weights)}"
            )
        # Copy so later mutation of the caller's sequence cannot change the blend.
        self.weights = [float(w) for w in weights]
        self.model_enet = ElasticNetModel(signal_multiplier=400.0)
        self.model_lgbm = LGBMModel(signal_multiplier=400.0)

    def fit(self, train_fold):
        """Fit both member models on the same training fold."""
        self.model_enet.fit(train_fold)
        self.model_lgbm.fit(train_fold)

    def predict(self, test_fold):
        """Return the weighted sum of both models' signals, clipped to [0, 2]."""
        pred_enet = self.model_enet.predict(test_fold)
        pred_lgbm = self.model_lgbm.predict(test_fold)

        ensemble_pred = (
            self.weights[0] * pred_enet +
            self.weights[1] * pred_lgbm
        )

        # Weights that do not sum to 1 can push the blend outside the valid band.
        return np.clip(ensemble_pred, 0.0, 2.0)

In [None]:
# Cross-validate Ensemble
def ensemble_predict_wrapper(train_fold, test_fold):
    """Train a fresh EnsembleModel on train_fold and return its signals for test_fold."""
    blend = EnsembleModel(weights=[0.3, 0.7])  # Tune these!
    blend.fit(train_fold)
    fold_signals = blend.predict(test_fold)
    return fold_signals

# results_ensemble = time_series_cv(ensemble_predict_wrapper, train_df, n_splits=5, test_size=180)
# plot_cv_results(results_ensemble)

## Compare All Models

In [None]:
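# Uncomment to run the full comparison. The cross-validation calls above must be
# uncommented and run first so that results_enet, results_lgbm and results_ensemble exist.
# results_08 and results_10 (the constant-allocation baselines) are not defined in the
# cells shown here and must be computed separately.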
# results = {
#     'Constant 0.8': results_08,
#     'Constant 1.0': results_10,
#     'ElasticNet': results_enet,
#     'LightGBM': results_lgbm,
#     'Ensemble': results_ensemble
# }

# comparison_df = pd.DataFrame({
#     'Model': list(results.keys()),
#     'Mean Score': [r['mean_score'] for r in results.values()],
#     'Std Score': [r['std_score'] for r in results.values()]
# }).sort_values('Mean Score', ascending=False)

# display(comparison_df)

# # Plot comparison
# plt.figure(figsize=(10, 5))
# plt.barh(comparison_df['Model'], comparison_df['Mean Score'])
# plt.xlabel('Mean CV Score')
# plt.title('Model Comparison')
# plt.grid(alpha=0.3)
# plt.tight_layout()
# plt.show()

---
## ✅ Next Steps:
1. Tune `SIGNAL_MULTIPLIER` for each model
2. Tune ensemble weights
3. Add feature engineering
4. Create submission notebook with best model