In [None]:
import os
import pandas as pd
import polars as pl
import numpy as np
from pathlib import Path

from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import kaggle_evaluation.default_inference_server

## Configuration

In [None]:
# ---------------------------------------------------------------------------
# Input location: directory holding the competition files
# ---------------------------------------------------------------------------
DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')

# ---------------------------------------------------------------------------
# Tunables (placeholder values -- replace with the results of your own CV)
# ---------------------------------------------------------------------------
# Scale applied to each model's predicted return before it is shifted to an
# allocation centred on 1.0.
SIGNAL_MULTIPLIER_ENET = SIGNAL_MULTIPLIER_LGBM = 400.0

# Blend weights for the two per-model allocations.
WEIGHT_ENET, WEIGHT_LGBM = 0.3, 0.7

# Smallest date_id kept for training; earlier rows are dropped as too sparse.
CUTOFF_DATE = 1000

# ---------------------------------------------------------------------------
# Lazily populated state, filled in by train_models()
# ---------------------------------------------------------------------------
MODEL_ENET = MODEL_LGBM = FEATURE_COLS = None
FITTED = False

## Helper Functions

In [None]:
def convert_return_to_signal(predicted_return, multiplier):
    """
    Turn a predicted return into an allocation bounded to [0, 2].

    The prediction is scaled by ``multiplier`` and shifted by 1.0, so a
    predicted return of zero maps to an allocation of 1.0; the result is then
    clamped to the allowed range. Works element-wise on arrays.
    """
    min_allocation, max_allocation = 0.0, 2.0
    scaled_return = predicted_return * multiplier
    return np.clip(scaled_return + 1.0, min_allocation, max_allocation)

## Model Training (Run Once)

In [None]:
def train_models():
    """
    Fit the ElasticNet pipeline and the LightGBM booster on the training CSV.

    Reads ``train.csv`` from ``DATA_PATH``, keeps rows with
    ``date_id >= CUTOFF_DATE`` and a non-null ``market_forward_excess_returns``
    target, and trains both models on every column whose name starts with one
    of the feature prefixes.

    On success the results are published through the module globals
    ``MODEL_ENET``, ``MODEL_LGBM`` and ``FEATURE_COLS``, and ``FITTED`` is set
    to True. The globals are only assigned after both fits have completed, so
    a failure part-way through never leaves a half-initialised state.
    ``predict()`` calls this lazily on its first invocation.

    Raises
    ------
    ValueError
        If no feature columns are found, or if no training rows remain after
        the date cutoff and NaN-target filtering.
    """
    global MODEL_ENET, MODEL_LGBM, FEATURE_COLS, FITTED

    print("Loading training data...")
    train_df = pd.read_csv(DATA_PATH / 'train.csv')

    # Trim early sparse dates
    train_df = train_df[train_df['date_id'] >= CUTOFF_DATE].reset_index(drop=True)
    print(f"Training on {len(train_df)} samples (date_id >= {CUTOFF_DATE})")

    # Feature columns are selected by name prefix. A separate 'MOM' prefix is
    # unnecessary: any name starting with 'MOM' already matches 'M'.
    feature_prefixes = ('M', 'E', 'I', 'P', 'V', 'S', 'D')
    feature_cols = [c for c in train_df.columns if c.startswith(feature_prefixes)]
    if not feature_cols:
        raise ValueError(
            f"No feature columns with prefixes {feature_prefixes} found in train.csv"
        )

    X = train_df[feature_cols]
    y = train_df['market_forward_excess_returns']

    # Drop rows with NaN target
    mask = y.notna()
    X = X[mask]
    y = y[mask]

    print(f"Training samples after dropping NaN targets: {len(X)}")
    if X.empty:
        raise ValueError(
            f"No training rows left after filtering (date_id >= {CUTOFF_DATE}, "
            "non-null target)"
        )

    # ==================== Train ElasticNet ====================
    print("\nTraining ElasticNet...")
    # NOTE(review): features are standardised, so the penalty strength is
    # relative to the target's scale. If the target is on a daily-return scale
    # (~1e-2), alpha=0.1 is likely strong enough to shrink every coefficient
    # to zero, leaving a constant prediction -- confirm with CV.
    enet_model = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('regressor', ElasticNet(
            alpha=0.1,  # Use best alpha from your CV!
            l1_ratio=0.5,
            max_iter=100000
        ))
    ])
    enet_model.fit(X, y)
    print("ElasticNet trained.")

    # ==================== Train LightGBM ====================
    print("\nTraining LightGBM...")
    # Missing values are encoded with a -999 sentinel; predict() applies the
    # same fill, so the two must stay in sync.
    X_lgbm = X.fillna(-999)
    train_set = lgb.Dataset(X_lgbm, label=y)

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1
    }

    # No valid_sets: evaluating the training set on every round only produced
    # a metric that was never logged or used, and does not affect the model.
    lgbm_model = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=3000,  # Use best n_round from your CV!
    )
    print("LightGBM trained.")

    # Publish everything at once, only after both fits succeeded.
    FEATURE_COLS = feature_cols
    MODEL_ENET = enet_model
    MODEL_LGBM = lgbm_model
    FITTED = True
    print("\n✅ All models trained and ready!")

## Prediction Function (Kaggle API)

In [None]:
def predict(test: pl.DataFrame) -> "float | np.ndarray":
    """
    Main prediction function called by the Kaggle evaluation API.

    Trains both models on the first call (via ``train_models()``), then blends
    the ElasticNet and LightGBM allocations using ``WEIGHT_ENET`` and
    ``WEIGHT_LGBM``.

    Parameters:
    -----------
    test : pl.DataFrame
        Polars DataFrame with features for one or more test samples. It must
        contain every column listed in ``FEATURE_COLS``.

    Returns:
    --------
    float or np.ndarray
        A Python float when ``test`` has exactly one row, otherwise a float64
        array with one allocation per row. Every allocation lies in [0.0, 2.0].

    Raises:
    -------
    KeyError
        If ``test`` lacks any of the feature columns used in training.
    """
    # Train models on first call
    if not FITTED:
        train_models()

    # Convert to pandas
    test_pd = test.to_pandas()

    # Fail with an explicit message when the batch lacks training features.
    missing_cols = [c for c in FEATURE_COLS if c not in test_pd.columns]
    if missing_cols:
        raise KeyError(f"Test batch is missing feature columns: {missing_cols}")

    # Extract features and force numeric dtypes. A column can arrive with a
    # non-numeric dtype (e.g. when it is entirely null), which LightGBM would
    # reject; unparseable values become NaN and are handled by the
    # imputer / sentinel fill below.
    X_test = test_pd[FEATURE_COLS].apply(pd.to_numeric, errors='coerce')

    # ==================== ElasticNet Prediction ====================
    pred_returns_enet = MODEL_ENET.predict(X_test)
    signal_enet = convert_return_to_signal(pred_returns_enet, SIGNAL_MULTIPLIER_ENET)

    # ==================== LightGBM Prediction ====================
    # Same -999 sentinel as used when the booster was trained.
    X_test_lgbm = X_test.fillna(-999)
    pred_returns_lgbm = MODEL_LGBM.predict(X_test_lgbm)
    signal_lgbm = convert_return_to_signal(pred_returns_lgbm, SIGNAL_MULTIPLIER_LGBM)

    # ==================== Ensemble ====================
    final_signal = (
        WEIGHT_ENET * signal_enet +
        WEIGHT_LGBM * signal_lgbm
    )

    # Re-clip: the blend can leave [0, 2] if the weights do not sum to 1.
    final_signal = np.clip(final_signal, 0.0, 2.0)

    # Return scalar if single sample, else array
    if len(final_signal) == 1:
        return float(final_signal[0])
    return final_signal.astype(np.float64)

## Launch Inference Server

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Variable unset or empty: run the local gateway against the data in DATA_PATH.
    inference_server.run_local_gateway((str(DATA_PATH),))
else:
    # Competition rerun: serve predictions to Kaggle.
    inference_server.serve()

---
## ✅ Submission Checklist:
1. ✅ No leakage (no `true_targets` lookup)
2. ✅ Models trained only on past data
3. ✅ Proper signal conversion
4. ⚠️ Ensemble weights — placeholders until updated from CV (see "Before submitting")
5. ✅ Kaggle API integration

**Before submitting:**
- Update `SIGNAL_MULTIPLIER_*` from CV results
- Update `WEIGHT_*` from CV results
- Test locally first!