# 04 - Modeling

This notebook trains and evaluates models for predicting DEX token market cap:
- Baseline models (persistence, 7-day moving average)
- ElasticNet regression
- Random Forest regression

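Uses nested time-series cross-validation (expanding-window outer folds for evaluation, inner folds for hyperparameter tuning) to prevent look-ahead leakage: every model is tuned and scored only on observations that come after its training data.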

In [None]:
import os
import sys
import json
from collections import defaultdict, Counter

sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

from src import preprocessing as prep
from src import evaluation as evals

In [None]:
# Load processed panel
# The path is relative to the notebook's working directory. Later cells read
# the 'protocol', 'date' and 'market_cap_circulating' columns of this frame.
panel_feat = pd.read_parquet('../data/processed/panel.parquet')
print(f"Loaded panel with shape: {panel_feat.shape}")

In [None]:
def make_baseline_predictions(y_log, dates, outer_splits=5, test_size=14):
    """Generate one-step-ahead baseline predictions for every outer test fold.

    Two naive forecasters are produced on the folds of a
    ``TimeSeriesSplit(n_splits=outer_splits, test_size=test_size)``:

    * ``'persistence'`` -- the prediction for a row is the observed value of
      the row immediately before it.
    * ``'ma7'`` -- the prediction for a row is the mean of the (up to) seven
      observed values immediately before it (NaNs are skipped by the mean).

    The look-back is done by row *position* rather than by index label, so
    the result does not depend on ``y_log`` carrying a contiguous
    ``0..n-1`` integer index. Rows are assumed to be in chronological order.

    Parameters
    ----------
    y_log : pandas.Series
        Target series, one row per time step, in chronological order.
    dates : array-like
        Not used by the computation; kept so existing callers keep working.
    outer_splits : int, default 5
        Number of outer folds.
    test_size : int, default 14
        Number of rows in each outer test fold.

    Returns
    -------
    dict
        Maps ``'persistence'`` and ``'ma7'`` to a float ``pandas.Series``
        holding the predictions for all test rows, labelled with the
        corresponding ``y_log`` index labels and sorted by index.
    """
    ma_window = 7
    tscv = TimeSeriesSplit(n_splits=outer_splits, test_size=test_size)
    preds = {'persistence': [], 'ma7': []}

    for _train_idx, test_idx in tscv.split(y_log):
        # Every test position is preceded by at least one training row, so
        # position - 1 is always a valid look-back target.
        test_labels = y_log.index[test_idx]

        # Persistence: y_t = y_{t-1}
        pers = pd.Series(
            y_log.iloc[test_idx - 1].to_numpy(dtype=float),
            index=test_labels,
        )

        # 7-day MA over the rows strictly before each test row
        ma7 = pd.Series(
            [y_log.iloc[max(pos - ma_window, 0):pos].mean() for pos in test_idx],
            index=test_labels,
            dtype=float,
        )

        preds['persistence'].append(pers)
        preds['ma7'].append(ma7)

    return {k: pd.concat(v).sort_index() for k, v in preds.items()}


def nested_cv_predictions(X, y, model_name, outer_splits=5, test_size=14):
    """Produce out-of-fold predictions with nested time-series CV.

    The outer loop is a ``TimeSeriesSplit`` with ``outer_splits`` folds of
    ``test_size`` rows. Inside each outer training fold a ``GridSearchCV``
    (inner ``TimeSeriesSplit`` with 3 folds of 7 rows, scored by negative
    mean squared error) selects hyperparameters; the refit best estimator
    then predicts the outer test rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, rows in chronological order.
    y : pandas.Series
        Target aligned row-for-row with ``X``.
    model_name : str
        ``'ElasticNet'`` (standardised inputs + ElasticNet) or
        ``'RandomForest'``.
    outer_splits : int, default 5
        Number of outer folds.
    test_size : int, default 14
        Rows per outer test fold.

    Returns
    -------
    tuple
        ``(predictions, params)`` where ``predictions`` is a float Series
        indexed like ``y`` and restricted to rows that received a prediction,
        and ``params`` is the list of best parameter dicts, one per outer
        fold in fold order.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name not in ('ElasticNet', 'RandomForest'):
        raise ValueError('Unsupported model')

    if model_name == 'ElasticNet':
        # Scaling lives inside the pipeline so it is refit on each CV split.
        estimator = Pipeline([
            ('scaler', StandardScaler()),
            ('est', ElasticNet(max_iter=10000, random_state=42)),
        ])
        search_grid = {
            'est__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
            'est__l1_ratio': [0.1, 0.5, 0.9],
        }
    else:
        estimator = RandomForestRegressor(random_state=42, n_jobs=-1)
        search_grid = {
            'n_estimators': [200, 500],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
        }

    outer_cv = TimeSeriesSplit(n_splits=outer_splits, test_size=test_size)
    predictions = pd.Series(index=y.index, dtype=float)
    chosen_params = []

    for train_pos, test_pos in outer_cv.split(X):
        inner_cv = TimeSeriesSplit(n_splits=3, test_size=7)
        search = GridSearchCV(
            estimator,
            search_grid,
            cv=inner_cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        search.fit(X.iloc[train_pos], y.iloc[train_pos])

        chosen_params.append(search.best_params_)
        predictions.iloc[test_pos] = search.best_estimator_.predict(X.iloc[test_pos])

    # Rows never used as an outer test row stay NaN and are dropped here.
    return predictions.dropna(), chosen_params


def metrics_from_oof(y_log, yhat_log):
    """Score out-of-fold predictions against the matching true values.

    Only index labels present in both series are scored; the aligned value
    arrays are handed to ``evals.compute_metrics`` as (true, predicted) and
    its result is returned unchanged.

    Parameters
    ----------
    y_log : pandas.Series
        True target values (presumably on a log scale, per the name --
        confirm against ``evals.compute_metrics``).
    yhat_log : pandas.Series
        Predictions, indexed with a subset of the ``y_log`` labels.
    """
    shared_labels = yhat_log.index.intersection(y_log.index)
    return evals.compute_metrics(
        y_log.loc[shared_labels].values,
        yhat_log.loc[shared_labels].values,
    )

In [None]:
# Train models per protocol
protocols = sorted(panel_feat['protocol'].dropna().unique())
all_records = []
decision_records = []
top_features_records = []

# Smallest series the nested CV can handle with its default settings:
# 5 outer folds x 14 test rows leave n - 70 rows in the first outer training
# fold, and the inner 3-fold x 7-row split needs at least one row more than
# 21 there. Shorter series make TimeSeriesSplit raise ValueError.
min_rows_for_nested_cv = 5 * 14 + 3 * 7 + 1

for proto in protocols:
    pdf = panel_feat[panel_feat['protocol'] == proto].sort_values('date').reset_index(drop=True)

    # Require at least 90 days with target
    if pdf['market_cap_circulating'].notna().sum() < 90:
        print(f"Skipping {proto}: insufficient data")
        continue

    print(f"\nProcessing {proto}...")

    X_full, y_full = prep.build_feature_matrix(pdf, target_col='market_cap_circulating')
    meta = X_full[['protocol', 'date']]
    X = X_full.drop(columns=['protocol', 'date'])
    y = y_full

    # Impute missing values: carry the last observed value forward (uses past
    # data only), then zero-fill whatever is still missing at the start.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    X = X.ffill().fillna(0.0).reset_index(drop=True)
    y = y.reset_index(drop=True)
    meta = meta.reset_index(drop=True)

    # The 90-day check above runs before feature construction, so re-check
    # the rows that are actually available for cross-validation.
    if len(y) < min_rows_for_nested_cv:
        print(f"Skipping {proto}: only {len(y)} usable rows after feature construction")
        continue

    # Baselines
    base_preds = make_baseline_predictions(y, meta['date'])
    for bname, preds in base_preds.items():
        mets = metrics_from_oof(y, preds)
        all_records.append({'protocol': proto, 'model': bname, **mets})

    # Learned models
    for model_name in ['ElasticNet', 'RandomForest']:
        yhat, best_params_list = nested_cv_predictions(X, y, model_name=model_name)
        mets = metrics_from_oof(y, yhat)
        all_records.append({'protocol': proto, 'model': model_name, **mets})
        # Only the hyperparameters chosen on the last (largest) outer fold
        # are recorded, serialised as JSON so they fit in one CSV column.
        decision_records.append({
            'protocol': proto,
            'model': model_name,
            'best_params_last_fold': json.dumps(best_params_list[-1] if best_params_list else {})
        })
        print(f"  {model_name}: RMSE=${mets['rmse_usd']:,.0f}, R2={mets['r2']:.3f}")

In [None]:
# Create metrics DataFrame: one row per (protocol, model) record collected
# in all_records by the training loop.
metrics_df = pd.DataFrame(all_records)
print("\nModel performance by protocol:")
# Bare expression so the notebook renders the table as the cell output.
metrics_df

In [None]:
# Save results
# Paths are relative to the notebook's working directory; the output folder
# is created if it does not exist yet.
os.makedirs('../results/tables', exist_ok=True)
# Per-protocol, per-model metrics (baselines and learned models).
metrics_df.to_csv('../results/tables/metrics_per_protocol.csv', index=False)

# Best hyperparameters of the last outer fold for each learned model.
decision_df = pd.DataFrame(decision_records)
decision_df.to_csv('../results/tables/decision_matrix.csv', index=False)

print("Results saved to results/tables/")