# 02 — Models & CV (Grouped by `cell_id`)

Compares multiple models, then tunes an Elastic Net with polynomial features on cells with ≥5 diagnostics.  
**Rubric hooks**: multiple appropriate models, cross-validation, tuning, results & analysis, discussion.

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, GroupKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR

# Load the labelled feature table and order it by cell, then by diagnostic
# index. mergesort is a stable sort, so rows with equal keys keep file order.
CSV = 'results/rpt_features_labeled_enriched.csv'
df = pd.read_csv(CSV)
df = df.sort_values(['cell_id', 'diag'], kind='mergesort')

# `src` is the per-row string that the rate/temperature parsers read.
# Prefer the recorded source file name; otherwise synthesise an identifier
# from the cell id and diagnostic index.
if 'source_file' in df.columns:
    src = df['source_file'].astype(str)
else:
    src = df['cell_id'].astype(str) + '_diag' + df['diag'].astype(str)
def parse_rate(s):
    """Extract a numeric rate from a name such as ``..._0_5C...``.

    The pattern looks for an underscore, one or more digits, an optional
    ``_digits`` fractional part, and a trailing ``C`` (case-insensitive).
    An underscore inside the number is read as the decimal point, so
    ``_0_5C`` yields ``0.5``.

    Returns:
        The parsed value as a float, or ``np.nan`` when nothing matches.
    """
    hit = re.search(r'_([0-9]+(?:_[0-9]+)?)C', s, flags=re.IGNORECASE)
    if hit is None:
        return np.nan
    number_text = hit.group(1).replace('_', '.')
    return float(number_text)
def parse_temp(s):
    """Extract a two-digit number that follows a ``T``/``t`` or ``N``/``n``.

    A ``T`` match anywhere in the string takes precedence; the ``N``
    pattern is only tried when no ``T`` match exists.

    NOTE(review): the value after ``N`` is returned as a positive number.
    If ``N`` is meant to mark a negative temperature, the sign is lost here
    -- confirm against the file-naming convention.

    Returns:
        The two digits as a float, or ``np.nan`` when neither pattern matches.
    """
    for pattern in (r'[Tt](\d{2})', r'[Nn](\d{2})'):
        hit = re.search(pattern, s)
        if hit:
            return float(hit.group(1))
    return np.nan
# Derive rate / temperature columns from the source string only when the
# CSV does not already provide them.
if 'c_rate' not in df.columns:
    df['c_rate'] = src.apply(parse_rate)
if 'temp_c' not in df.columns:
    df['temp_c'] = src.apply(parse_temp)

# Baseline feature matrix. Missing values are imputed on the copy `X` only;
# `df` itself keeps its NaNs.
X = df[['diag', 'capacity_ah', 'fade_frac', 'cap_slope_k3', 'c_rate', 'temp_c']].copy()
fill_values = {
    'c_rate': 0.0,
    # Median of the observed temperatures, or 23.0 if none were parsed.
    'temp_c': X['temp_c'].median() if X['temp_c'].notna().any() else 23.0,
    'cap_slope_k3': 0.0,
}
for col, val in fill_values.items():
    X[col] = X[col].fillna(val)

# Target and grouping key (one group per cell) as plain arrays.
y = df['RUL'].values
groups = df['cell_id'].values

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``.

    The result is converted to a built-in ``float``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))
# Scorers shared by every cross_validate call. The two error metrics are
# wrapped with greater_is_better=False, so cross_validate reports them
# negated; callers flip the sign back when summarising.
scoring = {
    'rmse': make_scorer(rmse, greater_is_better=False),
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'r2': 'r2',
}

# Grouped folds keyed by cell id: at most 5 splits, fewer when there are
# fewer distinct cells.
cv = GroupKFold(n_splits=min(5, np.unique(groups).size))

### CV leaderboard (baseline mean, diag-only LR, EN, RF, HGBM, SVR)

In [None]:
# Leaderboard rows accumulate here, one dict per model.
rows = []

# Baseline: predict the training-fold mean of y for every validation row.
fold_scores = []
for tr, va in cv.split(X, y, groups=groups):
    yhat = np.full_like(y[va], y[tr].mean(), dtype=float)
    fold_scores.append((
        rmse(y[va], yhat),
        mean_absolute_error(y[va], yhat),
        r2_score(y[va], yhat),
    ))

# Unzip the per-fold tuples into one list per metric.
rm, ma, rr = (list(metric) for metric in zip(*fold_scores))

rows.append({
    'model': 'baseline_mean',
    'rmse_mean': np.mean(rm),
    'rmse_std': np.std(rm),
    'mae_mean': np.mean(ma),
    'mae_std': np.std(ma),
    'r2_mean': np.mean(rr),
})

# Reference model: standardised linear regression on the diagnostic index
# alone, evaluated on the same grouped folds as every other model.
pipe_lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
])
cvres = cross_validate(
    pipe_lr, df[['diag']], y,
    groups=groups, cv=cv, scoring=scoring, n_jobs=-1,
)

# Error scorers come back negated; flip the sign before summarising.
rows.append(dict(
    model='linear_diag_only',
    rmse_mean=-np.mean(cvres['test_rmse']),
    rmse_std=np.std(-cvres['test_rmse']),
    mae_mean=-np.mean(cvres['test_mae']),
    mae_std=np.std(-cvres['test_mae']),
    r2_mean=np.mean(cvres['test_r2']),
))

# Candidate regressors evaluated on the full baseline feature matrix X.
# Elastic Net and SVR are wrapped with a StandardScaler; the two tree
# ensembles are used on the raw features.
models = {
    'elastic_net': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('enet', ElasticNet(alpha=0.01, l1_ratio=0.2, max_iter=8000, random_state=42)),
    ]),
    'random_forest': RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1),
    'hist_gbm': HistGradientBoostingRegressor(random_state=42),
    'svr_rbf': Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('svr', SVR(kernel='rbf', C=10, gamma=0.001, epsilon=0.2)),
    ]),
}

for name, mdl in models.items():
    cvres = cross_validate(
        mdl, X, y,
        groups=groups, cv=cv, scoring=scoring, n_jobs=-1,
    )
    # Error scorers come back negated; flip the sign before summarising.
    rows.append(dict(
        model=name,
        rmse_mean=-np.mean(cvres['test_rmse']),
        rmse_std=np.std(-cvres['test_rmse']),
        mae_mean=-np.mean(cvres['test_mae']),
        mae_std=np.std(-cvres['test_mae']),
        r2_mean=np.mean(cvres['test_r2']),
    ))

# Leaderboard ordered by mean RMSE ascending (best first), saved to disk
# and displayed as the cell output.
lb = pd.DataFrame(rows).sort_values('rmse_mean')
os.makedirs('results', exist_ok=True)
lb.to_csv('results/leaderboard.csv', index=False)
lb

### Tuned Elastic Net (poly) on cells with ≥5 diagnostics + OOF metrics

In [None]:
# Keep only cells that have at least 5 distinct diagnostic indices.
cnt = df.groupby('cell_id')['diag'].nunique()
keep = cnt[cnt >= 5].index
df_f = df[df['cell_id'].isin(keep)].copy()

# Per-cell normalised features.
# NOTE(review): diag_norm divides by each cell's maximum diag, a value that
# is only known once the cell's full series exists. If RUL is derived from
# the final diagnostic, this leaks the target -- confirm.
df_f['diag_norm'] = df_f['diag'] / df_f.groupby('cell_id')['diag'].transform('max')
# 'first' takes the first row of each cell in the current row order, so
# this assumes rows are ordered by diag within each cell.
df_f['cap_pct'] = df_f['capacity_ah'] / df_f.groupby('cell_id')['capacity_ah'].transform('first')

Xf = df_f[['diag', 'diag_norm', 'capacity_ah', 'cap_pct', 'fade_frac',
           'cap_slope_k3', 'c_rate', 'temp_c']].copy()
Xf['cap_slope_k3'] = Xf['cap_slope_k3'].fillna(0.0)
# c_rate / temp_c can still hold NaN in df (the baseline cell imputes them
# only on its own copy X). PolynomialFeatures and ElasticNet reject NaN, so
# apply the same imputation rule here: 0.0 for the rate, and the median
# observed temperature (23.0 if none is available) for the temperature.
Xf['c_rate'] = Xf['c_rate'].fillna(0.0)
Xf['temp_c'] = Xf['temp_c'].fillna(
    Xf['temp_c'].median() if Xf['temp_c'].notna().any() else 23.0
)

# Target and grouping key for the filtered subset.
yf = df_f['RUL'].values
gf = df_f['cell_id'].values

def _rmse_for_search(yt, yp):
    """Root-mean-squared error as a built-in float, used as the tuning metric."""
    return float(np.sqrt(mean_squared_error(yt, yp)))


# Degree-2 polynomial expansion -> standardisation -> Elastic Net.
pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('enet', ElasticNet(max_iter=15000, random_state=42)),
])

# 15 log-spaced alphas from 1e-3 to 1e1, crossed with 4 L1/L2 mixing ratios.
param_grid = {
    'enet__alpha': np.logspace(-3, 1, 15),
    'enet__l1_ratio': [0.05, 0.2, 0.5, 0.8],
}

# Grouped folds over the filtered cells (at most 5 splits).
cv2 = GroupKFold(n_splits=min(5, len(np.unique(gf))))

# greater_is_better=False makes the search maximise the negated RMSE, so
# best_score_ is negative and is flipped back when printed.
scorer = make_scorer(_rmse_for_search, greater_is_better=False)
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv2,
    n_jobs=-1,
    refit=True,
)
search.fit(Xf, yf, groups=gf)
print('Best params:', search.best_params_, 'CV RMSE:', -search.best_score_)

# Out-of-fold predictions for the tuned pipeline: every row is predicted by
# a model that was trained without that row's cell.
# NOTE(review): the hyper-parameters were selected on these same folds, so
# the OOF metrics are likely optimistic compared with nested CV.
oof = np.full_like(yf, np.nan, dtype=float)
for tr, va in cv2.split(Xf, yf, groups=gf):
    # Fit a fresh clone per fold. Fitting search.best_estimator_ directly
    # would refit that very object in place, leaving it trained on the last
    # fold's training split instead of the full refit data.
    model = clone(search.best_estimator_)
    model.fit(Xf.iloc[tr], yf[tr])
    oof[va] = model.predict(Xf.iloc[va])

def rmse_fn(yt, yp):
    """Return the root-mean-squared error of ``yp`` against ``yt`` as a float."""
    squared_error_mean = mean_squared_error(yt, yp)
    return float(np.sqrt(squared_error_mean))

# Pooled out-of-fold metrics over all filtered rows.
oof_rmse = rmse_fn(yf, oof)
oof_mae = mean_absolute_error(yf, oof)
oof_r2 = r2_score(yf, oof)
print(f'OOF (filtered, tuned) — RMSE={oof_rmse:.3f}, MAE={oof_mae:.3f}, R^2={oof_r2:.3f}')

# Per-cell breakdown of the same metrics. The value columns are selected
# before apply() so the grouping column is not passed into the callback;
# operating on grouping columns inside apply() is deprecated in recent
# pandas. reset_index() restores cell_id as a regular column.
per_cell = (
    pd.DataFrame({'cell_id': df_f['cell_id'], 'y': yf, 'oof': oof})
    .groupby('cell_id')[['y', 'oof']]
    .apply(lambda g: pd.Series({
        'rmse': rmse_fn(g['y'], g['oof']),
        'mae': mean_absolute_error(g['y'], g['oof']),
        'r2': r2_score(g['y'], g['oof']),
    }))
    .reset_index()
)
per_cell.to_csv('results/per_cell_oof_metrics_tuned.csv', index=False)

# Parity plot: true RUL against out-of-fold prediction, with the y = x
# reference line drawn across the combined range of both axes.
plt.figure(dpi=120)
plt.scatter(yf, oof, s=24, alpha=0.7)
lo = float(min(np.nanmin(yf), np.nanmin(oof)))
hi = float(max(np.nanmax(yf), np.nanmax(oof)))
plt.plot([lo, hi], [lo, hi], 'k--')
plt.xlabel('True RUL')
plt.ylabel('Predicted RUL (OOF)')
plt.title('Parity — Elastic Net (poly, filtered)')
plt.tight_layout()
# savefig does not create missing directories; make sure the figure folder
# exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('results/figs', exist_ok=True)
plt.savefig('results/figs/parity_plot_oof_tuned.png')
plt.show()