In [None]:
import pandas as pd
import numpy as np

# Load the checkpointed feature table. Judging by its filename, the file
# already carries the SOH proxy target and the train/val/test split labels.
# NOTE(review): the path is an absolute, Colab-looking location; adjust it
# when running in another environment.
parquet_path = "/content/canonical_checkpoint_features_with_soh_and_split.parquet"
df = pd.read_parquet(parquet_path)

# Preview the first rows as a quick sanity check on the load.
df.head()


Unnamed: 0,asset_id,cycle_id,V_mean,V_std,V_min,V_max,V_range,dV_dt_mean,dV_dt_max,T_mean,...,T_delta_base,duration_base,use_dV_dt,V_range_norm,T_delta_norm,duration_norm,dV_dt_norm,Degradation_Index,SOH_proxy,split
0,0,0,3.735076,0.245422,2.699819,4.191235,1.491416,-35.948592,38.111247,40.41934,...,1.335614,0.038896,True,0.692878,0.66949,0.729033,0.741807,0.708302,0.49248,train
1,0,1,3.735681,0.244972,2.699859,4.192679,1.492819,-36.342517,34.65244,40.327115,...,1.335614,0.038896,True,0.693348,0.688454,0.722873,0.693107,0.699446,0.496861,train
2,0,10,3.739525,0.240002,2.699924,4.192582,1.492658,-38.02239,41.584546,40.363359,...,1.335614,0.038896,True,0.693295,0.693133,0.699661,0.788435,0.718631,0.487419,train
3,0,11,3.739852,0.239651,2.699803,4.192502,1.492699,-38.328526,34.646588,40.433203,...,1.335614,0.038896,True,0.693308,0.711655,0.697407,0.693023,0.698848,0.497158,train
4,0,12,3.740231,0.238893,2.699924,4.192462,1.492538,-38.309224,34.655228,40.39325,...,1.335614,0.038896,True,0.693254,0.702437,0.695238,0.693147,0.696019,0.498566,train


In [None]:
# Name of the column in `df` used as the regression target.
# NOTE(review): the frame also carries Degradation_Index and several *_norm
# columns. If SOH_proxy is computed from those, the raw features may nearly
# determine the target — confirm before reading the scores as a measure of
# real-world generalisation.
target: str = "SOH_proxy"


In [None]:
# Raw per-row summary columns used as model inputs, grouped by name prefix.
# The *_base, *_norm, Degradation_Index and Q_total columns of the frame are
# deliberately not listed here.
feature_cols = [
    # V_* columns
    "V_mean", "V_std", "V_min", "V_max", "V_range",
    # dV_dt_* columns
    "dV_dt_mean", "dV_dt_max",
    # T_* columns
    "T_mean", "T_max", "T_delta",
    # duration column
    "duration_s",
]


In [None]:
# Partition the rows using the precomputed "split" label stored in the frame
# (no random splitting happens here).
train_df, val_df, test_df = (
    df.loc[df["split"].eq(label)] for label in ("train", "val", "test")
)

# For each partition, separate the model inputs from the regression target.
X_train, y_train = train_df[feature_cols], train_df[target]
X_val, y_val = val_df[feature_cols], val_df[target]
X_test, y_test = test_df[feature_cols], test_df[target]


In [None]:
# Show every column name of the loaded frame on one line.
print(list(df.columns))


['asset_id', 'cycle_id', 'V_mean', 'V_std', 'V_min', 'V_max', 'V_range', 'dV_dt_mean', 'dV_dt_max', 'T_mean', 'T_max', 'T_delta', 'Q_total', 'duration_s', 'source', 'V_range_base', 'dV_dt_base', 'T_delta_base', 'duration_base', 'use_dV_dt', 'V_range_norm', 'T_delta_norm', 'duration_norm', 'dV_dt_norm', 'Degradation_Index', 'SOH_proxy', 'split']


In [None]:
# Detach the feature matrices from `df` so they are independent copies
# rather than selections of the original frame.
X_train, X_val, X_test = (part.copy() for part in (X_train, X_val, X_test))

# Missing values are intentionally left in place (XGBoost / LightGBM can
# handle them); just report how many NaNs each training feature has.
print(X_train.isna().sum())


V_mean            0
V_std         10784
V_min             0
V_max             0
V_range           0
dV_dt_mean    10784
dV_dt_max     10784
T_mean            0
T_max             0
T_delta           0
duration_s        0
dtype: int64


In [None]:
from xgboost import XGBRegressor

# Baseline gradient-boosted regressor. The hyperparameters are gathered in a
# single mapping so they can be inspected or logged in one place.
xgb_params = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,  # fixed seed for the row/column subsampling
    n_jobs=-1,
)
model = XGBRegressor(**xgb_params)

# The validation split is handed over as eval_set, but no early stopping is
# configured in this cell, so all n_estimators trees are always built.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def eval_split(name, X, y, estimator=None):
    """Score a fitted regressor on one data split.

    Parameters
    ----------
    name : str
        Label stored under the "split" key of the result (e.g. "train").
    X : array-like
        Feature matrix passed to ``predict``.
    y : array-like
        True target values aligned with the rows of ``X``.
    estimator : object with a ``predict`` method, optional
        Model to evaluate. When omitted, the notebook-level ``model`` is
        used, which keeps existing three-argument calls working unchanged.

    Returns
    -------
    dict
        Keys "split", "MAE", "RMSE" and "R2" for the given split.
    """
    # Resolve the model explicitly so the function is not hard-wired to
    # whatever `model` happens to be bound to in the kernel.
    fitted = model if estimator is None else estimator
    preds = fitted.predict(X)
    return {
        "split": name,
        "MAE": mean_absolute_error(y, preds),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y, preds)),
        "R2": r2_score(y, preds),
    }

# Score the model on every split, in train / val / test order.
results = [
    eval_split(split_name, X_part, y_part)
    for split_name, X_part, y_part in (
        ("train", X_train, y_train),
        ("val", X_val, y_val),
        ("test", X_test, y_test),
    )
]

# One row per split, one column per metric.
pd.DataFrame(results)


Unnamed: 0,split,MAE,RMSE,R2
0,train,0.003105,0.005667,0.999523
1,val,0.003519,0.008216,0.999053
2,test,0.0035,0.008314,0.998934


In [None]:
import pandas as pd

# Pair each input column with the importance score the fitted model reports
# for it, ordered from most to least important. The original row labels
# (positions within feature_cols) are kept.
fi = (
    pd.DataFrame(
        {"feature": feature_cols, "importance": model.feature_importances_}
    )
    .sort_values(by="importance", ascending=False)
)

fi


Unnamed: 0,feature,importance
10,duration_s,0.461861
9,T_delta,0.23588
0,V_mean,0.151821
3,V_max,0.0663
5,dV_dt_mean,0.026238
4,V_range,0.024718
6,dV_dt_max,0.015968
2,V_min,0.008357
8,T_max,0.004311
1,V_std,0.004085


In [None]:
import joblib

# Persist the fitted model next to the notebook.
# NOTE(review): joblib/pickle artifacts are tied to the library versions that
# wrote them; XGBoost's own save_model format may be preferable for long-term
# storage — consider it if this file is meant to outlive the environment.
model_path = "baseline_xgb_soh_model.joblib"
joblib.dump(model, model_path)


['baseline_xgb_soh_model.joblib']