In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("canonical_checkpoint_features.parquet")

df = df.sort_values(["asset_id", "cycle_id"]).reset_index(drop=True)

df.head()


Unnamed: 0,asset_id,cycle_id,V_mean,V_std,V_min,V_max,V_range,dV_dt_mean,dV_dt_max,T_mean,T_max,T_delta,Q_total,duration_s,source
0,0,0,3.735076,0.245422,2.699819,4.191235,1.491416,-35.948592,38.111247,40.41934,41.174809,1.273163,,0.04174,oxford
1,0,1,3.735681,0.244972,2.699859,4.192679,1.492819,-36.342517,34.65244,40.327115,41.124866,1.323108,,0.041245,oxford
2,0,10,3.739525,0.240002,2.699924,4.192582,1.492658,-38.02239,41.584546,40.363359,41.174809,1.335576,,0.039406,oxford
3,0,11,3.739852,0.239651,2.699803,4.192502,1.492699,-38.328526,34.646588,40.433203,41.262188,1.385513,,0.03923,oxford
4,0,12,3.740231,0.238893,2.699924,4.192462,1.492538,-38.309224,34.655228,40.39325,41.212246,1.360546,,0.03906,oxford


In [None]:
BASELINE_FRAC = 0.15

def compute_baseline(group):
    n = len(group)
    k = max(5, int(n * BASELINE_FRAC))
    base = group.iloc[:k]
    return pd.Series({
        "V_range_base": base["V_range"].median(),
        "dV_dt_base": base["dV_dt_max"].median(),
        "T_delta_base": base["T_delta"].median(),
        "duration_base": base["duration_s"].median(),
    })

baselines = (
    df.groupby("asset_id", group_keys=False)
      .apply(compute_baseline)
      .reset_index()
)

baselines


  .apply(compute_baseline)


Unnamed: 0,asset_id,V_range_base,dV_dt_base,T_delta_base,duration_base
0,0,1.492217,34.655228,1.335614,0.038896
1,1,1.492066,38.088114,1.372467,0.038845
2,1.RFUD_RW9_10_11_12,0.177,0.0,0.52365,117.22
3,2,1.49232,38.111247,0.923679,0.039099
4,3,1.492073,34.663868,1.348118,0.038701
5,4,1.492678,36.340037,1.514893,0.039107
6,5,1.492312,38.128527,1.541851,0.03891
7,6,1.493034,38.085327,1.497013,0.038971
8,7,1.492422,38.093967,1.709602,0.038722
9,RFUD_RW1_2_7_8,1.0,0.852,7.17962,1548.0


In [None]:
df = df.merge(baselines, on="asset_id", how="left")

df["use_dV_dt"] = df["dV_dt_base"] > 0

df[["asset_id", "use_dV_dt"]].drop_duplicates()


Unnamed: 0,asset_id,use_dV_dt
0,0,True
78,1,True
151,1.RFUD_RW9_10_11_12,False
111470,2,True
111546,3,True
111593,4,True
111639,5,True
111685,6,True
111762,7,True
111838,RFUD_RW1_2_7_8,True


In [None]:
EPS = 1e-6

df["V_range_norm"] = np.log1p(df["V_range"] / (df["V_range_base"] + EPS))
df["T_delta_norm"] = np.log1p(df["T_delta"] / (df["T_delta_base"] + EPS))
df["duration_norm"] = np.log1p(df["duration_s"] / (df["duration_base"] + EPS))

df["dV_dt_norm"] = np.where(
    df["use_dV_dt"],
    np.log1p(df["dV_dt_max"] / (df["dV_dt_base"] + EPS)),
    np.nan
)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
norm_cols = [
    "V_range_norm",
    "T_delta_norm",
    "duration_norm",
    "dV_dt_norm"
]

df["Degradation_Index"] = df[norm_cols].mean(axis=1, skipna=True)

df["SOH_proxy"] = np.exp(-df["Degradation_Index"])
df["SOH_proxy"] = df["SOH_proxy"].clip(0.2, 1.0)


In [None]:
df["SOH_proxy"].describe()


Unnamed: 0,SOH_proxy
count,118770.0
mean,0.573955
std,0.260061
min,0.2
25%,0.367153
50%,0.476687
75%,0.83921
max,1.0


In [None]:
df[["SOH_proxy", "V_range", "T_delta", "duration_s"]].corr()


Unnamed: 0,SOH_proxy,V_range,T_delta,duration_s
SOH_proxy,1.0,-0.486502,-0.284929,-0.04676
V_range,-0.486502,1.0,0.628633,0.092278
T_delta,-0.284929,0.628633,1.0,0.146448
duration_s,-0.04676,0.092278,0.146448,1.0


In [None]:
df.groupby("asset_id").apply(
    lambda g: g["SOH_proxy"].corr(
        pd.Series(range(len(g)), index=g.index)
    )
).describe()


  df.groupby("asset_id").apply(


Unnamed: 0,0
count,11.0
mean,0.177278
std,0.173675
min,-0.11441
25%,0.055475
50%,0.122636
75%,0.34686
max,0.417518


In [None]:
df.to_parquet(
    "canonical_checkpoint_features_with_soh.parquet",
    index=False
)


In [None]:
/content/canonical_checkpoint_features_with_soh.parquet