Setup (paths, imports, seed)

In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import numpy as np
import pandas as pd

from src.config import get_paths, set_seed
from src.io import read_table, save_table
from src.modeling import (
    eval_stratifiedkfold_multiclass_lr,
    eval_stratifiedkfold_multiclass_lgbm,
    eval_kfold_regression_elasticnet,
    eval_kfold_regression_lgbm,
)

paths = get_paths()
set_seed(42)

LABELS_ORDER = ["GC", "G2", "G5", "G7", "G14"]

Load processed datasets

In [2]:
X = pd.read_csv(paths.processed / "X_radiomics.csv")
y_group = pd.read_csv(paths.processed / "y_group.csv")["group"]
y_days = pd.read_csv(paths.processed / "y_days.csv")["days_post_btx"]

# Safety checks
assert len(X) == len(y_group) == len(y_days), "X/y size mismatch"
assert X.shape[1] > 0, "X has no features"

print("Loaded datasets:")
print(f"- X_radiomics: {X.shape}")
print(f"- y_group:     {y_group.shape} | unique={sorted(y_group.unique().tolist())}")
print(f"- y_days:      {y_days.shape}  | min={y_days.min()} max={y_days.max()}")

Loaded datasets:
- X_radiomics: (571, 105)
- y_group:     (571,) | unique=['G14', 'G2', 'G5', 'G7', 'GC']
- y_days:      (571,)  | min=0 max=14


ERAT classification: Logistic Regression (multinomial)

In [4]:
df_folds_lr, summary_lr, cm_lr, oof_lr = eval_stratifiedkfold_multiclass_lr(
    X=X,
    y=y_group,
    labels_order=LABELS_ORDER,
    n_splits=5,
    seed=42,
)

display(df_folds_lr)
display(summary_lr)
display(cm_lr)
oof_lr.head()

Unnamed: 0,fold,n_test,balanced_accuracy,weighted_kappa_quadratic,mae_days
0,1,115,0.745606,0.825624,1.173913
1,2,114,0.802273,0.909654,0.675439
2,3,114,0.833333,0.852771,0.526316
3,4,114,0.831818,0.87914,0.596491
4,5,114,0.823485,0.811837,0.631579


Unnamed: 0,metric,mean,std
0,balanced_accuracy,0.807303,0.036648
1,weighted_kappa_quadratic,0.855805,0.039678
2,mae_days,0.720748,0.259136


Unnamed: 0,GC,G2,G5,G7,G14
GC,103,8,3,6,0
G2,7,80,17,6,0
G5,9,3,104,5,0
G7,9,15,21,65,0
G14,0,0,0,0,110


Unnamed: 0,y_true,y_pred
0,GC,GC
1,GC,GC
2,GC,GC
3,GC,GC
4,GC,GC


In [5]:
save_table(oof_lr, paths.results / "tables" / "erat_cls_lr_oof_predictions.csv")

WindowsPath('c:/Users/modre/Documents/masseter/results/tables/erat_cls_lr_oof_predictions.csv')

ERAT classification: LightGBM

In [6]:
df_folds_lgbm, summary_lgbm, cm_lgbm, oof_lgbm = eval_stratifiedkfold_multiclass_lgbm(
    X=X,
    y=y_group,
    labels_order=LABELS_ORDER,
    n_splits=5,
    seed=42,
    n_estimators=500,
)

display(df_folds_lgbm)
display(summary_lgbm)
display(cm_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15349
[LightGBM] [Info] Number of data points in the train set: 456, number of used features: 104
[LightGBM] [Info] Start training from score -1.645156
[LightGBM] [Info] Start training from score -1.645156
[LightGBM] [Info] Start training from score -1.558145
[LightGBM] [Info] Start training from score -1.645156
[LightGBM] [Info] Start training from score -1.558145
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000811 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15432
[LightGBM] [Info] Number of data points in the train set: 457, number of used features: 104
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000722 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15441
[LightGBM] [Info] Number of data points in the train set: 457, number of used features: 104
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.549972
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.560335








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15439
[LightGBM] [Info] Number of data points in the train set: 457, number of used features: 104
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.549972
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.560335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15440
[LightGBM] [Info] Number of data points in the train set: 457, number of used features: 104
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score -1.647347
[LightGBM] [Info] Start training from score 







ValueError: not enough values to unpack (expected 4, got 3)