In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score


In [12]:
df = pd.read_excel("CSI_7_MAL_2526_Data.xlsx")

# Keep only rows where severity exists
df = df.dropna(subset=["Severity Score"])

pat_id = df["PatID"]                 # grouping variable for GroupKFold
y = df["Severity Score"].astype(int) # 0..5
X = df.drop(columns=["PatID", "Severity Score"])


In [3]:
y_bin = (y >= 3).astype(int)

In [4]:
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=5000))
])


In [5]:
gkf = GroupKFold(n_splits=5)

acc_scores = []
sens_scores = []
spec_scores = []
auc_scores = []

for train_idx, test_idx in gkf.split(X, y_bin, groups=pat_id):
    X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
    y_tr, y_te = y_bin.iloc[train_idx], y_bin.iloc[test_idx]

    pipe.fit(X_tr, y_tr)                      # TRAIN
    prob = pipe.predict_proba(X_te)[:, 1]     # probability of class 1 (inflamed)
    pred = (prob >= 0.5).astype(int)          # convert probability to 0/1


In [6]:
    # Accuracy
    acc_scores.append(accuracy_score(y_te, pred))

    # Confusion matrix -> tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_te, pred).ravel()

    # Sensitivity (TPR)
    sens = tp / (tp + fn) if (tp + fn) else np.nan
    sens_scores.append(sens)

    # Specificity (TNR)
    spec = tn / (tn + fp) if (tn + fp) else np.nan
    spec_scores.append(spec)

    # ROC-AUC uses probabilities, not hard labels
    auc_scores.append(roc_auc_score(y_te, prob))


In [7]:
print("Logistic Regression (Binary <3 vs ≥3) with GroupKFold")
print(f"Accuracy:    {np.mean(acc_scores):.3f} ± {np.std(acc_scores):.3f}")
print(f"Sensitivity: {np.mean(sens_scores):.3f} ± {np.std(sens_scores):.3f}")
print(f"Specificity: {np.mean(spec_scores):.3f} ± {np.std(spec_scores):.3f}")
print(f"ROC-AUC:     {np.mean(auc_scores):.3f} ± {np.std(auc_scores):.3f}")


Logistic Regression (Binary <3 vs ≥3) with GroupKFold
Accuracy:    0.919 ± 0.000
Sensitivity: 0.912 ± 0.000
Specificity: 0.929 ± 0.000
ROC-AUC:     0.985 ± 0.000


In [9]:
y_bin.value_counts(normalize=True)


Severity Score
1    0.617647
0    0.382353
Name: proportion, dtype: float64