In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Load the study workbook. Despite its name, `path_csv` points at an Excel
# file, which is why `read_excel` is used.
path_csv = '/home/paul/workspace/postdoc-year1/projects/calprotectin/data/Sepsis_Validation_FINAL DATASET_110925_SH.xlsx'
df = pd.read_excel(path_csv)

# Drop every row that is missing a value in any of these three columns (not
# only IL6). `.copy()` gives an independent frame so later edits to
# `df_cleaned` do not touch `df`.
required_columns = ['IL6_value_1', 'fever_sub_partu', 'b_streptococcus']
df_cleaned = df.dropna(subset=required_columns).copy()
df_cleaned.shape

(351, 58)

In [None]:

# --- 2. Define the new, expanded feature set ---
# Laboratory biomarker columns.
BIOMARKER_FEATURES = [
    'crp_value_1',
    'IL6_value_1',
    'Cal_value_1',
]
# Risk-factor columns; `breath_aid` is an example taken from the clinical
# status features.
RISK_FEATURES = ['fever_sub_partu', 'b_streptococcus', 'breath_aid']

# Model inputs: biomarkers first, then risk factors. This stays a list because
# it is used directly as a DataFrame column selector.
ALL_FEATURES = [*BIOMARKER_FEATURES, *RISK_FEATURES]

# Name of the outcome column the classifier predicts.
TARGET = 'diagnosis_infection'


In [4]:
# Two-stage stratified split of the cleaned data: first hold out 20% of the
# rows, then divide that hold-out in half into validation and test partitions.
# Both stages stratify on TARGET and use a fixed random_state so the
# partitions are reproducible. `train_test_split` is imported in the
# notebook's top import cell.
train_df, temp_df = train_test_split(
    df_cleaned,
    test_size=0.2,
    random_state=42,
    stratify=df_cleaned[TARGET],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df[TARGET],
)

# Sanity check: the three partitions must account for every cleaned row.
assert len(train_df) + len(val_df) + len(test_df) == len(df_cleaned)

# Feature matrices and target vectors for each partition. The validation pair
# is exposed alongside train/test so the validation rows can be used for model
# selection without touching the test set.
X_train = train_df[ALL_FEATURES]
y_train = train_df[TARGET]
X_val = val_df[ALL_FEATURES]
y_val = val_df[TARGET]
X_test = test_df[ALL_FEATURES]
y_test = test_df[TARGET]

In [5]:

# All selected features are numeric or binary (0/1), so one StandardScaler is
# applied to every column. The binary columns do not strictly need scaling,
# but scaling them does no harm and keeps the preprocessing simple.
# If a categorical feature such as `birth_mode` is added, it will need a
# OneHotEncoder as well.
preprocessor = StandardScaler()

# --- 4. Create the full model pipeline ---
# The pipeline bundles preprocessing and the model into a single object.
# class_weight='balanced' and a fixed random_state are passed to the classifier.
logreg = LogisticRegression(class_weight='balanced', random_state=42)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', logreg),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [6]:
# Fit the whole pipeline (scaler + classifier) on the training partition.
model_pipeline.fit(X_train, y_train)

# Evaluate on the unseen TEST set: hard class labels feed the classification
# report, and column 1 of predict_proba feeds the ROC AUC.
y_pred_test = model_pipeline.predict(X_test)
test_probabilities = model_pipeline.predict_proba(X_test)
y_pred_proba_test = test_probabilities[:, 1]

print("\nPerformance on the Test Set (Expanded Features):")
test_auc = roc_auc_score(y_test, y_pred_proba_test)
print(f"Test Set ROC AUC Score: {test_auc:.4f}")
print("\nClassification Report (Test Set):")
test_report = classification_report(y_test, y_pred_test)
print(test_report)


Performance on the Test Set (Expanded Features):
Test Set ROC AUC Score: 0.7418

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.77      0.80      0.78        25
           1       0.50      0.45      0.48        11

    accuracy                           0.69        36
   macro avg       0.63      0.63      0.63        36
weighted avg       0.69      0.69      0.69        36



In [7]:
# Tabulate the fitted classifier's coefficients (first row of `coef_`) against
# the feature names, ordered from largest to smallest signed value.
fitted_classifier = model_pipeline.named_steps['classifier']
coefficients = pd.DataFrame(
    data=fitted_classifier.coef_[0],
    index=ALL_FEATURES,
    columns=['Coefficient'],
).sort_values(by='Coefficient', ascending=False)

print("\n--- Expanded Model Feature Importance ---")
print(coefficients)


--- Expanded Model Feature Importance ---
                 Coefficient
IL6_value_1         3.134927
crp_value_1         1.140759
breath_aid          0.785831
Cal_value_1         0.368222
b_streptococcus     0.272385
fever_sub_partu    -0.011152
