In [133]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [134]:
path_csv = '/home/paul/workspace/postdoc-year1/projects/calprotectin/data/Sepsis_Validation_FINAL DATASET_110925_SH.xlsx'
df = pd.read_excel(path_csv)
# drop the one NaN from IL6
# df_cleaned = df.dropna(subset=['IL6_value_1', 'fever_sub_partu', 'b_streptococcus']).copy()
# df_cleaned.shape

In [135]:
df.shape

(355, 58)

In [136]:
keys = ['birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity', 'birth_mode', 'umbilical_cord_ph',
       'capillary_time', 'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'antibiotic_therapy', 'antibiotic_therapy_duration',
       'diagnosis_infection', 'result_blood_culture', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'crp_max_prepartal',
       'leukocyte_max_prepartal', 'fever_sub_partu', 'antibiotics_prepartal',
       'crp_time_1', 'crp_value_1', 'crp_time_2', 'crp_value_2', 'crp_time_3',
       'crp_value_3', 'crp_time_4', 'crp_value_4', 'crp_time_5', 'crp_value_5',
       'IL6_time_1', 'IL6_value_1', 'Cal_time_1', 'Cal_value_1', 'Cal_time_2',
       'Cal_value_2', 'Cal_time_3', 'Cal_value_3', 'Cal_time_4', 'Cal_value_4',
       'Cal_time_5', 'Cal_value_5']

In [137]:
null_counts = df[keys].isnull().sum()

# Filter to show only columns with missing values
# null_counts = null_counts[null_counts > 0]

print("Columns with missing values:")
print(null_counts)

Columns with missing values:
birth_weight                     0
gest_weeks                       1
age_mother                       0
gravidity                        3
parity                           2
birth_mode                       0
umbilical_cord_ph                3
capillary_time                  12
o2_demand                        0
breath_aid                       0
heart_rate                       1
respiration_rate                 5
rr_systolic                      4
rr_diastolic                     4
base_excess                     17
ph_value                        13
antibiotic_therapy               0
antibiotic_therapy_duration    115
diagnosis_infection              0
result_blood_culture            29
gestation_diabetes               2
diabetes_type_1_2                2
adiposity                        2
early_membrane_rupture           2
membrane_rupture_hours         227
early_labor_pain                 3
green_amniotic_liquor            3
b_streptococcus           

In [138]:
keys_cleaned = ['birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph',
       'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'antibiotic_therapy',
       'diagnosis_infection', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus',
       'fever_sub_partu', 'antibiotics_prepartal',
       'crp_value_1', 'IL6_time_1', 'IL6_value_1', 'Cal_value_1']

In [139]:
df_cleaned = df.dropna(subset=keys_cleaned).copy()
df_cleaned.shape

(324, 58)

In [140]:
features_clinical = ['birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph',
       'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus',
       'fever_sub_partu', 'antibiotics_prepartal']
features_biomarkers_classic = ['crp_value_1', 'IL6_value_1']
feature_cal = ['Cal_value_1']
target = 'diagnosis_infection'
all_features = features_clinical + feature_cal

In [141]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42, stratify=df_cleaned[target])
X_train = train_df[all_features]
y_train = train_df[target]
X_test = test_df[all_features]
y_test = test_df[target]

In [154]:
preprocessor = StandardScaler()

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=32, max_depth=6, learning_rate=1, objective='binary:logistic'))
    # ('classifier', LogisticRegression(random_state=42, class_weight='balanced'))
])
# model = XGBClassifier(n_estimators=32, max_depth=2, learning_rate=1, objective='binary:logistic')

In [155]:
model_pipeline.fit(X_train, y_train)

# Evaluate on the unseen TEST set
y_pred_test = model_pipeline.predict(X_test)
y_pred_proba_test = model_pipeline.predict_proba(X_test)[:, 1]

print("\nPerformance on the Test Set (Expanded Features):")
print(f"Test Set ROC AUC Score: {roc_auc_score(y_test, y_pred_proba_test):.4f}")
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))


Performance on the Test Set (Expanded Features):
Test Set ROC AUC Score: 0.8222

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        45
           1       0.69      0.45      0.55        20

    accuracy                           0.77        65
   macro avg       0.74      0.68      0.70        65
weighted avg       0.76      0.77      0.75        65



In [144]:
# sorted_idx = model_pipeline.named_steps['classifier'].feature_importances_.argsort()
# plt.barh(all_features, model_pipeline.named_steps['classifier'].feature_importances_[sorted_idx])

In [145]:
coefficients = pd.DataFrame(
    model_pipeline.named_steps['classifier'].coef_[0], 
    index=all_features, 
    columns=['Coefficient']
)
coefficients.sort_values('Coefficient', ascending=False, inplace=True)

print("\n--- Expanded Model Feature Importance ---")
print(coefficients)

AttributeError: Coefficients are not defined for Booster type None

In [None]:
print("Confusion Matrix (Test Set):")
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='Blues')
plt.show()

Confusion Matrix (Test Set):


ValueError: feature_names mismatch: ['birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph', 'o2_demand', 'breath_aid', 'heart_rate', 'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess', 'ph_value', 'gestation_diabetes', 'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture', 'early_labor_pain', 'green_amniotic_liquor', 'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal', 'crp_value_1', 'IL6_value_1', 'Cal_value_1'] ['birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph', 'o2_demand', 'breath_aid', 'heart_rate', 'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess', 'ph_value', 'gestation_diabetes', 'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture', 'early_labor_pain', 'green_amniotic_liquor', 'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal', 'Cal_value_1']
expected IL6_value_1, crp_value_1 in input data