In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from src.data_loader import DataLoader
from src.preprocessor_dropna import DropNaPreprocessor
from src.preprocessor_fillmean import FillMeanPreprocessor
from src.features import BMIFeature, AgeGroupFeature
from src.model import Model

In [8]:
# Load the sample CSV through the project DataLoader and unpack the two
# frames it hands back as the train / test split.
# NOTE(review): split ratio and any random seeding live inside
# DataLoader.split_data — confirm the split is reproducible across runs.
loader = DataLoader("data/sample_diabetes_mellitus_data.csv")
split_frames = loader.split_data()
train_df, test_df = split_frames

# Sanity check: report the container type of each split.
print("After split:", *map(type, (train_df, test_df)))




After split: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [None]:
# Preprocessing: push both splits through each preprocessor, train first and
# test second, DropNa before FillMean.
# What each step actually does is defined in src.preprocessor_dropna and
# src.preprocessor_fillmean; presumably DropNa discards rows with missing
# values and FillMean imputes column means — confirm against those modules.
dropper = DropNaPreprocessor()
train_df, test_df = map(dropper.process, (train_df, test_df))

# NOTE(review): each split is passed to process() on its own; if FillMean
# derives its statistics per call, the test split is filled from its own
# values rather than the training ones — verify this is intended.
filler = FillMeanPreprocessor()
train_df, test_df = map(filler.process, (train_df, test_df))

# Sanity check: both splits should still be DataFrames after the two steps.
print("After preprocessing:", type(train_df), type(test_df))



After preprocessing: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [10]:
# NOTE(review): this cell repeats the DropNa / FillMean steps of the
# preprocessing cell above on the already-processed frames. It is presumably
# a leftover copy and a no-op if both preprocessors are idempotent — confirm,
# then delete the cell.
dropper = DropNaPreprocessor()
train_df, test_df = dropper.process(train_df), dropper.process(test_df)

filler = FillMeanPreprocessor()
train_df, test_df = filler.process(train_df), filler.process(test_df)

In [None]:
# Feature engineering: run the BMI and age-group transformers from
# src.features over both splits.
bmi_feat = BMIFeature()
age_feat = AgeGroupFeature()

# Same order for each split: BMI first, then age group on the result.
train_df = age_feat.transform(bmi_feat.transform(train_df))
test_df = age_feat.transform(bmi_feat.transform(test_df))

# NOTE(review): the recorded output lists a lowercase 'bmi' column alongside
# the trailing 'BMI' column — confirm that keeping both is intended.
print("Columns now:", train_df.columns.to_list())


Columns now: ['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'diabetes_mellitus', 'BMI', 'age_group']


In [12]:
# Train the project Model wrapper on the training split and score the test
# split. max_iter is forwarded to Model as-is (presumably an iteration cap
# for the underlying estimator — confirm in src/model.py).
feature_cols = ["BMI", "age"]
target_col = "diabetes_mellitus"

model = Model(feature_cols=feature_cols, target_col=target_col, max_iter=1000)
model.train(train_df)

# Attach the scores by building a new frame instead of writing the column
# into test_df in place: the cell then never assigns into a frame that may
# be a slice of the loaded data, and re-running it gives the same result.
test_df = test_df.assign(predictions=model.predict(test_df))

# Preview the model inputs next to their scores. The column list is derived
# from feature_cols so it cannot drift from what the model was trained on.
print(test_df[[*feature_cols, "predictions"]].head())


            BMI   age  predictions
6252  34.852863  66.0     0.299216
1731  28.215375  83.0     0.283543
4742  21.575502  64.0     0.142086
4521  34.560280  21.0     0.134153
6340  40.374334  66.0     0.383298


In [13]:
# Evaluate the test-split scores against the true labels with ROC-AUC.
y_true = test_df[target_col]
y_score = test_df["predictions"]

auc = roc_auc_score(y_true=y_true, y_score=y_score)
print("ROC-AUC score:", auc)


ROC-AUC score: 0.6531516057375493
