In [8]:
from model import LinearModel, LogModel
from feature_transforming import AgeTransformer, GenderTransformer, EthnicityTransformer
from data_processing import NanFiller, NanDropper, DataLoader

from sklearn.metrics import roc_auc_score

# Load the CSV and obtain the two partitions returned by DataLoader.get_data.
# The literals are named so a reader can tune them in one place.
DATA_PATH = "data/data.csv"
TEST_SIZE = 0.3
RANDOM_STATE = 42  # fixed seed; presumably makes the split reproducible — confirm in DataLoader

data_loader = DataLoader(path=DATA_PATH)
train, test = data_loader.get_data(test_size=TEST_SIZE, random_state=RANDOM_STATE)

# NOTE(review): the preview below lists "Unnamed: 0.1" and "Unnamed: 0" columns,
# which typically appear when a CSV is written with its index; they are not used
# as features later in this notebook, but consider dropping them at load time.
train.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
9069,9069,234499,89,41.0,31.015625,0,Caucasian,F,160.0,Direct Admit,...,1,26.0,0,0,0,1,0,0,0,0
2603,2603,250833,118,66.0,,0,Caucasian,M,,Floor,...,0,6.6,0,0,0,0,0,0,0,0
7738,7738,202291,83,80.0,30.052518,1,Caucasian,M,185.4,,...,0,5.7,0,0,0,0,0,0,0,0
1579,1579,225216,118,70.0,23.436279,1,Caucasian,M,182.9,Operating Room,...,0,7.2,0,0,0,0,0,0,0,0
5058,5058,224030,83,46.0,,0,Caucasian,F,172.7,Direct Admit,...,0,,0,0,0,0,0,0,0,0


In [9]:
# Step 1: run the NanDropper configured for the demographic columns over both
# splits (train first, then test — same call order as before).
drop_na_processor = NanDropper(columns=["age", "gender", "ethnicity"])
train, test = [drop_na_processor.process(split) for split in (train, test)]

# Step 2: run the NanFiller configured for the body-measurement columns over
# both splits.
# NOTE(review): the variable name suggests a mean fill. If NanFiller computes
# the statistic from whatever frame it is given, the test split is filled with
# its own statistic rather than the training one — confirm in data_processing.
fill_mean_processor = NanFiller(columns=["height", "bmi"])
train, test = [fill_mean_processor.process(split) for split in (train, test)]

In [10]:
# Instantiate the three feature transformers (constructed in the same order
# as they are applied).
gender_transformer = GenderTransformer()
age_transformer = AgeTransformer()
ethnicity_transformer = EthnicityTransformer()

# Application order: gender -> age -> ethnicity, each step receiving the
# output of the previous one.
transform_steps = (gender_transformer, age_transformer, ethnicity_transformer)

# Train split goes through the full chain first ...
train_data_transformed = train
for step in transform_steps:
    train_data_transformed = step.transform(train_data_transformed)

# ... then the test split goes through the very same transformer instances.
test_data_transformed = test
for step in transform_steps:
    test_data_transformed = step.transform(test_data_transformed)

In [11]:
# Columns fed to the model and the label it is trained against.
feature_columns = [
    "age",
    "bmi",
    "gender_numeric",
    "age_category",
    "ethnicity_group",
]
target_column = "diabetes_mellitus"

# Build a LogModel on those columns and train it on the transformed train split.
model = LogModel(
    feature_columns=feature_columns,
    target_column=target_column,
)
model.train(train_data_transformed)

In [12]:
# Score the transformed test split with the model trained in the previous cell
# and show the raw scores.
test_probabilities = model.predict(test_data_transformed)
print(test_probabilities)

# Keep the scores alongside the rows they were computed for.
test_data_transformed["predictions"] = test_probabilities

# Compare the scores with the true labels via ROC AUC.
y_true = test_data_transformed[target_column]
roc_auc = roc_auc_score(y_true=y_true, y_score=test_probabilities)
print(f"ROC AUC: {roc_auc}")

[0.32382446 0.25431554 0.1350916  ... 0.27008833 0.20330619 0.48219773]
ROC AUC: 0.6709425865428702


In [13]:
# Same feature set and label as the LogModel run above, restated so this cell
# shows its own configuration.
feature_columns = [
    "age",
    "bmi",
    "gender_numeric",
    "age_category",
    "ethnicity_group",
]
target_column = "diabetes_mellitus"

# Rebinds `model`: from here on it refers to the LinearModel, not the LogModel.
model = LinearModel(
    feature_columns=feature_columns,
    target_column=target_column,
)
model.train(train_data_transformed)

In [14]:
# Score the transformed test split with the LinearModel trained in the previous
# cell and show the raw scores.
# NOTE(review): the variable keeps the name `test_probabilities`; whether
# LinearModel.predict returns values bounded to [0, 1] cannot be confirmed here.
test_probabilities = model.predict(test_data_transformed)
print(test_probabilities)

# This overwrites the "predictions" column written by the LogModel evaluation
# cell earlier in the notebook.
test_data_transformed["predictions"] = test_probabilities

# Compare the scores with the true labels via ROC AUC.
y_true = test_data_transformed[target_column]
roc_auc = roc_auc_score(y_true=y_true, y_score=test_probabilities)
print(f"ROC AUC: {roc_auc}")

[0.32709779 0.26019847 0.12758412 ... 0.28960227 0.20792064 0.4531858 ]
ROC AUC: 0.6694151908528199
