In [74]:
# Importing the functions
from lib.a_load import loader
from lib.b_clean_nan import nan_dropper
from lib.c_fill_means import nan_filler
from lib.d_transformer import *
from lib.ef_model import model

import sklearn
from sklearn.ensemble import RandomForestClassifier

## 1. Load and split the data

In [75]:
df = loader("sample_diabetes_mellitus_data.csv").load()
df

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,214826,118,68.0,22.732803,0,Caucasian,M,180.30,Floor,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,246060,81,77.0,27.421875,0,Caucasian,F,160.00,Floor,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,276985,118,25.0,31.952749,0,Caucasian,F,172.70,Emergency Department,Accident & Emergency,...,0,,0,0,0,0,0,0,0,0
3,262220,118,81.0,22.635548,1,Caucasian,F,165.10,Operating Room,Operating Room / Recovery,...,1,8.0,0,0,0,0,0,0,0,0
4,201746,33,19.0,,0,Caucasian,M,188.00,,Accident & Emergency,...,0,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,167042,140,,51.952814,0,Caucasian,F,137.20,Floor,Floor,...,0,7.0,0,0,0,0,0,0,0,0
9996,165009,89,34.0,26.110113,0,Caucasian,F,175.26,Emergency Department,Accident & Emergency,...,0,9.5,0,0,0,0,0,0,0,0
9997,234052,137,30.0,23.774241,0,Native American,F,157.48,Emergency Department,Accident & Emergency,...,0,,0,0,0,0,0,0,0,0
9998,249290,89,67.0,31.330708,0,Caucasian,F,157.48,Direct Admit,Accident & Emergency,...,0,6.4,0,0,0,0,0,0,0,0


In [76]:
train, test = loader("sample_diabetes_mellitus_data.csv").split()
print(f'{train.shape, test.shape}')

((7000, 52), (3000, 52))


## 2. Dropping missing values

In [77]:
train = nan_dropper(train).drop_nan(['age', 'gender', 'ethnicity'])
test = nan_dropper(test).drop_nan(['age', 'gender', 'ethnicity'])

## 3. Filling missing values

In [78]:
train = nan_filler(train).fill_means(['height', 'weight'])
test = nan_filler(test).fill_means(['height', 'weight'])

  self.df.loc[:,col].fillna(self.df.mean().iloc[0], inplace=True)
  self.df.loc[:,col].fillna(self.df.mean().iloc[0], inplace=True)


In [79]:
train.head()

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
3194,220344,118,87.0,18.820156,0,Caucasian,M,165.1,Emergency Department,Accident & Emergency,...,1,10.0,0,0,0,0,0,0,0,0
1507,226441,118,32.0,20.859034,0,Caucasian,M,175.3,Operating Room,Other Hospital,...,1,33.9,0,0,0,0,0,0,0,0
3438,162338,81,44.0,53.515625,1,Hispanic,F,160.0,Operating Room,Operating Room / Recovery,...,1,13.7,0,0,0,0,0,0,0,1
9707,249592,140,74.0,26.615539,0,Caucasian,F,170.2,Emergency Department,Accident & Emergency,...,0,,0,0,0,0,0,0,0,0
9547,184294,140,86.0,26.065657,0,Caucasian,M,175.3,Emergency Department,Accident & Emergency,...,0,,0,0,0,0,0,0,0,0


## 4. Transforming the data

In [80]:
# Consider the following snippet is replacing the data in the dataframe passed to the function
# If you declare these to a variable, it will end up being a none type
# E.g. train = log(train, ['age', 'bmi']).apply_transform() will result in a none type

log(train, ['wbc_apache', 'ventilated_apache']).apply_transform()
log(train, ['wbc_apache', 'ventilated_apache']).apply_transform()


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [81]:
train.head()

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
3194,220344,118,87.0,18.820156,0,Caucasian,M,165.1,Emergency Department,Accident & Emergency,...,-inf,0.0,0,0,0,0,0,0,0,0
1507,226441,118,32.0,20.859034,0,Caucasian,M,175.3,Operating Room,Other Hospital,...,-inf,0.184748,0,0,0,0,0,0,0,0
3438,162338,81,44.0,53.515625,1,Hispanic,F,160.0,Operating Room,Operating Room / Recovery,...,-inf,0.055654,0,0,0,0,0,0,0,1
9707,249592,140,74.0,26.615539,0,Caucasian,F,170.2,Emergency Department,Accident & Emergency,...,,,0,0,0,0,0,0,0,0
9547,184294,140,86.0,26.065657,0,Caucasian,M,175.3,Emergency Department,Accident & Emergency,...,,,0,0,0,0,0,0,0,0


In [82]:
# Consider the following snippet is replacing the data in the dataframe passed to the function
# If you declare these to a variable, it will end up being a none type
# E.g. train = levels(train, ['age', 'height']).apply_transform() will result in a none type

levels(train, ['wbc_apache', 'ventilated_apache']).apply_transform()
levels(test, ['wbc_apache', 'ventilated_apache']).apply_transform()

In [83]:
train.describe()

Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,weight,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
count,6553.0,6553.0,6553.0,5404.0,6553.0,6469.0,6553.0,6553.0,6553.0,5447.0,...,2022.0,5127.0,6553.0,6553.0,6553.0,6553.0,6553.0,6553.0,6553.0,6553.0
mean,212342.540211,103.529223,62.435678,30.018643,0.229666,170.159028,105.280787,0.634027,0.0,87.10278,...,0.0,1.00553,0.000305,0.019228,0.013429,0.046086,0.006867,0.005036,0.026553,0.236228
std,37906.369726,30.587139,16.62812,8.442535,0.42065,10.728466,16.972934,2.163882,0.0,25.760174,...,0.0,0.236125,0.017469,0.137335,0.115111,0.209687,0.082589,0.07079,0.160784,0.424796
min,147009.0,4.0,16.0,14.844926,0.0,137.2,82.0,-0.224306,0.0,38.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179371.0,83.0,53.0,24.238815,0.0,162.6,92.0,0.002083,0.0,68.8,...,0.0,0.857332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,212202.0,118.0,64.0,28.424843,0.0,170.2,99.0,0.010417,0.0,83.2,...,0.0,0.995635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,244881.0,118.0,75.0,34.074977,0.0,177.8,114.0,0.147222,0.0,101.05,...,0.0,1.167317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,279000.0,198.0,89.0,67.81499,1.0,195.59,171.0,49.523611,0.0,186.0,...,0.0,1.660865,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 5. Model

In [84]:
rf = RandomForestClassifier()

features = ['hepatic_failure', 'height', 'weight', 'elective_surgery', 'immunosuppression', 'leukemia','age','bmi']
target = ['diabetes_mellitus']

train = nan_dropper(train).drop_nan(features)
test = nan_dropper(test).drop_nan(features)

randomForest = model(train, rf, features, target)

  self.model_fit = self.model.fit(X_train, y_train)


In [88]:
test['prediction'] = randomForest.predict(test)[:,1]

In [91]:
# Last two columns of test dataframe
test.iloc[:,-2:]

Unnamed: 0,diabetes_mellitus,prediction
7878,0,0.11
3224,0,0.78
1919,0,0.05
4835,0,0.33
4895,0,0.08
...,...,...
9613,0,0.69
5268,0,0.04
6772,0,0.20
4048,0,0.13


## 6. Model evaluation (MISSING)

In [None]:
# ROC AUC score