In [None]:
# Load libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

# Load the two WiDS 2021 CSV files (TrainingWiDS2021 and UnlabeledWiDS2021)
# from GitHub: the first becomes df, the second becomes test_df.

url_train = 'https://github.com/areeratk/Diabetes_Mellitus/blob/main/Resources/TrainingWiDS2021.csv?raw=true'
url_test = 'https://github.com/areeratk/Diabetes_Mellitus/blob/main/Resources/UnlabeledWiDS2021.csv?raw=true'
df, test_df = (pd.read_csv(source) for source in (url_train, url_test))

# Show the (rows, columns) shape of each frame, one per line:
# the worksheet frame df first, then the predicting frame test_df.
print(df.shape, test_df.shape, sep='\n')

# Clean the worksheet frame (df) and the predicting frame (test_df) with ONE shared
# routine, so both frames always receive identically-defined columns.

# String-valued columns that are replaced by an integer '<name>_encode' column.
CATEGORICAL_COLS = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']

# Measurements stored as a '<stem>_max' / '<stem>_min' pair; each pair is collapsed
# into a single '<stem>_diff' (max - min) column to reduce missing data and the high
# correlation among predictors.
RANGE_COLS = [
    'd1_diasbp_invasive', 'd1_diasbp', 'd1_heartrate', 'd1_mbp', 'd1_spo2', 'd1_sysbp',
    'd1_calcium', 'd1_inr', 'd1_temp',
    'h1_diasbp', 'h1_heartrate', 'h1_mbp', 'h1_resprate', 'h1_spo2', 'h1_sysbp',
]


def fit_category_levels(frame, columns=CATEGORICAL_COLS):
    """Learn the integer coding of each categorical column from ``frame``.

    Returns a dict mapping column name -> pandas Index of the distinct non-null
    values in order of first appearance (the ``uniques`` part of ``pd.factorize``).
    A value's position in that Index is its integer code.
    """
    return {col: pd.factorize(frame[col])[1] for col in columns}


def preprocess(frame, category_levels):
    """Return a cleaned copy of ``frame``; the input frame is not modified.

    Parameters
    ----------
    frame : DataFrame holding the raw columns named in CATEGORICAL_COLS and
        RANGE_COLS, the three gcs_*_apache columns, plus 'weight', 'height', 'bmi'.
    category_levels : dict produced by ``fit_category_levels``. Passing the SAME
        dict for every frame guarantees that a given category string maps to the
        same integer everywhere; values absent from the fitted levels (including
        missing values) are coded -1.

    Returns
    -------
    DataFrame with the engineered columns appended, the raw source columns
    dropped, and every remaining missing value replaced by 0.
    """
    out = frame.copy()

    # Encode string variables as new columns using the shared level tables.
    for col in CATEGORICAL_COLS:
        out[col + '_encode'] = category_levels[col].get_indexer(out[col])

    # Create new variables to solve missing data and high correlation among predictors issues.
    for stem in RANGE_COLS:
        out[stem + '_diff'] = out[stem + '_max'] - out[stem + '_min']
    out['gcs_sum'] = out['gcs_eyes_apache'] + out['gcs_motor_apache'] + out['gcs_verbal_apache']
    # NOTE(review): the 10000 factor assumes height in cm and weight in kg -- TODO confirm.
    out['bmi_calculated'] = 10000 * out['weight'] / (out['height'] ** 2)

    # Drop the original variables now that the derived columns exist.
    raw_cols = [stem + suffix for stem in RANGE_COLS for suffix in ('_max', '_min')]
    raw_cols += ['gcs_eyes_apache', 'gcs_motor_apache', 'gcs_verbal_apache']
    raw_cols += CATEGORICAL_COLS + ['bmi']
    out = out.drop(columns=raw_cols)

    # Impute remaining missing values with zeroes.
    return out.fillna(0)


# Learn the category codes from the worksheet frame only, then apply the identical
# cleaning to both frames. (Factorizing each frame separately, as was done before,
# numbers categories by order of first appearance in EACH file, so the same string
# could receive different codes in df and test_df.)
category_levels = fit_category_levels(df)
df = preprocess(df, category_levels)
test_df = preprocess(test_df, category_levels)

# Display the cleaned test_df dataframe, ready for prediction by a selected machine learning model
test_df


(130157, 181)
(10234, 180)


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_unable_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_noninvasive_max,...,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,ethnicity_encode,gender_encode,hospital_admit_source_encode,icu_admit_source_encode,icu_stay_type_encode,icu_type_encode,d1_diasbp_invasive_diff,d1_diasbp_diff,d1_heartrate_diff,d1_mbp_diff,d1_spo2_diff,d1_sysbp_diff,d1_calcium_diff,d1_inr_diff,d1_temp_diff,h1_diasbp_diff,h1_heartrate_diff,h1_mbp_diff,h1_resprate_diff,h1_spo2_diff,h1_sysbp_diff,gcs_sum,bmi_calculated
0,1,144740,10141,72,0,152.4,82,0.015278,0,0.0,2.8,110.0,104.01,0,0,1.9,44.0,1.49,0.0,0.0,97.0,38.0,39.9,0,54.0,0.0,0.0,0.0,0.0,31.0,130.0,36.4,0.0,0,5.4,104.0,40.0,0.0,0.0,123.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,64.0,28.0,69.0,4.0,73.0,0.5,0.0,3.1,0.0,2.0,0.0,0.0,1.0,0.0,15.0,0.000000
1,2,141990,10141,86,0,175.3,82,0.000000,0,0.0,0.0,117.0,106.01,0,0,0.0,19.0,0.92,0.0,0.0,73.0,116.0,0.0,0,41.0,0.0,0.0,0.0,0.0,53.0,142.0,36.3,0.0,0,0.0,101.0,27.0,0.0,0.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,74.0,60.0,88.0,5.0,116.0,0.0,0.0,0.4,20.0,12.0,35.0,7.0,2.0,62.0,15.0,0.000000
2,3,142038,10141,72,0,162.6,82,0.003472,0,0.0,3.2,302.0,109.01,0,0,0.9,24.0,0.77,0.0,0.0,84.0,104.0,34.8,0,117.0,0.0,0.0,0.0,0.0,38.0,137.0,36.4,0.0,0,4.6,97.0,61.0,0.0,0.0,117.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,36.0,30.0,40.0,14.0,43.0,0.6,0.5,1.0,0.0,6.0,0.0,0.0,3.0,0.0,15.0,0.000000
3,4,138628,10141,66,0,177.8,82,0.884028,0,0.0,0.0,113.0,501.05,0,0,0.0,0.0,0.00,0.0,0.0,0.0,110.0,0.0,0,61.0,0.0,0.0,0.0,0.0,56.0,0.0,36.6,0.0,1,0.0,77.0,51.0,0.0,0.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.0,26.0,24.0,49.0,19.0,50.0,0.0,0.0,1.2,0.0,12.0,0.0,28.0,5.0,0.0,14.0,0.000000
4,5,141682,10141,89,0,170.2,82,0.013194,0,0.0,3.4,117.0,106.01,0,0,0.4,26.0,1.50,0.0,0.0,99.0,34.0,33.0,0,136.0,0.0,0.0,0.0,0.0,35.0,133.0,36.5,0.0,0,5.2,88.0,49.0,0.0,0.0,136.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,2,0,0,0,0.0,39.0,24.0,65.0,7.0,80.0,0.2,0.0,0.9,4.0,4.0,13.0,6.0,3.0,3.0,15.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10229,10230,143750,10140,36,0,170.1,1108,1.696528,0,108.6,0.0,305.0,901.02,0,0,0.0,0.0,0.00,0.0,0.0,0.0,111.0,29.0,0,127.0,0.0,0.0,0.0,0.0,45.0,0.0,36.5,2284.0,0,7.2,98.0,68.0,0.0,0.0,127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,30.0,43.0,35.0,3.0,48.0,0.0,0.0,0.3,4.0,17.0,3.0,0.0,0.0,4.0,14.0,37.533684
10230,10231,143813,10140,61,0,160.0,1108,0.033333,0,82.3,0.0,124.0,305.02,0,0,0.0,33.0,1.15,0.0,0.0,94.0,106.0,27.0,0,166.0,0.0,0.0,0.0,0.0,49.0,139.0,36.7,6911.0,0,11.2,116.0,56.0,0.0,0.0,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,60.0,37.0,85.0,5.0,102.6,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,15.0,32.148438
10231,10232,137126,10140,74,0,165.1,1108,0.757639,0,62.0,0.0,113.0,501.06,0,0,0.0,0.0,0.00,0.0,0.0,0.0,47.0,0.0,0,113.0,0.0,0.0,0.0,0.0,41.0,0.0,36.6,4495.0,0,0.0,82.0,49.0,0.0,0.0,118.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,8,1,0,0,0.0,33.0,30.0,51.0,7.0,88.0,0.0,0.0,0.4,7.0,9.0,7.0,3.0,3.0,10.0,15.0,22.745608
10232,10233,135652,10140,90,0,160.0,1108,0.087500,0,50.9,0.0,108.0,203.01,0,0,0.0,0.0,0.00,0.0,0.0,0.0,94.0,0.0,0,104.0,0.0,0.0,0.0,0.0,54.0,0.0,36.1,5893.0,0,0.0,70.0,57.0,0.0,0.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,13.0,68.0,25.0,4.0,57.0,0.0,0.0,0.6,6.0,3.0,6.0,2.0,0.0,0.0,14.0,19.882812


In [None]:
# Split the worksheet dataframe, df, into model features (X) and the target variable (y).
feature_cols = [
 'age',
 'bmi_calculated',
 'apache_2_diagnosis',
 'apache_3j_diagnosis',
 'gcs_sum',
 'heart_rate_apache',
 'map_apache',
 'resprate_apache',
 'temp_apache',
 'd1_diasbp_diff',
 'd1_heartrate_diff',
 'd1_mbp_diff',
 'd1_resprate_max',
 'd1_spo2_diff',
 'd1_sysbp_diff',
 'd1_temp_diff',
 'd1_bun_max',
 'd1_calcium_diff',
 'd1_glucose_max',
 'd1_hco3_max',
 'd1_platelets_max',
 'd1_potassium_min',
 'd1_sodium_min',
 'h1_glucose_max',
 ]
X = df[feature_cols] # Features
y = df.diabetes_mellitus # Target variable

# The predicting dataframe, test_df, must supply exactly the same features in the
# same order as X. The list is derived from feature_cols rather than typed out a
# second time, so the two can never drift apart; the name test_feature_cols is kept
# in case later cells refer to it.
test_feature_cols = list(feature_cols)
Xt = test_df[test_feature_cols] # Features for the test_df dataframe

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

X_train.shape

(91109, 24)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from matplotlib import pyplot
from numpy import mean
from numpy import std
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Fit the chosen gradient-boosting configuration on the training split.
# subsample < 1 and max_features < number of features both make the fit randomised,
# so random_state is pinned (same seed as the train/test split) to make the metrics
# and the submission predictions reproducible from a fresh kernel.
bestmodel = GradientBoostingClassifier(n_estimators=100, subsample = 0.6, max_features=11, learning_rate = 0.1, max_depth = 7, random_state=1)
bestmodel.fit(X_train,y_train)

# Evaluate on the held-out 30% split: accuracy, confusion matrix, per-class report.
predictions = bestmodel.predict(X_test)

print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# Predict the target for the prediction-set features Xt.
predictions_sub = bestmodel.predict(Xt)


0.8209383323089531
[[28474  2142]
 [ 4850  3582]]
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     30616
           1       0.63      0.42      0.51      8432

    accuracy                           0.82     39048
   macro avg       0.74      0.68      0.70     39048
weighted avg       0.81      0.82      0.81     39048



In [None]:
from numpy import arange
def get_models(learning_rates=(0.0001, 0.001, 0.01, 0.1, 1.0),
               max_features_values=range(1, 21),
               subsamples=None,
               random_state=1):
  """Build one GradientBoostingClassifier per hyper-parameter combination.

  Called with no arguments it produces the original 5 x 20 x 10 = 1000-candidate grid.

  Parameters
  ----------
  learning_rates : iterable of float
      Values tried for ``learning_rate``.
  max_features_values : iterable of int or float
      Values tried for ``max_features``. They must be numeric because they are
      rendered with a float format in the dictionary key.
  subsamples : iterable of float, optional
      Values tried for ``subsample``. Defaults to ``arange(0.1, 1.1, 0.1)``,
      i.e. 0.1, 0.2, ..., 1.0.
  random_state : int or None
      Seed handed to every estimator so a re-run gives the same scores.
      Pass ``None`` to get unseeded estimators.

  Returns
  -------
  dict
      Maps a descriptive key ("learning_rate = ... max_features = ... subsample= ...")
      to an unfitted estimator, in grid order.
  """
  if subsamples is None:
    subsamples = arange(0.1, 1.1, 0.1)
  models = dict()
  for i in learning_rates:
    for j in max_features_values:
      for k in subsamples:
        key = "learning_rate = {:.4f} max_features = {:,.4f} subsample= {:,.1f}".format(i,j,k)
        models[key] = GradientBoostingClassifier(learning_rate=i, max_features=j, subsample=k, random_state=random_state)
  return models
 
def evaluate_model(model, X, y):
  """Return whatever ``model.score(X, y)`` yields for the given data."""
  return model.score(X, y)
 

# Fit every candidate from get_models() on the training split and score it on the
# held-out split, printing one line per candidate as it finishes.
# NOTE(review): candidates are ranked on X_test / y_test, the same split on which the
# bestmodel cell reports its accuracy, so that reported accuracy is optimistic.
# Consider carving a validation split out of X_train for model selection.
models = get_models()
results, names = list(), list()
for name, model in models.items():
  model.fit(X_train, y_train)
  # model.score returns a single number, so it is used directly (no averaging needed).
  scores = evaluate_model(model, X_test, y_test)
  results.append(scores)
  names.append(name)
  print('>%s %.3f ' % (name, scores))


>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.1 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.2 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.3 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.4 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.5 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.6 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.7 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.8 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 0.9 0.784 
>learning_rate = 0.0001 max_features = 1.0000 subsample= 1.0 0.784 
>learning_rate = 0.0001 max_features = 2.0000 subsample= 0.1 0.784 
>learning_rate = 0.0001 max_features = 2.0000 subsample= 0.2 0.784 
>learning_rate = 0.0001 max_features = 2.0000 subsample= 0.3 0.784 
>learning_rate = 0.0001 max_features = 2.0000 subsample= 0.4 0.784 
>learning_rate = 0.0001 max_features = 2.0000 su