In [1]:
# Import required Libraries
import pandas as pd
import numpy as np

# For scaling the features and train-test split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# For model buidling
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# For hyper-paramter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# For evaluating the model performance
from sklearn.metrics import (f1_score, roc_auc_score,confusion_matrix, accuracy_score,
                             precision_score, recall_score, matthews_corrcoef)


# For reporting the results
from IPython.display import HTML, display
import tabulate

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read data file
df_train = pd.read_csv('training_frame.csv',index_col=False)
df_train.head(5)

Unnamed: 0.1,Unnamed: 0,vitals_datetime,heart_rate,respiration_over_impedence,spirometry_oxygen_saturation,pulse,blood_pressure_systolic,blood_pressure_diastolic,blood_pressure_average,patient_id,machine_id,Coded
0,0,2020-03-28 23:00:25+00:00,111.0,44.0,100.0,110.0,111.0,74.0,79.0,1585133,MICU-11,0.0
1,1,2020-03-28 23:01:10+00:00,85.0,12.0,97.0,92.0,95.0,60.0,77.0,1406566,BAY13-BZ,0.0
2,2,2020-03-28 23:30:29+00:00,92.0,6.0,99.0,92.0,113.0,71.0,93.0,1360999,SICU-09,0.0
3,3,2020-03-28 23:30:29+00:00,64.0,22.0,96.0,65.0,128.0,80.0,88.0,276287,NSU-03,0.0
4,4,2020-03-28 23:30:32+00:00,111.0,22.0,96.0,102.0,100.0,64.0,72.0,1267452,MICU-04,0.0


In [3]:
# reading test data
df_test = pd.read_csv('test_frame.csv',index_col=False)
df_test.head(5)

Unnamed: 0.1,Unnamed: 0,vitals_datetime,heart_rate,respiration_over_impedence,spirometry_oxygen_saturation,pulse,blood_pressure_systolic,blood_pressure_diastolic,blood_pressure_average,patient_id,machine_id,Coded
0,0,2020-08-01 00:00:01+00:00,87.0,11.0,99.0,113.0,127.0,55.0,75.0,1824098,6BTELE-31,0.0
1,1,2020-08-01 00:00:06+00:00,49.0,0.0,100.0,71.0,139.0,65.0,112.0,151310,6WNTX-16,0.0
2,2,2020-08-01 00:00:14+00:00,105.0,20.0,95.0,106.0,86.0,35.0,49.0,1823860,MICU-02,0.0
3,3,2020-08-01 00:00:23+00:00,82.0,23.0,100.0,82.0,187.0,112.0,141.0,1824302,MICU-06,0.0
4,4,2020-08-01 00:00:32+00:00,84.0,18.0,99.0,84.0,124.0,75.0,91.0,131903,MICU-05,0.0


In [4]:
# droping unnecessary columns
df_train = df_train.drop(['Unnamed: 0','vitals_datetime'], axis = 1)
df_test = df_test.drop(['Unnamed: 0','vitals_datetime'], axis = 1)

In [5]:
df_train.dtypes

heart_rate                      float64
respiration_over_impedence      float64
spirometry_oxygen_saturation    float64
pulse                           float64
blood_pressure_systolic         float64
blood_pressure_diastolic        float64
blood_pressure_average          float64
patient_id                        int64
machine_id                       object
Coded                           float64
dtype: object

In [6]:
df_train.isnull().sum() # Good No Null Values!

heart_rate                      0
respiration_over_impedence      0
spirometry_oxygen_saturation    0
pulse                           0
blood_pressure_systolic         0
blood_pressure_diastolic        0
blood_pressure_average          0
patient_id                      0
machine_id                      0
Coded                           0
dtype: int64

In [7]:
df_test.isnull().sum() # Good No Null Values!

heart_rate                      0
respiration_over_impedence      0
spirometry_oxygen_saturation    0
pulse                           0
blood_pressure_systolic         0
blood_pressure_diastolic        0
blood_pressure_average          0
patient_id                      0
machine_id                      0
Coded                           0
dtype: int64

In [8]:
# Encoding catagorical columns
le = LabelEncoder()
df_train['machine_id'] = le.fit_transform(df_train['machine_id'])
df_test['machine_id'] = le.fit_transform(df_test['machine_id'])

In [9]:
train_features = df_train.drop("Coded", axis=1)
train_labels = pd.DataFrame(df_train['Coded'])

In [10]:
test_features = df_test.drop("Coded", axis=1)
test_labels = pd.DataFrame(df_test['Coded'])

In [11]:
sc = StandardScaler()

In [12]:
train_features = sc.fit_transform(train_features)
train_features = pd.DataFrame(train_features)

In [13]:
test_features = sc.fit_transform(test_features)
test_features = pd.DataFrame(test_features)

In [14]:
# defining a function to evaluate trained models
def predict_and_evaluate(model, X_test, y_test):
    '''Predict values for given model & test dataset
    and evaluate the results in terms of FP, FN, F1-score,
    Brier Score, AUC and G-Mean'''
    
    predictions = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    mcc = matthews_corrcoef(y_test,predictions)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    
    metrics = [fp, fn, round(precision,2), round(recall,2), round(f1,2), round(mcc,2),round(accuracy,2)]
    table = [[model.__class__.__name__] + metrics]
    display(HTML(tabulate.tabulate(table,headers=('Algorithm','False Positives', 
                                                  'False Negatives', 'Precision', 
                                                  'Recall', 'F1 Score', 'MCC','accuracy'), 
                                   tablefmt='html')))
    #print(accuracy)
    return [model.__class__.__name__] + metrics

# Random Forest

In [15]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Instantiate model - 100 trees
rf = RandomForestClassifier(n_estimators= 100, 
                           random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels)

  rf.fit(train_features, train_labels)


RandomForestClassifier(random_state=42)

In [16]:
# Results of random forest classifier
rf_results = predict_and_evaluate(rf, test_features, test_labels)

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
RandomForestClassifier,493,28940,0.68,0.04,0.07,0.1,0.71


# Support Vector Machine

In [17]:
svm_clf = SVC()

In [18]:
svm_clf.fit(train_features, train_labels)

  return f(**kwargs)


SVC()

In [19]:
svm_res = predict_and_evaluate(svm_clf, test_features, test_labels)

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
SVC,6325,28446,0.2,0.05,0.08,-0.07,0.65


# Gradient boosting

In [20]:
gbm_clf = GradientBoostingClassifier()

In [21]:
gbm_clf.fit(train_features, train_labels)

  return f(**kwargs)


GradientBoostingClassifier()

In [22]:
gbm_res = predict_and_evaluate(gbm_clf, test_features, test_labels)

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
GradientBoostingClassifier,2872,25321,0.62,0.16,0.25,0.2,0.72


# XGboost

In [23]:
xgb_clf = XGBClassifier()

In [24]:
xgb_clf.fit(train_features, train_labels)

  return f(**kwargs)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
xgb_res = predict_and_evaluate(xgb_clf,  test_features, test_labels)

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
XGBClassifier,7181,22104,0.52,0.26,0.35,0.21,0.71


# Putting all the results together

In [26]:
display(HTML('<h3>With Default parameters:</h3>'))
table = [svm_res,gbm_res,xgb_res,rf_results]
display(HTML(tabulate.tabulate(table,headers=('Algorithm','False Positives', 
                                                  'False Negatives', 'Precision', 
                                                  'Recall', 'F1 Score', 'MCC','accuracy'),
                               tablefmt='html')))

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
SVC,6325,28446,0.2,0.05,0.08,-0.07,0.65
GradientBoostingClassifier,2872,25321,0.62,0.16,0.25,0.2,0.72
XGBClassifier,7181,22104,0.52,0.26,0.35,0.21,0.71
RandomForestClassifier,493,28940,0.68,0.04,0.07,0.1,0.71


# Hyper tuning

In [26]:
rf_params = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
             'n_estimators': [100, 200, 300,]}

In [27]:
grid_rf = RandomizedSearchCV(estimator = rf, param_distributions = rf_params, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [28]:
grid_rf.fit(train_features, train_labels)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  3.2min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100],
                                        'n_estimators': [100, 200, 300]},
                   random_state=42, verbose=2)

In [29]:
grid_rf.best_params_

{'n_estimators': 300, 'max_depth': 40}

In [30]:
rf = grid_rf.best_estimator_

In [31]:
rf_res_hpo = predict_and_evaluate(rf, test_features, test_labels)

Algorithm,False Positives,False Negatives,Precision,Recall,F1 Score,MCC,accuracy
RandomForestClassifier,622,27350,0.81,0.09,0.16,0.2,0.72


In [33]:
# Parameter matrix for SVC
svc_params = {'C': [0.5, 0.7, 0.9, 1],
              'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

In [34]:
grid_svc = GridSearchCV(SVC(), svc_params, n_jobs = -1)

In [None]:
grid_svc.fit(train_features, train_labels)

In [None]:
# SVC best parameters
grid_svc.best_params_

In [None]:
# SVC best estimator
svc = grid_svc.best_estimator_

In [None]:
# predictons
svm_res_hpo = predict_and_evaluate(svc, test_features, test_labels)

In [None]:
# Parameter matrix for GBM
gbm_params = {  "n_estimators":[100,150,200],
                "learning_rate": [0.01, 0.025, 0.05],
                "max_depth":[3,5],
                "subsample":[ 0.8, 0.9,1.0]}

In [None]:
grid_gbm = GridSearchCV(GradientBoostingClassifier(), gbm_params, n_jobs = -1)

In [None]:
%time
grid_gbm.fit(train_features, train_labels)

In [None]:
# GBM best parameters
grid_gbm.best_params_

In [None]:
gbm = grid_gbm.best_estimator_

In [None]:
# predictons
gbm_res_hpo = predict_and_evaluate(gbm, test_features, test_labels)

In [None]:
xgb_params = {'n_estimators' : [100,400],
         "learning_rate"    : [0.01,0.05] ,
         "max_depth"        : [ 3, 5]}
         "min_child_weight" : [ 3, 5]}

In [None]:
grid_xgb = GridSearchCV(XGBClassifier(), xgb_params, n_jobs = -1)

In [None]:
%time
grid_xgb.fit(train_features, train_labels)

In [None]:
grid_xgb.best_params_

In [None]:
xgb = grid_xgb.best_estimator_

In [None]:
# predictons
xgb_res_hpo = predict_and_evaluate(xgb, test_features, test_labels)

In [None]:
display(HTML('<h3>With Hyperparameter Optimization:</h3>'))

table = [svm_res_hpo ,gbm_res_hpo ,xgb_res_hpo,rf_res_hpo]
display(HTML(tabulate.tabulate(table,headers=('Algorithm','False Positives', 
                                                  'False Negatives', 'Precision', 
                                                  'Recall', 'F1 Score', 'MCC','accuracy'),
                               tablefmt='html')))