In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [29]:
data = pd.read_csv('../../notebooks/unnormalized.csv')
data.drop(columns=['Unnamed: 0'], inplace = True)
data

Unnamed: 0,Age,Gender,BMI,Symptoms Present?,Fever,Cough,Breathlessness,Travel History,Temp,SPO2,...,POTASSIUM,CHLORIDE,TOTAL BILIRUBIN,DIRECT BILIRUBIN,SGOT,SGPT,TOTAL PROTEINS,ALBUMIN,ALKALINE PHOSPHATASE,C-REACTIVE PROTEINS
0,53,1,22.5,1,1,1,1,0,96.8,99.0,...,4.8,108.0,0.5,0.2,81.3,70.0,5.9,3.8,44.1,58.10
1,26,0,25.7,0,0,0,0,0,98.7,98.0,...,4.1,108.0,0.3,0.1,22.2,14.8,6.6,3.9,58.5,3.66
2,28,1,22.2,0,0,0,0,0,98.4,98.0,...,18.1,1.1,0.8,0.3,19.3,12.8,7.0,4.2,86.0,10.17
3,73,1,21.5,1,1,1,1,0,98.0,98.0,...,4.2,104.0,2.4,1.2,59.0,47.9,6.3,3.7,120.0,168.90
4,49,1,27.4,1,1,1,0,0,101.0,98.0,...,3.8,92.0,4.2,2.1,44.6,55.5,5.9,3.1,177.0,164.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,53,1,27.2,1,1,1,1,0,98.9,96.0,...,2.9,97.0,1.4,0.5,43.8,38.8,6.2,3.7,73.3,127.60
171,33,1,26.0,1,1,1,1,0,99.3,98.0,...,4.2,106.0,0.5,0.2,80.6,42.6,6.6,3.8,57.4,138.15
172,70,1,21.4,1,1,0,1,0,98.3,99.0,...,4.2,106.0,1.8,0.6,77.0,27.9,5.9,3.3,60.1,143.00
173,65,0,22.4,1,0,1,1,0,98.9,97.0,...,3.8,110.0,1.2,0.6,56.2,43.2,5.6,3.4,216.0,124.00


In [30]:
# data.drop(['qSOFA SCORE'],inplace=True)
X= data.drop(['Outcome','qSOFA SCORE'],axis=1)
Y = data['Outcome']

In [35]:
def nestedcv(pipeline,param_grid, X, Y):
    
    
    f1 = [0]*7
    roc = [0]*7
    prec = [0]*7
    rec = [0]*7
    acc = [0]*7
    
    
    cv_outer = KFold(n_splits=7, random_state=1, shuffle=True)
    i = 0
    
    for train_ix, test_ix in cv_outer.split(X):
        
        X_train = X.iloc[train_ix]
        X_test =  X.iloc[test_ix]
        y_train = Y.iloc[train_ix]
        y_test =  Y.iloc[test_ix]
        scaler = MinMaxScaler()
        
        model = ExtraTreesClassifier()
        model.fit(scaler.fit_transform(X_train),y_train)

        feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
        x = feat_importances.nlargest(5)
        features = np.array(x.index)
        print(features)

        gs = GridSearchCV(estimator=pipeline, param_grid = param_grid,
                     cv = 6, scoring = 'accuracy', n_jobs = -1, refit = True)
        
        result = gs.fit(X_train[features],y_train)
        print(result.best_params_)
        best_model = result.best_estimator_
        
        y_hat = best_model.predict(X_test[features])
        f1[i] = metrics.f1_score(y_test, y_hat)
        roc[i] = metrics.roc_auc_score(y_test, y_hat)
        prec[i] = metrics.precision_score(y_test, y_hat)
        rec[i] = metrics.recall_score(y_test,y_hat)
        acc[i] = metrics.accuracy_score(y_test,y_hat)
        i+=1
        print(i)
    
    
    arr = [np.mean(acc),np.mean(f1),np.mean(roc),np.mean(prec),np.mean(rec),]
    return arr

In [36]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),LogisticRegression(max_iter=10000))

param_grid = {'logisticregression__solver' : ['newton-cg', 'lbfgs', 'liblinear','sag','saga'],
'logisticregression__penalty' : ['l2'],
'logisticregression__C' : [300, 100, 30, 10, 3, 1.0, 0.3, 0.1, 0.03, 0.01]} 

lg = nestedcv(pipeline,param_grid,X,Y)
lg

['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'SGOT' 'Age']
{'logisticregression__C': 100, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
1
['Respiratory rate(breaths per minute)' 'Breathlessness'
 'C-REACTIVE PROTEINS' 'TLC COUNT' 'Age']
{'logisticregression__C': 300, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
2
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'UREA']
{'logisticregression__C': 30, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
3
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']
{'logisticregression__C': 30, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
4
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']
{'logisticregression__C': 300, 'logisticregression__penalty': 'l2

  _warn_prf(average, modifier, msg_start, len(result))


[0.9257142857142856,
 0.6099773242630385,
 0.7802936194240544,
 0.6785714285714286,
 0.5928571428571427]

In [37]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         RandomForestClassifier())
#
# Create the parameter grid
#

param_grid = {
    'randomforestclassifier__n_estimators': [400, 700],
    'randomforestclassifier__max_depth': [15,20],
    'randomforestclassifier__max_leaf_nodes': [50, 100]
}

rf = nestedcv(pipeline,param_grid,X,Y)
rf

['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']
{'randomforestclassifier__max_depth': 15, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__n_estimators': 400}
1
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'randomforestclassifier__max_depth': 15, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__n_estimators': 700}
2
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'UREA']
{'randomforestclassifier__max_depth': 15, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__n_estimators': 400}
3
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'randomforestclassifier__max_depth': 15, 'randomforestclassifier__max_leaf_nodes': 50, 'randomforestclassifier__n_estimators': 400}
4
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breat

[0.96,
 0.8166666666666667,
 0.8730848861283643,
 0.9285714285714286,
 0.7523809523809524]

In [38]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         MLPClassifier(max_iter=3000))
#
# Create the parameter grid
#
pipeline.get_params().keys()
param_grid = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100,)],
    'mlpclassifier__activation': ['relu','logistic'],
    'mlpclassifier__solver': ['lbfgs'],
    'mlpclassifier__alpha': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
    'mlpclassifier__learning_rate': ['constant','adaptive'],
}

mlp = nestedcv(pipeline,param_grid,X,Y)
mlp

['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.1, 'mlpclassifier__hidden_layer_sizes': (50,), 'mlpclassifier__learning_rate': 'constant', 'mlpclassifier__solver': 'lbfgs'}
1
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.3, 'mlpclassifier__hidden_layer_sizes': (100,), 'mlpclassifier__learning_rate': 'constant', 'mlpclassifier__solver': 'lbfgs'}
2
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'UREA']
{'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.1, 'mlpclassifier__hidden_layer_sizes': (100,), 'mlpclassifier__learning_rate': 'constant', 'mlpclassifier__solver': 'lbfgs'}
3
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']
{'mlpclassifier__activation': 'log

[0.96,
 0.8362193362193361,
 0.9042073351079561,
 0.8761904761904761,
 0.8285714285714285]

In [39]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         XGBClassifier())
#
# Create the parameter grid
#
# pipeline.get_params().keys()

param_grid = {
    'xgbclassifier__n_estimators': [400, 700],
    'xgbclassifier__colsample_bytree': [0.7, 0.8],
    'xgbclassifier__max_depth': [15,20],
    'xgbclassifier__reg_alpha': [1.1, 1.2],
    'xgbclassifier__reg_lambda': [1.1, 1.2],
    'xgbclassifier__subsample': [0.7, 0.8]
}
xgb = nestedcv(pipeline,param_grid,X,Y)
xgb

['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'SGOT']





















































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.7, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 700, 'xgbclassifier__reg_alpha': 1.1, 'xgbclassifier__reg_lambda': 1.2, 'xgbclassifier__subsample': 0.8}
1
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']




























































































































































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.7, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.1, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.8}
2
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'ALKALINE PHOSPHATASE']












































































































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.8, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.1, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.7}
3
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'UREA']





































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.8, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.2, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.8}
4
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']




































































































































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.8, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.1, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.8}
5
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'TLC COUNT' 'Age']
















































































































































































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.7, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.1, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.8}
6
['Respiratory rate(breaths per minute)' 'Breathlessness'
 'C-REACTIVE PROTEINS' 'Age' 'TLC COUNT']




























































































































































































































































































































































































































































































































































































































































































































































{'xgbclassifier__colsample_bytree': 0.7, 'xgbclassifier__max_depth': 15, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__reg_alpha': 1.2, 'xgbclassifier__reg_lambda': 1.1, 'xgbclassifier__subsample': 0.7}
7


[0.937142857142857,
 0.7531746031746032,
 0.8485796294802507,
 0.8238095238095238,
 0.7238095238095238]

In [40]:
svm = SVC()
pipeline = make_pipeline(MinMaxScaler(),
                         SVC())
#
# Create the parameter grid
#
param_grid = {'svc__C': [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000], 
              'svc__gamma': [3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001],
              'svc__kernel': ['rbf','linear','sigmoid','poly']} 


svc = nestedcv(pipeline,param_grid,X,Y)
svc


['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 10, 'svc__gamma': 3, 'svc__kernel': 'poly'}
1
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 30, 'svc__gamma': 3, 'svc__kernel': 'linear'}
2
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 3, 'svc__gamma': 3, 'svc__kernel': 'poly'}
3
['C-REACTIVE PROTEINS' 'Respiratory rate(breaths per minute)'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 30, 'svc__gamma': 3, 'svc__kernel': 'poly'}
4
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 3, 'svc__gamma': 3, 'svc__kernel': 'rbf'}
5
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Breathlessness' 'Age' 'TLC COUNT']
{'svc__C': 3, 'svc__gamma': 3, 'svc__kernel': 'rbf'}
6
['Respiratory rate(breaths per minute)' 'C-REACTIVE PROTEINS'
 'Bre

[0.9314285714285714,
 0.6825396825396826,
 0.8141969831410825,
 0.7877551020408163,
 0.6619047619047619]

In [44]:
headings = ['Model','Accuracy Score','Precision Score','Recall Score','F1 Score','ROC-AUC']
data = [['Logistic Regression',0.926, 0.678, 0.593, 0.610,0.780], ['Random Forest',0.96,0.928,
 0.752,
 0.817,
 0.873
 ], ['Support Vector Machine',
0.931,
                                                                                       0.787,
 0.662,
 0.683,
 0.814,
 ],['XGBoost', 0.937,
    0.824,
 0.724,
 0.753,
 0.849,
 ],['Multi-Layer Perception',0.960,
                 0.876,
 0.828,
 0.836,
 0.904,
 ]]
df = pd.DataFrame(data, columns = headings)
df

Unnamed: 0,Model,Accuracy Score,Precision Score,Recall Score,F1 Score,ROC-AUC
0,Logistic Regression,0.926,0.678,0.593,0.61,0.78
1,Random Forest,0.96,0.928,0.752,0.817,0.873
2,Support Vector Machine,0.931,0.787,0.662,0.683,0.814
3,XGBoost,0.937,0.824,0.724,0.753,0.849
4,Multi-Layer Perception,0.96,0.876,0.828,0.836,0.904
