In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics



In [4]:
data = pd.read_csv('../../notebooks/unnormalized.csv')
data.drop(columns=['Unnamed: 0'], inplace = True)
data

Unnamed: 0,Age,Gender,BMI,Symptoms Present?,Fever,Cough,Breathlessness,Travel History,Temp,SPO2,...,POTASSIUM,CHLORIDE,TOTAL BILIRUBIN,DIRECT BILIRUBIN,SGOT,SGPT,TOTAL PROTEINS,ALBUMIN,ALKALINE PHOSPHATASE,C-REACTIVE PROTEINS
0,53,1,22.5,1,1,1,1,0,96.8,99.0,...,4.8,108.0,0.5,0.2,81.3,70.0,5.9,3.8,44.1,58.10
1,26,0,25.7,0,0,0,0,0,98.7,98.0,...,4.1,108.0,0.3,0.1,22.2,14.8,6.6,3.9,58.5,3.66
2,28,1,22.2,0,0,0,0,0,98.4,98.0,...,18.1,1.1,0.8,0.3,19.3,12.8,7.0,4.2,86.0,10.17
3,73,1,21.5,1,1,1,1,0,98.0,98.0,...,4.2,104.0,2.4,1.2,59.0,47.9,6.3,3.7,120.0,168.90
4,49,1,27.4,1,1,1,0,0,101.0,98.0,...,3.8,92.0,4.2,2.1,44.6,55.5,5.9,3.1,177.0,164.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,53,1,27.2,1,1,1,1,0,98.9,96.0,...,2.9,97.0,1.4,0.5,43.8,38.8,6.2,3.7,73.3,127.60
171,33,1,26.0,1,1,1,1,0,99.3,98.0,...,4.2,106.0,0.5,0.2,80.6,42.6,6.6,3.8,57.4,138.15
172,70,1,21.4,1,1,0,1,0,98.3,99.0,...,4.2,106.0,1.8,0.6,77.0,27.9,5.9,3.3,60.1,143.00
173,65,0,22.4,1,0,1,1,0,98.9,97.0,...,3.8,110.0,1.2,0.6,56.2,43.2,5.6,3.4,216.0,124.00


In [5]:
# data.drop(['qSOFA SCORE'],inplace=True)
X= data.drop(['Outcome'],axis=1)
Y = data['Outcome']

In [6]:
def nestedcv(pipeline,param_grid,arr, X, Y):
    
    gs = GridSearchCV(estimator=pipeline, param_grid = param_grid,
                 cv = 5, scoring = 'accuracy', n_jobs = -1, refit = True)
    
    model = ExtraTreesClassifier()
    model.fit(X,Y)

    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    x = feat_importances.nlargest(6)
    features = np.array(x.index)

    #
    # Calculate the generalization error / accuracy
    scores = cross_validate(gs, X[features], Y, scoring=['f1','roc_auc','precision','recall'], cv=7)
    #
    # Print the mean scores and standard deviation
    #

    for k,v in scores.items():
        if k=='score_time' or k=='fit_time':
            continue
        arr.append(np.mean(v))

    return arr

In [7]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),LogisticRegression(max_iter=10000))

param_grid = {'logisticregression__solver' : ['newton-cg', 'lbfgs', 'liblinear','sag','saga'],
'logisticregression__penalty' : ['l2'],
'logisticregression__C' : [300, 100, 30, 10, 3, 1.0, 0.3, 0.1, 0.03, 0.01]} 

lg = nestedcv(pipeline,param_grid,[],X,Y)
lg

[0.6204081632653062,
 0.9438775510204082,
 0.7142857142857143,
 0.5714285714285714]

In [8]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         RandomForestClassifier())
#
# Create the parameter grid
#

param_grid = {
    'randomforestclassifier__n_estimators': [400, 700, 1000],
    'randomforestclassifier__max_depth': [15,20,25],
    'randomforestclassifier__max_leaf_nodes': [50, 100, 200]
}

rf = nestedcv(pipeline,param_grid,[],X,Y)
rf

[0.7506802721088436,
 0.9495980210265925,
 0.8452380952380951,
 0.7023809523809523]

In [None]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         MLPClassifier(max_iter=10000))
#
# Create the parameter grid
#
pipeline.get_params().keys()
param_grid = {
    'mlpclassifier__hidden_layer_sizes': [(50,), (100,), (50,100)],
    'mlpclassifier__activation': ['tanh', 'relu','logistic'],
    'mlpclassifier__solver': ['lbfgs','sgd'],
    'mlpclassifier__alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
    'mlpclassifier__learning_rate': ['constant','adaptive','invscaling'],
}

mlp = nestedcv(pipeline,param_grid,[],X,Y)
mlp

In [None]:
# Pipeline created using Logistic Regression
pipeline = make_pipeline(MinMaxScaler(),
                         XGBClassifier())
#
# Create the parameter grid
#
# pipeline.get_params().keys()

param_grid = {
    'xgbclassifier__n_estimators': [400, 700, 1000],
    'xgbclassifier__colsample_bytree': [0.7, 0.8],
    'xgbclassifier__max_depth': [15,20,25],
    'xgbclassifier__reg_alpha': [1.1, 1.2, 1.3],
    'xgbclassifier__reg_lambda': [1.1, 1.2, 1.3],
    'xgbclassifier__subsample': [0.7, 0.8, 0.9]
}
xgb = nestedcv(pipeline,param_grid,[],X,Y)
xgb

In [11]:
svm = SVC()
pipeline = make_pipeline(MinMaxScaler(),
                         SVC())
#
# Create the parameter grid
#
param_grid = {'svc__C': [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000], 
              'svc__gamma': [3, 1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001, 'scale','auto'],
              'svc__kernel': ['rbf','linear','sigmoid','poly']} 


svc = nestedcv(pipeline,param_grid,[],X,Y)
svc


[0.719882498453927, 0.927643784786642, 0.7363945578231291, 0.738095238095238]