In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,recall_score
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
from xgboost import XGBClassifier
import numpy as np
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot

In [None]:
data=pd.read_csv('cleaned.csv') 
data=data.fillna(data.mean())
le=LabelEncoder()
le.fit(data['SARS-Cov-2 exam result'])
data['SARS-Cov-2 exam result']=le.transform(data['SARS-Cov-2 exam result'])

In [None]:
X=data.drop('SARS-Cov-2 exam result',axis=1)
X=X.astype(np.float64)
y=data['SARS-Cov-2 exam result']

In [None]:
_X=X.astype(np.float32)
_y=y.astype(np.float32)

In [None]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: SARS-Cov-2 exam result, dtype: int64

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, 
                                                  n_estimators=1000, 
                                                  max_depth=5,
                                                  min_child_weight=1,
                                                  gamma=0, 
                                                  subsample=0.8, 
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=7.26, seed=27), 
                        param_grid = param_test1, 
                        scoring='accuracy',
                        n_jobs=4,
                        cv=5)

gsearch1.fit(train_X,train_y)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 1}, 0.8582905982905983)

In [None]:
param_test2 = { 
    'gamma':[i/10.0 for i in range(0,5)] 
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, 
                                                  n_estimators=1000, 
                                                  max_depth = 7,
                                                  min_child_weight = 1,
                                                  gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=7.26,seed=27), 
                        param_grid = param_test2, 
                        scoring='accuracy',
                        n_jobs=4,
                        cv=5)

gsearch3.fit(train_X,train_y)
gsearch3.best_params_, gsearch3.best_score_

({'gamma': 0.3}, 0.8671550671550671)

In [None]:
param_test3 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, 
                                                  max_depth = 7,
                                                  min_child_weight = 1,
                                                  gamma=0.2, 
                                                  subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=7.26,seed=27), 
                        param_grid = param_test3, 
                        scoring='accuracy',
                        n_jobs=4,
                        cv=5)

gsearch4.fit(train_X,train_y)
gsearch4.best_params_, gsearch4.best_score_

({'colsample_bytree': 0.7, 'subsample': 0.9}, 0.8737728937728937)

In [None]:
# On réduit --- 
param_test4 = {
 'learning_rate':[i/1000.0 for i in range(5,20,2)]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1000, 
                                                  gamma=0.2,
                                                  subsample=0.8,
                                                  colsample_bytree=0.8,
                                                  max_depth = 7,
                                                  min_child_weight = 1,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=7.26,seed=27), 
                        param_grid = param_test4, 
                        scoring='accuracy',
                        n_jobs=4,
                        cv=5)

gsearch6.fit(train_X,train_y)
gsearch6.best_params_, gsearch6.best_score_

({'learning_rate': 0.005}, 0.884932844932845)

In [None]:

'''
After extensive testing the following parameters were found to produce the best accuracy and recall combination
max_depth :- 3
min_child_weight :- 3
gamma :- 0.3
colsample_bytree :- 0.7
subsample :- 0.8
learning_rate :- 0.005
scale_pos_weight :- 7.26(ratio between positive and negative samples)
'''
model = XGBClassifier(
    learning_rate =0.005,
    n_estimators=500,
    max_depth = 3,
    min_child_weight = 3,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=7.26,
    seed=27
)
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=52)
scores_roc = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
scores_recall = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1)
scores_accuracy = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Mean roc',mean(scores_roc))
print('Mean recall',mean(scores_recall))
print('Mean accuracy',mean(scores_accuracy))
print('Positive Samples :- {} Negative Samples :- {}'.format(len(y[y==1]),len(y)))

Mean roc 0.8906746031746032
Mean recall 0.7386904761904763
Mean accuracy 0.8424540103016924
Positive Samples :- 83 Negative Samples :- 603


In [None]:
#Final Model
model = XGBClassifier(
    learning_rate =0.005,
    n_estimators=1000,
    max_depth = 3,
    min_child_weight = 3,
    gamma=0.3,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=7.26,
    seed=27
)
model.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.3,
              learning_rate=0.005, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=None, n_estimators=1000, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0.005, reg_lambda=1, scale_pos_weight=7.26, seed=27,
              silent=None, subsample=0.7, verbosity=1)

In [None]:
booster=model.get_booster()
print(booster.get_dump()[0])
ls=list(model.feature_importances_)
print(ls)
features=pd.DataFrame({
    'Feature_Name':X.columns,
    'Feature_Import':ls,
})
features.sort_values(by=['Feature_Import'],inplace=True,ascending=False)
features

0:[Leukocytes<-0.491158187] yes=1,no=2,missing=1
	1:[Mean platelet volume <-0.942966223] yes=3,no=4,missing=3
		3:leaf=-0.00817727204
		4:[Basophils<0.53988117] yes=7,no=8,missing=7
			7:leaf=0.00731905317
			8:leaf=-0.00045356687
	2:[Eosinophils<-0.645880461] yes=5,no=6,missing=5
		5:[Platelets<0.525217175] yes=9,no=10,missing=9
			9:leaf=0.00313292257
			10:leaf=-0.00866333302
		6:[Platelets<-1.09525633] yes=11,no=12,missing=11
			11:leaf=0.00138376397
			12:leaf=-0.00904118735

[0.050192274, 0.064600855, 0.034780618, 0.059806336, 0.053749274, 0.044975977, 0.09875116, 0.04179627, 0.051394388, 0.048176672, 0.03335774, 0.13064466, 0.039756574, 0.041565433, 0.08162178, 0.038814254, 0.05859377, 0.027421916]


Unnamed: 0,Feature_Name,Feature_Import
11,Leukocytes,0.130645
6,Platelets,0.098751
14,Eosinophils,0.081622
1,"Patient addmited to regular ward (1=yes, 0=no)",0.064601
3,Patient addmited to intensive care unit (1=yes...,0.059806
16,Monocytes,0.058594
4,Hematocrit,0.053749
8,Red blood Cells,0.051394
0,Patient age quantile,0.050192
9,Lymphocytes,0.048177
