# XGBoost Grid Search + Training

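This notebook accompanies our paper, "Explainable Prediction of Acute Myocardial Infarction using Machine Learning and Shapley Values". It runs a grid search over XGBoost hyperparameters, trains the final classifier on the training set, and saves the trained model to disk.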

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import xgboost
import time
import pickle
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Loading Data

In [2]:
# Load data
# Import train and test data into dataframes from csv files produced using the data processing code
df_train = pd.read_csv("train.csv", header=None)
# Shuffle the training rows. The fixed random_state keeps the row order (and
# therefore the cross-validation folds built from it during the grid search)
# identical on every Restart & Run All.
df_train = df_train.sample(frac=1, random_state=0)
df_test = pd.read_csv("test.csv", header=None)

In [3]:
# Split each dataframe into a feature matrix (columns 0-10) and a label vector (column 11)
train_x = np.array(df_train.loc[:, list(range(11))])
train_y = df_train.loc[:, 11].to_numpy().astype(np.int8)
test_x = np.array(df_test.loc[:, list(range(11))])
test_y = df_test.loc[:, 11].to_numpy().astype(np.int8)

In [4]:
# Count how many training samples carry each label value
unique, counts = np.unique(train_y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 281967, 1: 141064}

In [5]:
# Count how many testing samples carry each label value
unique, counts = np.unique(test_y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 70559, 1: 35199}

## Grid search for model optimization

In [6]:
# Exhaustive grid search over booster type, learning rate and number of estimators
parameters = {
    'booster': ('gbtree', 'gblinear', 'dart'),
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [10, 50, 100],
}

XGB = XGBClassifier(random_state=0)
clf = GridSearchCV(estimator=XGB, param_grid=parameters)
clf.fit(train_x, train_y)

# List the names of the fields recorded in the search results
sorted(clf.cv_results_)



['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_booster',
 'param_learning_rate',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [7]:
# Display the best estimator found by the grid search; its repr lists the
# selected hyperparameters alongside the values left at their defaults.
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

## Model Training 

In [8]:
# Train the XGBoost model with the optimal parameters found by the grid search.
# The hyperparameters come from clf.best_params_ rather than being retyped by hand,
# so the final model always matches the search result even if the winning
# combination changes. random_state=0 mirrors the estimator used in the search.
model = XGBClassifier(random_state=0, **clf.best_params_)
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [9]:
# Save model to file
# The context manager flushes and closes the file handle even if pickling
# raises, instead of leaving an open handle to the garbage collector.
with open("xgboost_ecgview.model", "wb") as model_file:
    pickle.dump(model, model_file)