In [105]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, make_scorer, recall_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing data

In [2]:
feats_dummy = pd.read_pickle(r'/Users/philliprichardson/Metis/Module 4/feats.pkl')
outcome = pd.read_pickle(r'/Users/philliprichardson/Metis/Module 4/outcome.pkl')


## Baseline XGBoost Model

In [3]:
kf = StratifiedKFold(n_splits = 3, random_state = 13, shuffle = True)

xtrain, xtest, ytrain, ytest = train_test_split(feats_dummy, outcome, test_size=0.2, stratify = outcome, random_state = 13)

ev =[(xtrain,ytrain),(xtest,ytest)]

In [14]:
gbm = xgb.XGBClassifier(n_estimators=300, #arbitrary large number
                       max_depth=3,
                       objective="multi:softmax",  # Other options: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
                       learning_rate=.1, 
                       subsample=1,
                       min_child_weight=1,
                       colsample_bytree=.8)

gbm.fit(xtrain, ytrain, early_stopping_rounds=20, eval_metric = 'mlogloss' , eval_set = ev, verbose = 0)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Creating sample weights to account for class imbalance

In [4]:
classes = outcome.unique()
weights = compute_class_weight('balanced', classes, y = outcome)
outcome.unique(), weights
weight = {}
for i in range(3):
    weight[classes[i]] = weights[i]



## Setting Fit Params

In [139]:
sample_weights = [weight[y] for y in ytrain]


params = {'sample_weight' : sample_weights,
         'eval_metric' : 'mlogloss',
         'eval_set':ev,
         'verbose': False}

In [38]:
bal = xgb.XGBClassifier(n_estimators=300, #arbitrary large number
                       max_depth=3,
                       objective="multi:softmax",  # Other options: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
                       learning_rate=.1, 
                       subsample=1,
                       min_child_weight=1,
                       colsample_bytree=.8)



In [39]:
weighted = cross_val_score(bal, xtrain, ytrain, cv = kf, 
                           scoring = make_scorer(recall_score, average = 'macro'),
                          fit_params = params )














array([0.59769741, 0.59369269, 0.59928834])

In [40]:
baseline = cross_val_score(gbm, xtrain, ytrain, cv = kf, 
                           scoring = make_scorer(recall_score, average = 'macro'))














## Second baseline with class imbalance accountedd for

In [42]:
np.mean(baseline), np.mean(weighted)

(0.4873665568175572, 0.5968928126316188)

## Creating a pseudo randomsearchCV, sklearn randomsearchCV couldn't handle fit_params

In [148]:
n_estimators = [100, 500, 1000]
max_depth = [2,3,5,7, 10]
learning = [.05, .1, .15, .2, .25, .3]
subsample = [.8, .85, .9, 1]
minchild = [1,3,5,7,10]
colsample = [.5, .8, 1]

In [208]:
rand_params = []

for i in n_estimators:
    for j in max_depth:
        for k in learning:
            for l in subsample:
                for m in minchild:
                    for n in colsample:
                        rand_params.append([i,j,k,l,m,n])
                        
            

In [155]:
tuning2 = []

In [157]:
for i in range(100):
    
    idx = np.random.randint(len(rand_params))

    gbm_tuning = xgb.XGBClassifier(n_estimators=rand_params[idx][0], 
                       max_depth=rand_params[idx][1],
                       objective="multi:softmax",  
                       learning_rate=rand_params[idx][2], 
                       subsample=rand_params[idx][3],
                       min_child_weight=rand_params[idx][4],
                       colsample_bytree=rand_params[idx][5])

    score =  np.mean(cross_val_score(gbm_tuning, xtrain, ytrain, cv = kf, 
                           scoring = make_scorer(recall_score, average = 'macro'),
                          fit_params = params))

    tuning2.append((gbm_tuning.get_params, score))
    
    del(rand_params[idx])



## Finding best performing model from the random search

In [163]:
maxi = 0
for i in tuning:

    if i[1] > maxi:
        par = i[0]
        maxi = i[1]

In [184]:
maxi, par

(0.5976420379859918,
 <bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=0.8, gamma=None,
               gpu_id=None, importance_type='gain', interaction_constraints=None,
               learning_rate=0.25, max_delta_step=None, max_depth=3,
               min_child_weight=5, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=None, num_parallel_tree=None,
               objective='multi:softmax', random_state=None, reg_alpha=None,
               reg_lambda=None, scale_pos_weight=None, subsample=0.9,
               tree_method=None, validate_parameters=None, verbosity=None)>)

### Hand Tuning around best params

In [189]:
gbm_rand = xgb.XGBClassifier(n_estimators=100, 
                       max_depth=3,
                       objective="multi:softmax",  
                       learning_rate=0.25, 
                       subsample=0.9,
                       min_child_weight=5,
                       colsample_bytree=0.65)

score =  np.mean(cross_val_score(gbm_rand, xtrain, ytrain, cv = kf, 
                           scoring = make_scorer(recall_score, average = 'macro'),
                          fit_params = params))



In [190]:
score

0.5973675189341435

In [199]:
gbm_rand = xgb.XGBClassifier(n_estimators=100, 
                       max_depth=3,
                       objective="multi:softmax",  
                       learning_rate=0.25, 
                       subsample=0.9,
                       min_child_weight=6,
                       colsample_bytree=0.65)

score =  np.mean(cross_val_score(gbm_rand, xtrain, ytrain, cv = kf, 
                           scoring = make_scorer(recall_score, average = 'macro'),
                          fit_params = params))



In [200]:
score

0.5975193506674397

## Final tuned xgboost model not as good as final tuned random forest model

In [205]:
gbm_rand.fit(xtrain, ytrain, sample_weight = sample_weights, eval_metric = 'mlogloss',
         eval_set = ev, verbose = False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.65, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=3,
              min_child_weight=6, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [207]:
recall_score(ytest, gbm_rand.predict(xtest), average = 'macro')

0.6024880476429454