# Ensemble Classifiers

## Import required libraries

In [84]:
import numpy as np
import pandas as pd
import time
import pickle # allows for model to be saved/load to file
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Enable Algorithm Options

In [85]:
# --- Training switches ---
# enable_model_xgboost / enable_model_randomForest guard the fit cells below.
# enable_multiclass_model is not read by any cell visible in this notebook.
enable_model_xgboost = True
enable_model_randomForest = True
enable_multiclass_model = True

# --- Evaluation switches: guard the prediction cells below ---
predict_xgboost = True
predict_randomForest = True

# --- Diagnostics ---
# Passed as `verbose=` to RandomForestClassifier.
verbose_level = 1
# True -> multiclass classification, False -> binary classification.
multiclass = False

# --- Input data (CSV) ---
file_input = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'

# --- File names under which trained models are stored ---
file_xgboost = 'xgboost_nov_01.model'
file_random_forest = 'random_forest_nov_01.model'

## Boosting

### XGBoost

In [86]:
# Read the configured CSV into a DataFrame, explicitly requesting the Python parser engine.
df = pd.read_csv(filepath_or_buffer=file_input, engine='python')

In [87]:
# Display the dimensions of the loaded frame as (rows, columns).
df.shape

(3655334, 17)

In [88]:
# Split the frame into features (X) and target (Y).
# The target is the last column; every column before it is a feature.
# Slicing relative to the end (":-1") instead of a hard-coded "0:16" keeps the
# split correct if feature columns are added or removed; for the current
# 17-column file the selection is identical.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

In [89]:
# Display the container type of the feature selection.
type(X)

pandas.core.frame.DataFrame

In [90]:
# Display the container type of the target selection.
type(Y)

pandas.core.series.Series

In [91]:
# Hold out a fraction `test_size` of the rows for evaluation; the fixed
# `seed` is passed as random_state so the split is repeatable across runs.
seed = 10
test_size = 0.33
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [92]:
# Fit the XGBoost classifier on the training data and persist it to disk.
# Wall-clock timestamps are printed before and after so the cost of the fit
# is visible in the cell output.
if enable_model_xgboost:
    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

    model = xgboost.XGBClassifier(silent=False, n_jobs=10)
    model.fit(X_train, y_train)

    # Save the fitted model under the configured file name. The context
    # manager guarantees the handle is flushed and closed even if pickling fails.
    with open(file_xgboost, "wb") as model_file:
        pickle.dump(model, model_file)

    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

Fri Nov 30 00:03:59 2018
[00:04:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:04] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra no

[00:04:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:04:33] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[00:04:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[00:05:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:31] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:32] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[00:05:58] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:05:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[00:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
Fri Nov 30 00:06:00 2018


### Predict XGBoost

In [93]:
# Evaluate the persisted XGBoost model on the held-out test set.
if predict_xgboost:
    # Load the model from the configured file.
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load model files that were written by this notebook.
    with open(file_xgboost, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # XGBClassifier.predict already returns class labels, so no per-element
    # rounding is needed before scoring.
    y_pred = loaded_model.predict(X_test)
    predictions = y_pred  # name kept for any later cell that reads `predictions`

    # Fraction of test rows whose predicted label matches the true label.
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 64.52%


## Bagging Ensemble

### Random Forest

In [94]:
# Train the Random Forest (bagging ensemble) and persist it to disk.
# Progress messages and wall-clock timestamps are printed around the fit.
if enable_model_randomForest:
    print("Ensemble (Bagging): Random Forest: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    forest = RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0, n_jobs=10, verbose=verbose_level)
    print("Ensemble (Bagging): Random Forest: Fit")
    forest.fit(X_train, y_train)

    # Save the fitted forest. The object pickled here must be `forest`:
    # pickling `model` would write the XGBoost classifier into the Random
    # Forest file. The context manager closes the handle after writing.
    with open(file_random_forest, "wb") as model_file:
        pickle.dump(forest, model_file)

    # Separate name for the end timestamp so `t_start` keeps the start time.
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("Ensemble (Bagging): Random Forest: End")

Ensemble (Bagging): Random Forest: Start
Fri Nov 30 00:06:03 2018
Ensemble (Bagging): Random Forest: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min


Fri Nov 30 00:07:51 2018
Ensemble (Bagging): Random Forest: End


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:  1.8min finished


In [95]:
# Evaluate the persisted Random Forest on the train and test splits and print
# accuracy, the confusion matrix and the per-class classification report.
if predict_randomForest:

    # Load the model from the configured file.
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load model files that were written by this notebook.
    with open(file_random_forest, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # Score the model that was read from disk, so the persisted artifact is
    # what gets evaluated. If the file does not actually hold a Random Forest
    # (stale or mis-saved pickle), report it and fall back to the in-memory
    # `forest` so the numbers below are never attributed to the wrong estimator.
    if isinstance(loaded_model, RandomForestClassifier):
        rf_model = loaded_model
    else:
        print("WARNING: %s does not contain a RandomForestClassifier (found %s); "
              "using the in-memory forest instead"
              % (file_random_forest, type(loaded_model).__name__))
        rf_model = forest

    print("Ensemble (Bagging): Random Forest: Predict")
    y_pred = rf_model.predict(X_test)

    print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(rf_model.score(X_train, y_train)))
    # score(X_test, y_test) is accuracy_score(y_test, predict(X_test)); reuse
    # y_pred instead of running a second prediction pass over the test set.
    print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred)))

    print("Ensemble (Bagging): Random Forest: Confusion Matrix")
    cnf_matrix_rf = confusion_matrix(y_test, y_pred)
    print(cnf_matrix_rf)

    print("Ensemble (Bagging): Random Forest: Classification Report")
    print(classification_report(y_test, y_pred))

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


Ensemble (Bagging): Random Forest: Predict


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    7.2s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:   10.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   13.4s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:   22.1s finished


Accuracy of RandomForest classifier on train set: 1.00


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    7.2s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:   10.7s finished


Accuracy of RandomForest classifier on test set: 0.64
Ensemble (Bagging): Random Forest: Confusion Matrix
[[288687 229561     85]
 [201006 479113    302]
 [  1041   6382     84]]
Ensemble (Bagging): Random Forest: Classification Report
              precision    recall  f1-score   support

           1       0.59      0.56      0.57    518333
           2       0.67      0.70      0.69    680421
           3       0.18      0.01      0.02      7507

   micro avg       0.64      0.64      0.64   1206261
   macro avg       0.48      0.42      0.43   1206261
weighted avg       0.63      0.64      0.63   1206261

