# Ensemble Classifiers

## Import required libraries

In [26]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25
import time
import pickle # allows for model to be saved/load to file
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.api.types import CategoricalDtype

In [27]:
# this function converts the data frame to the appropriate data type
def convert_type(data):
    data = data.astype('category')
    data['C_MNTH'] = data['C_MNTH'].astype(CategoricalDtype(ordered=True))
    data['C_WDAY'] = data['C_WDAY'].astype(CategoricalDtype(ordered=True))
    data['C_HOUR'] = data['C_HOUR'].astype(CategoricalDtype(ordered=True))
    data['C_VEHS'] = data['C_VEHS'].astype(CategoricalDtype(ordered=True))
    data['P_AGE'] = data['P_AGE'].astype(CategoricalDtype(ordered=True))
    data['P_PSN'] = data['P_PSN'].astype(CategoricalDtype(ordered=True))
    data['P_ISEV'] = data['P_ISEV'].astype('int')
    return data

## Enable Algorithm Options

In [31]:
# Enable Algorithms
enable_model_xgboost = True
enable_model_randomForest = True
enable_multiclass_model = True

predict_xgboost = True
predict_randomForest = True


#Debug
verbose_level=1
#Multiclass classification, binary if falase
multiclass = False
over_sample = True

# Datafile
#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

if over_sample:
    datafile_train = inputfile_train_O
else:
    datafile_train = inputfile_train_U

datafile_test = inputfile_test


#file_input = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'

model_max_iter = 1000
datestr = 'dec_07_binary_run_1000_BO'

# Model File Names for storage
file_random_forest = 'random_forest_'  + datestr + '.model'
file_xgboost = 'xgboost_'  + datestr + '.model'

## Boosting

### XGBoost

In [29]:
#df = pd.read_csv(file_input, engine = 'python')

#load data
df_test = pd.read_csv(datafile_test, engine = 'python')
df_train = pd.read_csv(datafile_train, engine = 'python')
df = df_train.copy()

print(df_test.head(2))
print(df_train.head(2))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       8       5       2       2      36       2       1       1       1   
1       6       5       2       2      33       2       1       1       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       2      2      1      1       3       1  
1       1      1      4      1       2       0  
   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       6       2       4       2      23       1       1       1       1   
1       2       6       1       1       3       1       4       6       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       7      2      4      1       1       1  
1       7      1      4      1       1       1  


In [30]:
print('Train data set: {}'.format(df_train.shape))
print('Test data set: {}'.format(df_test.shape))

Train data set: (3878536, 15)
Test data set: (1444846, 15)


In [6]:
# split data into X and y
#X = df.iloc[:,0:16]
#Y = df.iloc[:,-1]

In [7]:
#type(X)

In [8]:
#type(Y)

In [32]:
print(df_test.isnull().sum().sum())
print(df_train.isnull().sum().sum())

0
0


In [33]:
print(df_test[df_test.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
print(df_train[df_train.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0
0


In [34]:
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

# convert to the correct type
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444846 entries, 0 to 1444845
Data columns (total 15 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-null category
P_USER    1444846 non-null category
P_ISEV    1444846 non-null int32
dtypes: category(14), int32(1)
memory usage: 24.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3878536 entries, 0 to 3878535
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF  

In [35]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

print("Total Number of Rows : {}".format(total_test_Rows + total_train_Rows))

Number of Rows in test data: 1444846
Number of Rows in train data: 3878536
Total Number of Rows : 5323382


In [36]:
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

In [37]:
# split data into X and y
X_train = df_train_cat.iloc[:,0:14]
Y_train = df_train_cat.iloc[:,-1]

# split data into X and y
X_test = df_test_cat.iloc[:,0:14]
Y_test = df_test_cat.iloc[:,-1]

In [38]:
#Split between data and class for training
#Y_train = df_train_cat[df_train_cat.columns[-1]]
#X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

#Y_test = df_test_cat[df_test_cat.columns[-1]]
#X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [39]:
print(Y_train.unique())

[1 0]


In [40]:
print(X_train.head(10))
print(X_test.head(10))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       6       2       4       2      23       1       1       1       1   
1       2       6       1       1       3       1       4       6       1   
2       4       4       2       2      36       2       4       4       1   
3       2       4       3       2      35       2       1       1       1   
4      10       7       3       2       6       2       3       2       1   
5       6       5       3       2      21       2       2       2       1   
6       5       7       3       4      21       1       1       1       1   
7       4       1       2       2      35       3       1       1       1   
8       9       5       3       2      24       1       1       1       2   
9      11       2       3       3      51       1       2       1       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  
0       7      2      4      1       1  
1       7      1      4      1       1  
2       2      2      4      

In [41]:
# fit model no training data
if (enable_model_xgboost):
    t_ =  time.time()
    print(time.asctime( time.localtime(t_) ))
    
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)
    
    model = xgboost.XGBClassifier(silent=False, n_jobs=10)
    model.fit(nX_train, nY_train)
    
    # save model to file
    pickle.dump(model, open(file_xgboost, "wb"))
    
    t_ =  time.time()
    print(time.asctime( time.localtime(t_) ))

Fri Dec  7 16:45:49 2018
[16:45:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:45:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra no

[16:46:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:36] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:37] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:46:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

In [42]:
X_test.isnull().sum()

C_MNTH    0
C_WDAY    0
C_HOUR    0
C_VEHS    0
C_CONF    0
C_RCFG    0
C_WTHR    0
C_RSUR    0
C_RALN    0
C_TRAF    0
P_SEX     0
P_AGE     0
P_PSN     0
P_USER    0
dtype: int64

### Predict XGBoost

In [43]:
nX_test = np.array(X_test)
nY_test = np.array(Y_test)
print(nX_test)

[[ 8  5  2 ...  1  1  3]
 [ 6  5  2 ...  4  1  2]
 [ 5  6  3 ...  4  1  1]
 ...
 [10  5  5 ...  1  1  2]
 [12  2  4 ...  5  1  1]
 [ 6  5  4 ...  3  1  1]]


In [44]:
#predictions for test data
if (predict_xgboost):
    # load model from file
    loaded_model = pickle.load(open(file_xgboost, "rb"))
    
    # make predictions for test data
    y_pred = loaded_model.predict(nX_test)
    print(y_pred)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(nY_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

  if diff:


[1 1 0 ... 0 1 1]
Accuracy: 65.94%


In [45]:
type(X_test)

pandas.core.frame.DataFrame

## Bagging Ensemble

### Random Forest

In [46]:
# convert to the correct type
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())
# convert to the correct type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444846 entries, 0 to 1444845
Data columns (total 15 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-null category
P_USER    1444846 non-null category
P_ISEV    1444846 non-null int32
dtypes: category(14), int32(1)
memory usage: 24.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3878536 entries, 0 to 3878535
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF  

In [47]:
if (enable_model_randomForest):
    print("Ensemble (Bagging): Random Forest: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    forest = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0, n_jobs=10, verbose=verbose_level)
    print("Ensemble (Bagging): Random Forest: Fit")
    forest.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(model, open(file_random_forest, "wb"))
    
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    print("Ensemble (Bagging): Random Forest: End")

Ensemble (Bagging): Random Forest: Start
Fri Dec  7 16:46:54 2018
Ensemble (Bagging): Random Forest: Fit


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.6min


Fri Dec  7 16:51:26 2018
Ensemble (Bagging): Random Forest: End


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  4.5min finished


In [48]:
#predictions for test data
if (predict_randomForest):
    
    # load model from file
    loaded_model = pickle.load(open(file_random_forest, "rb"))
    
    print("Ensemble (Bagging): Random Forest: Predict")
    y_pred = forest.predict(X_test)
    
    print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(forest.score(X_train, Y_train)))
    print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))
    
    print("Ensemble (Bagging): Random Forest: Confusion Matrix")
    cnf_matrix_rf = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_rf)
    
    print("Ensemble (Bagging): Random Forest: Classification Report")
    print(classification_report(Y_test,y_pred))

Ensemble (Bagging): Random Forest: Predict


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    7.9s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:   25.1s finished
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   21.6s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:  1.1min finished


Accuracy of RandomForest classifier on train set: 0.85


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    7.9s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:   24.1s finished


Accuracy of RandomForest classifier on test set: 0.63
Ensemble (Bagging): Random Forest: Confusion Matrix
[[387567 226164]
 [301451 529664]]
Ensemble (Bagging): Random Forest: Classification Report
             precision    recall  f1-score   support

          0       0.56      0.63      0.59    613731
          1       0.70      0.64      0.67    831115

avg / total       0.64      0.63      0.64   1444846

