# Ensemble Classifiers

## Import required libraries

In [29]:
import numpy as np
import pandas as pd
import time
import pickle # allows for model to be saved/load to file
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.api.types import CategoricalDtype

## Enable Algorithm Options

In [30]:
# Enable Algorithms
enable_model_xgboost = True
enable_model_randomForest = True
enable_multiclass_model = True

predict_xgboost = True
predict_randomForest = True


#Debug
verbose_level=1
#Multiclass classification, binary if falase
multiclass = False
over_sample = True

# Datafile
#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

if over_sample:
    datafile_train = inputfile_train_O
else:
    datafile_train = inputfile_train_U

datafile_test = inputfile_test


#file_input = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'

model_max_iter = 1000
datestr = 'dec_06_binary_run_1000_BO'

# Model File Names for storage
file_random_forest = 'random_forest_'  + datestr + '.model'
file_xgboost = 'xgboost_'  + datestr + '.model'

## Boosting

### XGBoost

In [31]:
#df = pd.read_csv(file_input, engine = 'python')

#load data
df_test = pd.read_csv(datafile_test, engine = 'python')
df_train = pd.read_csv(datafile_train, engine = 'python')
df = df_train.copy()

print(df_test.head(2))
print(df_train.head(2))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       5       2       1       2      21       1       3       2       1   
1       1       1       1       2      21       1       1       1       1   

   C_TRAF  P_SEX     P_AGE  P_PSN  P_USER  P_ISEV  
0       3      1  0.326531      1       1       1  
1       1      0  0.316327      1       1       1  
   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0      12       7       4       3      51       1       1       1       1   
1      11       6       3       3      21       2       1       1       2   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       1      1    0.0      1       1       1  
1       1      0    0.0      2       2       0  


In [32]:
df.shape

(3361538, 15)

In [33]:
# split data into X and y
#X = df.iloc[:,0:16]
#Y = df.iloc[:,-1]

In [34]:
#type(X)

In [35]:
#type(Y)

In [36]:
print(df_test.isnull().sum().sum())
print(df_train.isnull().sum().sum())

0
0


In [37]:
print(df_test[df_test.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
print(df_train[df_train.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

1235072
3361538


In [38]:
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

df_test_cat = df_test_cat.astype('category')
df_train_cat = df_train_cat.astype('category')

# convert to the correct type
df_train_cat['C_MNTH'] = df_train_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_WDAY'] = df_train_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_HOUR'] = df_train_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_VEHS'] = df_train_cat['C_VEHS'].astype(CategoricalDtype(ordered=True))
#df_train_cat['V_YEAR'] = df_train_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_PSN'] = df_train_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_AGE'] = df_train_cat['P_AGE'].astype('float64')
df_train_cat['P_ISEV'] = df_train_cat['P_ISEV'].astype('int')

print(df_train_cat.info())

# convert to the correct type
df_test_cat['C_MNTH'] = df_test_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_WDAY'] = df_test_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_HOUR'] = df_test_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_VEHS'] = df_test_cat['C_VEHS'].astype(CategoricalDtype(ordered=True))
#df_test_cat['V_YEAR'] = df_test_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_PSN'] = df_test_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_AGE'] = df_test_cat['P_AGE'].astype('float64')
df_test_cat['P_ISEV'] = df_test_cat['P_ISEV'].astype('int')
print(df_test_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3361538 entries, 0 to 3361537
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     float64
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(13), float64(1), int32(1)
memory usage: 80.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235072 entries, 0 to 1235071
Data columns (total 15 columns):
C_MNTH    1235072 non-null category
C_WDAY    1235072 non-null category
C_HOUR    1235072 non-null category
C_VEHS    1235072 non-null category
C_CONF    1235072 non-null category
C_RCFG    1235072 non-null category
C_WTHR    1235072 non-null category
C_RSUR    1235072 non-null category
C_RALN    1235072 non-null category
C_TRAF    1235072 non-null category
P_SEX     1235072 non-null category
P_AGE     1235072 non-null 

In [39]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1235072
Number of Rows in train data: 3361538


In [40]:
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

In [41]:
# split data into X and y
X_train = df_train_cat.iloc[:,0:18]
Y_train = df_train_cat.iloc[:,-1]

# split data into X and y
X_test = df_test_cat.iloc[:,0:18]
Y_test = df_test_cat.iloc[:,-1]

In [42]:
#Split between data and class for training
#Y_train = df_train_cat[df_train_cat.columns[-1]]
#X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

#Y_test = df_test_cat[df_test_cat.columns[-1]]
#X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [44]:
print(Y_train.unique())
print(Y_test.unique())

[1 0]
[1 0]


In [16]:
print(X_train.head(3))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0      12       7       4       3      51       1       1       1       1   
1      11       6       3       3      21       2       1       1       2   
2       2       5       1       2       1       1       1       1       3   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       1      1      0      1       1       1  
1       1      0      0      2       2       0  
2       7      1      0      1       2       0  


In [17]:
#seed = 10
#test_size = 0.33
#X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)

In [45]:
# fit model no training data
if (enable_model_xgboost):
    t_ =  time.time()
    print(time.asctime( time.localtime(t_) ))
    
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)
    
    model = xgboost.XGBClassifier(silent=False, n_jobs=10)
    #model.fit(X_train, Y_train)
    model.fit(nX_train, nY_train)
    
    # save model to file
    pickle.dump(model, open(file_xgboost, "wb"))
    
    t_ =  time.time()
    print(time.asctime( time.localtime(t_) ))

Fri Dec  7 09:18:57 2018
[09:18:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:18:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:18:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pr

[09:19:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[09:19:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[

In [19]:
X_test.isnull().sum()

C_MNTH    0
C_WDAY    0
C_HOUR    0
C_VEHS    0
C_CONF    0
C_RCFG    0
C_WTHR    0
C_RSUR    0
C_RALN    0
C_TRAF    0
P_SEX     0
P_AGE     0
P_PSN     0
P_USER    0
P_ISEV    0
dtype: int64

### Predict XGBoost

In [46]:
nX_test = np.array(X_test)
nY_test = np.array(Y_test)
print(nX_test)

[[5 2 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [7 2 3 ... 1 1 1]
 ...
 [2 1 3 ... 1 1 1]
 [9 5 4 ... 1 2 0]
 [6 7 4 ... 1 2 0]]


In [47]:
#predictions for test data
if (predict_xgboost):
    # load model from file
    loaded_model = pickle.load(open(file_xgboost, "rb"))
    
    # make predictions for test data
    y_pred = loaded_model.predict(nX_test)
    print(y_pred)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(nY_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

  if diff:


[1 1 1 ... 1 0 0]
Accuracy: 100.00%


In [22]:
type(X_test)

pandas.core.frame.DataFrame

## Bagging Ensemble

### Random Forest

In [26]:
df_test_cat = df_test_cat.astype('category')
df_train_cat = df_train_cat.astype('category')

# convert to the correct type
df_train_cat['C_MNTH'] = df_train_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_WDAY'] = df_train_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_HOUR'] = df_train_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_VEHS'] = df_train_cat['C_VEHS'].astype(CategoricalDtype(ordered=True))
#df_train_cat['V_YEAR'] = df_train_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_PSN'] = df_train_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_AGE'] = df_train_cat['P_AGE'].astype('float64')
df_train_cat['P_ISEV'] = df_train_cat['P_ISEV'].astype('int')

print(df_train_cat.info())

# convert to the correct type
df_test_cat['C_MNTH'] = df_test_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_WDAY'] = df_test_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_HOUR'] = df_test_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_VEHS'] = df_test_cat['C_VEHS'].astype(CategoricalDtype(ordered=True))
#df_test_cat['V_YEAR'] = df_test_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_PSN'] = df_test_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_AGE'] = df_test_cat['P_AGE'].astype('float64')
df_test_cat['P_ISEV'] = df_test_cat['P_ISEV'].astype('int')
print(df_test_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3361538 entries, 0 to 3361537
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     float64
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(13), float64(1), int32(1)
memory usage: 80.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235072 entries, 0 to 1235071
Data columns (total 15 columns):
C_MNTH    1235072 non-null category
C_WDAY    1235072 non-null category
C_HOUR    1235072 non-null category
C_VEHS    1235072 non-null category
C_CONF    1235072 non-null category
C_RCFG    1235072 non-null category
C_WTHR    1235072 non-null category
C_RSUR    1235072 non-null category
C_RALN    1235072 non-null category
C_TRAF    1235072 non-null category
P_SEX     1235072 non-null category
P_AGE     1235072 non-null 

In [27]:
if (enable_model_randomForest):
    print("Ensemble (Bagging): Random Forest: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    forest = RandomForestClassifier(criterion='entropy', n_estimators=50, random_state=0, n_jobs=10, verbose=verbose_level)
    print("Ensemble (Bagging): Random Forest: Fit")
    forest.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(model, open(file_random_forest, "wb"))
    
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    print("Ensemble (Bagging): Random Forest: End")

Ensemble (Bagging): Random Forest: Start
Fri Dec  7 08:24:29 2018
Ensemble (Bagging): Random Forest: Fit


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   21.6s


Fri Dec  7 08:25:03 2018
Ensemble (Bagging): Random Forest: End


[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:   33.3s finished


In [28]:
#predictions for test data
if (predict_randomForest):
    
    # load model from file
    loaded_model = pickle.load(open(file_random_forest, "rb"))
    
    print("Ensemble (Bagging): Random Forest: Predict")
    y_pred = forest.predict(X_test)
    
    print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(forest.score(X_train, Y_train)))
    print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))
    
    print("Ensemble (Bagging): Random Forest: Confusion Matrix")
    cnf_matrix_rf = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_rf)
    
    print("Ensemble (Bagging): Random Forest: Classification Report")
    print(classification_report(Y_test,y_pred))

Ensemble (Bagging): Random Forest: Predict


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    1.1s finished
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.9s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    3.0s finished


Accuracy of RandomForest classifier on train set: 1.00


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    1.1s finished


Accuracy of RandomForest classifier on test set: 1.00
Ensemble (Bagging): Random Forest: Confusion Matrix
[[514742      0]
 [     0 720330]]
Ensemble (Bagging): Random Forest: Classification Report
             precision    recall  f1-score   support

          0       1.00      1.00      1.00    514742
          1       1.00      1.00      1.00    720330

avg / total       1.00      1.00      1.00   1235072

