# Ensemble Classifiers

## Import required libraries

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25
import time
import pickle # allows for model to be saved/load to file
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.api.types import CategoricalDtype
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
def convert_type(data):
    """Return a copy of ``data`` cast to the dtypes used by this notebook.

    Every column is first cast to ``category``. The columns listed in
    ``ordered_columns`` are then re-cast as *ordered* categoricals, and the
    target column ``P_ISEV`` is cast to ``int``. The frame passed in is not
    modified, because ``astype`` returns a new frame.
    """
    ordered_columns = ('C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'P_AGE', 'P_PSN')

    typed = data.astype('category')
    for column in ordered_columns:
        typed[column] = typed[column].astype(CategoricalDtype(ordered=True))

    # The class label stays numeric so it can be used directly as a target
    typed['P_ISEV'] = typed['P_ISEV'].astype('int')
    return typed

## Enable Algorithm Options

In [3]:
# Switches that enable the training step of each algorithm
enable_model_xgboost = True
enable_model_randomForest = True
enable_multiclass_model = True
enable_mlp = True

# Switches that enable the prediction / evaluation step of each algorithm
predict_xgboost = True
predict_randomForest = True
predict_mlp = True

# Debug: verbosity level forwarded to the estimators
verbose_level = 1
# Multiclass classification when True, binary when False
multiclass = False
# Train on the "_O_" file when True, on the "_U_" file when False
over_sample = True

# Class labels, in the order used for confusion matrices and reports
labels = [2, 1, 0] if multiclass else [1, 0]

# Data files: the "_M_" set is used for multiclass, the "_B_" set for binary
if multiclass:
    inputfile_train_O, inputfile_train_U, inputfile_test = (
        'CKME136X10_2018_Data_CTFB_M_O_Train.csv',
        'CKME136X10_2018_Data_CTFB_M_U_Train.csv',
        'CKME136X10_2018_Data_CTFB_M_Test.csv',
    )
else:
    inputfile_train_O, inputfile_train_U, inputfile_test = (
        'CKME136X10_2018_Data_CTFB_B_O_Train.csv',
        'CKME136X10_2018_Data_CTFB_B_U_Train.csv',
        'CKME136X10_2018_Data_CTFB_B_Test.csv',
    )

datafile_train = inputfile_train_O if over_sample else inputfile_train_U
datafile_test = inputfile_test

# Iteration cap passed to the MLP classifiers
model_max_iter = 1000
# Run tag embedded in every stored model file name
datestr = 'dec_10_binary_run_1000_BO_ensamble'

# Model file names for storage
file_random_forest = ''.join(('random_forest_', datestr, '.model'))
file_xgboost = ''.join(('xgboost_', datestr, '.model'))
file_mlp = ''.join(('mlp_', datestr, '.model'))

## Boosting

### XGBoost

In [4]:
# Load the pre-split test and training sets chosen in the options cell.
# NOTE(review): engine='python' selects pandas' pure-Python CSV parser rather
# than the default C parser — confirm it is actually required for these files.
df_test = pd.read_csv(datafile_test, engine = 'python')
df_train = pd.read_csv(datafile_train, engine = 'python')
# NOTE(review): `df` is a full copy of the training frame and is not read by
# any other visible cell — confirm it is still needed.
df = df_train.copy()

# Preview the first two rows of each frame
print(df_test.head(2))
print(df_train.head(2))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       7       1       2       2      22       1       3       2       1   
1       6       1       4       3      31       1       1       1       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       7      2      2      1       1       0  
1       7      1      4      1       2       1  
   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       2       1       2       1       2       4       1       1       2   
1       7       7       3       2      21       2       3       2       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       7      1      4      3       3       1  
1       1      2      3      1       1       0  


In [5]:
# Report the (rows, columns) shape of the training and test frames
print('Train data set: {}'.format(df_train.shape))
print('Test data set: {}'.format(df_test.shape))

Train data set: (4986688, 15)
Test data set: (481616, 15)


In [6]:
# split data into X and y
#X = df.iloc[:,0:16]
#Y = df.iloc[:,-1]

In [7]:
#type(X)

In [8]:
#type(Y)

In [9]:
# Sanity check: total count of missing values over all columns of each frame
# (per-column sums, then summed again into a single number)
print(df_test.isnull().sum().sum())
print(df_train.isnull().sum().sum())

0
0


In [10]:
# Sanity check: count the cells whose string form contains any character
# other than 0-9 (regex '[^0-9]'). A signed or decimal value would also be
# counted, because '-' and '.' are not digits.
print(df_test[df_test.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
print(df_train[df_train.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0
0


In [11]:
# Cast both frames to integers first, working on new objects so the frames
# read from CSV stay untouched.
# NOTE(review): astype() is documented to return a new object by default, so
# the extra .copy() is presumably redundant — confirm before removing.
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

# Convert to the categorical dtypes defined by convert_type, then show the
# resulting schema. info() prints its report itself and returns None, which
# is why the output ends with a literal "None" line.
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481616 entries, 0 to 481615
Data columns (total 15 columns):
C_MNTH    481616 non-null category
C_WDAY    481616 non-null category
C_HOUR    481616 non-null category
C_VEHS    481616 non-null category
C_CONF    481616 non-null category
C_RCFG    481616 non-null category
C_WTHR    481616 non-null category
C_RSUR    481616 non-null category
C_RALN    481616 non-null category
C_TRAF    481616 non-null category
P_SEX     481616 non-null category
P_AGE     481616 non-null category
P_PSN     481616 non-null category
P_USER    481616 non-null category
P_ISEV    481616 non-null int32
dtypes: category(14), int32(1)
memory usage: 8.3 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986688 entries, 0 to 4986687
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX  

In [12]:
# Row counts of the test and training frames, taken from the index size
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

# Combined size of both splits
print("Total Number of Rows : {}".format(total_test_Rows + total_train_Rows))

Number of Rows in test data: 481616
Number of Rows in train data: 4986688
Total Number of Rows : 5468304


In [13]:
# Rebuild both working frames as plain integers. This overwrites the
# categorical frames produced earlier under the same names, so the feature
# matrices sliced from them below are integer-typed.
df_test_cat = df_test.astype('int').copy()
df_train_cat = df_train.astype('int').copy()

In [14]:
# Training split: the first 14 columns are the features (X), the last
# column is the class label (Y)
X_train = df_train_cat.iloc[:,0:14]
Y_train = df_train_cat.iloc[:,-1]

# Test split: same column layout as the training frame
X_test = df_test_cat.iloc[:,0:14]
Y_test = df_test_cat.iloc[:,-1]

In [15]:
#Split between data and class for training
#Y_train = df_train_cat[df_train_cat.columns[-1]]
#X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

#Y_test = df_test_cat[df_test_cat.columns[-1]]
#X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [16]:
# Show the distinct class labels present in the training target
print(Y_train.unique())

[1 0]


In [17]:
# Preview the first ten rows of the training and test feature matrices
print(X_train.head(10))
print(X_test.head(10))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       2       1       2       1       2       4       1       1       2   
1       7       7       3       2      21       2       3       2       1   
2       6       6       3       3      21       2       1       1       1   
3       7       3       5       2      36       3       1       1       1   
4       6       2       4       2      21       2       1       1       1   
5       8       4       5       2      21       2       1       1       1   
6       6       5       2       2      36       2       1       1       2   
7       4       1       4       2      21       1       2       1       1   
8       7       1       1       2      21       1       1       1       1   
9       6       3       1       2      33       2       3       2       2   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  
0       7      1      4      3       3  
1       1      2      3      1       1  
2       1      1      5      

In [18]:
# Fit the XGBoost model on the training data and store it on disk
if enable_model_xgboost:
    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

    # The classifier is trained on plain NumPy arrays built from the frames
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)

    model = xgboost.XGBClassifier(silent=False, learning_rate = 0.3, n_jobs=10)

    print(model)

    model.fit(nX_train, nY_train)

    # Save the fitted model to file. The context manager closes the handle as
    # soon as the dump finishes, so the file is complete on disk before the
    # prediction cell reads it back.
    with open(file_xgboost, "wb") as model_file:
        pickle.dump(model, model_file)

    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

Tue Dec 11 17:51:22 2018
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=10, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=False, subsample=1)
[17:51:23] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
[17:51:24] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:51:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:51:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:51:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra 

[17:52:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:15] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:16] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[17:52:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

In [19]:
# Per-column count of missing values in the test features
X_test.isnull().sum()

C_MNTH    0
C_WDAY    0
C_HOUR    0
C_VEHS    0
C_CONF    0
C_RCFG    0
C_WTHR    0
C_RSUR    0
C_RALN    0
C_TRAF    0
P_SEX     0
P_AGE     0
P_PSN     0
P_USER    0
dtype: int64

### Predict XGBoost

In [20]:
# NumPy versions of the test features and labels, used by the XGBoost
# prediction step below
nX_test = np.array(X_test)
nY_test = np.array(Y_test)
print(nX_test)

[[7 1 2 ... 2 1 1]
 [6 1 4 ... 4 1 2]
 [7 4 4 ... 1 2 2]
 ...
 [9 3 3 ... 3 1 1]
 [6 3 5 ... 3 1 1]
 [9 1 3 ... 4 1 1]]


In [21]:
# Evaluate the stored XGBoost model on the training and test sets
if predict_xgboost:
    # Load the model from file. NOTE: unpickling can execute arbitrary code,
    # so only load model files written by this notebook.
    with open(file_xgboost, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # Build the training arrays here so this cell also works when the
    # training cell was skipped (it defines them only when training is enabled)
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)

    # make predictions for test data
    y_pred = loaded_model.predict(nX_test)

    print('Accuracy of XGB classifier on train set: {:.2f}'.format(loaded_model.score(nX_train, nY_train)))
    # Score the predictions already computed instead of predicting the whole
    # test set a second time through score()
    print('Accuracy of XGB classifier on test set: {:.2f}'.format(accuracy_score(nY_test, y_pred)))

    print()
    print("XGBoost: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(nY_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("XGBoost: Classification Report")
    print(classification_report(nY_test, y_pred, labels=labels))
    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(nY_test, y_pred, average='weighted')
    print(gmean)
    print()

  if diff:
  if diff:


Accuracy of XGB classifier on train set: 0.67


  if diff:


Accuracy of XGB classifier on test set: 0.66

XGBoost: Confusion Matrix
[[173049 103990]
 [ 57801 146776]]

XGBoost: Classification Report
             precision    recall  f1-score   support

          1       0.75      0.62      0.68    277039
          0       0.59      0.72      0.64    204577

avg / total       0.68      0.66      0.67    481616


Weighted Geometric Mean
0.6710129639182502



In [22]:
# Debugging aid: confirm that X_test is still a pandas DataFrame
type(X_test)

pandas.core.frame.DataFrame

In [23]:
# Optional exhaustive grid search over MLP hyper-parameters.
# Disabled by default; set Grid_mlp to True to run it.
Grid_mlp = False
if Grid_mlp:
    mlp = MLPClassifier(max_iter=100)

    param_grid = {
    'hidden_layer_sizes': [(25, 25, 25), (50,50,50), (50,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive']
    }

    # Instantiate the grid search model (library-default cross-validation)
    grid_search = GridSearchCV(estimator = mlp, param_grid = param_grid, n_jobs = -1, verbose = 3)

    # Fit the grid search to the data
    grid_search.fit(X_train, Y_train)
    # A bare expression inside an if-block is not displayed, so print it
    print(grid_search.best_params_)

    # Score the best estimator on the held-out test set. accuracy_score is
    # used because the notebook defines neither an `evaluate` helper nor a
    # `base_accuracy` baseline to compare against.
    best_grid = grid_search.best_estimator_
    grid_accuracy = accuracy_score(Y_test, best_grid.predict(X_test))

    print('Accuracy of tuned MLP classifier on test set: {:.2f}'.format(grid_accuracy))


In [24]:
# Optional randomized search over MLP hyper-parameters.
# Disabled by default; set RamdomMLP to True to run it.
RamdomMLP = False
if RamdomMLP:
    # Create the random grid
    random_grid = {
        'hidden_layer_sizes': [(25, 25, 25), (50,50,50)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive']
    }
    print(random_grid)

    # The grid lists MLPClassifier parameters, so the base model to tune must
    # be an MLP; a RandomForestClassifier accepts none of these names.
    mlp_base = MLPClassifier(max_iter=100)

    # Every grid entry is a finite list, so only this many distinct
    # candidates exist; never ask the sampler for more than that.
    grid_size = int(np.prod([len(options) for options in random_grid.values()]))

    # Random search of parameters using 2-fold cross validation, sampling up
    # to 100 combinations and using all available cores. The result keeps the
    # name `rf_random` for compatibility with the original cell.
    rf_random = RandomizedSearchCV(estimator = mlp_base, param_distributions = random_grid, n_iter = min(100, grid_size), cv = 2, verbose=3, random_state=42, n_jobs = -1)

    # Fit the random search model
    rf_random.fit(X_train, Y_train)

In [25]:
# Optional exhaustive grid search over Random Forest hyper-parameters.
# Disabled by default; set Grid_RF to True to run it.
Grid_RF = False
if Grid_RF:
    # Create the parameter grid based on the results of random search
    param_grid = {
        'bootstrap': [True],
        'max_depth': [80, 90, 100, 110],
        'max_features': [2, 3],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [8, 10, 12],
        'n_estimators': [100, 200, 300, 1000]
        }
    # The grid lists RandomForestClassifier parameters, so the base model
    # must be a random forest; an MLPClassifier accepts none of these names.
    rf_base = RandomForestClassifier()

    # Instantiate the grid search model. Cross-validation needs at least two
    # folds, so cv=1 is not usable; three folds are used here.
    grid_search = GridSearchCV(estimator = rf_base, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 3)

    # Fit the grid search to the data
    grid_search.fit(X_train, Y_train)
    # A bare expression inside an if-block is not displayed, so print it
    print(grid_search.best_params_)

    # Score the best estimator on the held-out test set. accuracy_score is
    # used because the notebook defines neither an `evaluate` helper nor a
    # `base_accuracy` baseline to compare against.
    best_grid = grid_search.best_estimator_
    grid_accuracy = accuracy_score(Y_test, best_grid.predict(X_test))

    print('Accuracy of tuned RandomForest classifier on test set: {:.2f}'.format(grid_accuracy))


In [26]:
#code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Optional randomized hyper-parameter search for the Random Forest.
# Disabled by default; set HPTune to True to run it.
HPTune = False
if HPTune:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

    # Number of features to consider at every split
    # NOTE(review): 'auto' is only accepted by older scikit-learn releases —
    # confirm against the installed version.
    max_features = ['auto', 'sqrt']

    # Maximum number of levels in tree (None = unlimited depth)
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)

    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]

    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap
                  }
    print(random_grid)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()

    # Random search of parameters, using 2 fold cross validation,
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=3, random_state=42, n_jobs = -1)

    # Fit the random search model
    rf_random.fit(X_train, Y_train)

    # Best parameters found for the Random Forest
    best_par = rf_random.best_params_
    print(best_par)

    # Score the refitted best model on the test set. The parameter dict
    # itself cannot predict, so the fitted estimator is what gets evaluated.
    rf_accuracy = accuracy_score(Y_test, rf_random.best_estimator_.predict(X_test))
    print('Accuracy of tuned RandomForest classifier on test set: {:.2f}'.format(rf_accuracy))

## Bagging Ensemble

### Random Forest

In [27]:
# Convert the working frames to the categorical dtypes once more and show
# the resulting schema (info() prints its report and returns None).
# NOTE(review): X_train / X_test were sliced from the integer frames earlier
# and are not rebuilt after this cell, so the models below presumably still
# see the integer features — confirm that this is intended.
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481616 entries, 0 to 481615
Data columns (total 15 columns):
C_MNTH    481616 non-null category
C_WDAY    481616 non-null category
C_HOUR    481616 non-null category
C_VEHS    481616 non-null category
C_CONF    481616 non-null category
C_RCFG    481616 non-null category
C_WTHR    481616 non-null category
C_RSUR    481616 non-null category
C_RALN    481616 non-null category
C_TRAF    481616 non-null category
P_SEX     481616 non-null category
P_AGE     481616 non-null category
P_PSN     481616 non-null category
P_USER    481616 non-null category
P_ISEV    481616 non-null int32
dtypes: category(14), int32(1)
memory usage: 8.3 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4986688 entries, 0 to 4986687
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX  

In [28]:
# Train the Random Forest (bagging ensemble) and store the fitted model on disk
if enable_model_randomForest:
    print("Ensemble (Bagging): Random Forest: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    forest = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0, n_jobs=-1, verbose=verbose_level)
    print()
    print(forest)

    print()
    print("Ensemble (Bagging): Random Forest: Fit")
    forest.fit(X_train, Y_train)

    # Save the model to file only after fitting: an unfitted model on disk
    # would be loaded by the prediction cell if fit() were interrupted.
    # The context manager closes the handle once the dump is complete.
    with open(file_random_forest, "wb") as model_file:
        pickle.dump(forest, model_file)

    # Keep the end time in its own variable so t_start still holds the start
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("Ensemble (Bagging): Random Forest: End")

Ensemble (Bagging): Random Forest: Start
Tue Dec 11 17:52:39 2018

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=1, warm_start=False)

Ensemble (Bagging): Random Forest: Fit


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.4min finished


Tue Dec 11 18:00:18 2018
Ensemble (Bagging): Random Forest: End


In [29]:
# Evaluate the stored Random Forest model on the training and test sets
if predict_randomForest:

    # Load the model from file. NOTE: unpickling can execute arbitrary code,
    # so only load model files written by this notebook.
    with open(file_random_forest, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    print("Ensemble (Bagging): Random Forest: Predict")
    # Predict with the model that was just loaded rather than the in-memory
    # `forest`, so this cell also works when the training cell was skipped
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    # Score the predictions already computed instead of predicting the whole
    # test set a second time through score()
    print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(accuracy_score(Y_test, y_pred)))

    print("Ensemble (Bagging): Random Forest: Confusion Matrix")
    cnf_matrix_rf = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_rf)

    print("Ensemble (Bagging): Random Forest: Classification Report")
    print(classification_report(Y_test,y_pred, labels=labels))

    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(Y_test, y_pred, average='weighted')
    print(gmean)
    print()

Ensemble (Bagging): Random Forest: Predict


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    7.6s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   29.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:  1.3min finished


Accuracy of RandomForest classifier on train set: 0.84


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    7.4s finished


Accuracy of RandomForest classifier on test set: 0.64
Ensemble (Bagging): Random Forest: Confusion Matrix
[[176534 100505]
 [ 74568 130009]]
Ensemble (Bagging): Random Forest: Classification Report
             precision    recall  f1-score   support

          1       0.70      0.64      0.67    277039
          0       0.56      0.64      0.60    204577

avg / total       0.64      0.64      0.64    481616


Weighted Geometric Mean
0.6363593318162607



In [30]:
##MLP

In [31]:
# Train the multilayer perceptron and store the fitted model on disk
if enable_mlp:
    print("Multilayer Preceptron: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    # Three hidden layers of 50 units with tanh activation.
    # NOTE(review): no random_state is passed, so results can differ
    # between runs.
    mlpc = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50, 50), verbose=verbose_level, max_iter=model_max_iter, tol = 0.0001)
    print(mlpc)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)

    # Save the model to file; the context manager closes the handle once the
    # dump is complete
    with open(file_mlp, "wb") as model_file:
        pickle.dump(mlpc, model_file)

# Evaluate the stored multilayer perceptron on the training and test sets
if predict_mlp:

    # Load the model from file. NOTE: unpickling can execute arbitrary code,
    # so only load model files written by this notebook.
    with open(file_mlp, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Multilayer Preceptron: Predict")
    # Predict with the model that was just loaded rather than the in-memory
    # `mlpc`, so this branch also works when training is disabled
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    # Score the predictions already computed instead of predicting the whole
    # test set a second time through score()
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(accuracy_score(Y_test, y_pred)))
    print()
    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_mlp)
    print()
    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    # Blank separator line; print must be called, `(print)` alone does nothing
    print()
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Tue Dec 11 18:02:53 2018
MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=1, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.60519821
Iteration 2, loss = 0.59852458
Iteration 3, loss = 0.59706963
Iteration 4, loss = 0.59629244
Iteration 5, loss = 0.59566965
Iteration 6, loss = 0.59522384
Iteration 7, loss = 0.59487311
Iteration 8, loss = 0.59464032
Iteration 9, loss = 0.59445995
Iteration 10, loss = 0.59427484
Iteration 11, loss = 0.59414259
Iteration 12, loss = 0.59400456
Iteration 13, loss = 0.59392907
Iteration 14, loss = 0.59381274
Iteration 15, loss = 0.59364204
Iteration 1

### Voting Classifier

In [32]:
# Voting Ensemble for Classification: combines freshly constructed MLP,
# XGBoost and Random Forest models (the VotingClassifier fits all three)
voting = True
if voting:
    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

    # The ensemble is trained and evaluated on plain NumPy arrays
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)
    nX_test = np.array(X_test)
    nY_test = np.array(Y_test)

    # (name, estimator) pairs, configured like the stand-alone models above
    estimators = []
    model1 = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50, 50), verbose=verbose_level, max_iter=model_max_iter, tol = 0.0001)
    estimators.append(('MLP', model1))
    model2 = xgboost.XGBClassifier(learning_rate = 0.3, silent=False, n_jobs=-1)
    estimators.append(('XGB', model2))
    model3 = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0, n_jobs=-1, verbose=verbose_level)
    estimators.append(('RandomForest', model3))

    print(model1)
    print(model2)
    print(model3)

    # create the ensemble model (no voting mode is passed, so the library
    # default applies)
    ensemble = VotingClassifier(estimators)
    ensemble.fit(nX_train, nY_train)
    y_pred = ensemble.predict(nX_test)

    print('Accuracy of Voting classifier on train set: {:.2f}'.format(ensemble.score(nX_train, nY_train)))
    # Score the predictions already computed; score() would run all three
    # base models over the whole test set a second time
    print('Accuracy of Voting classifier on test set: {:.2f}'.format(accuracy_score(nY_test, y_pred)))

    print()
    print("Voting: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(nY_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("Voting: Classification Report")
    print(classification_report(nY_test, y_pred, labels=labels))
    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(nY_test, y_pred, average='weighted')
    print(gmean)
    print()
    # End-of-run timestamp
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

Tue Dec 11 18:09:26 2018
MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=1, warm_start=False)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=False, subsample=1)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,

[18:16:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:45] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[18:16:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.4min finished
  if diff:
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    7.3s finished
  if diff:
  if diff:
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   29.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:  1.3min finished
  if diff:


Accuracy of Voting classifier on train set: 0.69


  if diff:
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    7.3s finished
  if diff:


Accuracy of Voting classifier on test set: 0.66

Voting: Confusion Matrix
[[171583 105456]
 [ 56178 148399]]

Voting: Classification Report
             precision    recall  f1-score   support

          1       0.75      0.62      0.68    277039
          0       0.58      0.73      0.65    204577

avg / total       0.68      0.66      0.67    481616


Weighted Geometric Mean
0.6723228507223615

Tue Dec 11 18:24:38 2018
