# Ensemble Classifiers

## Import required libraries

In [None]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 25
import time
import pickle # allows for model to be saved/load to file
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from pandas.api.types import CategoricalDtype
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
# this function converts the data frame to the appropriate data type
def convert_type(data):
    """Return a categorical version of *data* with selected columns ordered.

    Every column is first cast to the pandas ``category`` dtype. The columns
    named in ``ordered_columns`` are then re-cast as *ordered* categoricals
    (keeping their existing categories), and ``P_ISEV`` is cast to ``int``.

    ``astype`` builds a new frame, so the frame passed in is not modified.
    """
    ordered_columns = ('C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'P_AGE', 'P_PSN')

    converted = data.astype('category')
    for column in ordered_columns:
        converted[column] = converted[column].astype(CategoricalDtype(ordered=True))
    converted['P_ISEV'] = converted['P_ISEV'].astype('int')
    return converted

## Enable Algorithm Options

In [None]:
# --- Which models to train and which to evaluate ---
enable_model_xgboost = True
enable_model_randomForest = True
enable_multiclass_model = True

predict_xgboost = True
predict_randomForest = True

# --- Run options ---
# Debug verbosity; passed as `verbose=` to the scikit-learn estimators below.
verbose_level = 1
# Multiclass classification when True, binary when False.
multiclass = False
# Train on the "_O_" file when True, on the "_U_" file when False.
over_sample = True

# --- Class labels and data files for the selected problem type ---
#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    labels = [2, 1, 0]
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    labels = [1, 0]
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

datafile_train = inputfile_train_O if over_sample else inputfile_train_U
datafile_test = inputfile_test

#file_input = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'

# Iteration cap handed to the MLP in the voting ensemble.
model_max_iter = 100
# Run tag embedded in the stored model file names.
datestr = 'dec_09_binary_run_1000_BO_ensamble'

# Model file names for storage
file_random_forest = 'random_forest_' + datestr + '.model'
file_xgboost = 'xgboost_' + datestr + '.model'

## Boosting

### XGBoost

In [None]:
#df = pd.read_csv(file_input, engine = 'python')

# Read the test set and the selected training set from their CSV files.
df_test = pd.read_csv(datafile_test, engine='python')
df_train = pd.read_csv(datafile_train, engine='python')

# Independent working copy of the training frame.
df = df_train.copy()

# Preview the first two rows of each frame (test first, then train).
for _frame in (df_test, df_train):
    print(_frame.head(2))

In [None]:
# Report the (rows, columns) shape of each data set.
for _template, _frame in (('Train data set: {}', df_train),
                          ('Test data set: {}', df_test)):
    print(_template.format(_frame.shape))

In [None]:
# split data into X and y
#X = df.iloc[:,0:16]
#Y = df.iloc[:,-1]

In [None]:
#type(X)

In [None]:
#type(Y)

In [None]:
# Total number of missing cells in each frame (test first, then train).
for _frame in (df_test, df_train):
    print(_frame.isnull().sum().sum())

In [None]:
# Count the cells whose text form contains any character other than the
# digits 0-9 (test first, then train). Signs and decimal points count too.
for _frame in (df_test, df_train):
    _has_non_digit = _frame.apply(lambda column: column.astype(str).str.contains('[^0-9]'))
    print(_has_non_digit.sum().sum())

In [None]:
# Cast every column to int, then apply the categorical typing defined in
# convert_type. astype() already returns a new frame, so no extra copy is
# needed.
#
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None".
df_test_cat = convert_type(df_test.astype('int'))
df_test_cat.info()

df_train_cat = convert_type(df_train.astype('int'))
df_train_cat.info()

In [None]:
# Row counts of the test and train frames, followed by their total.
total_test_Rows = len(df_test_cat.index)
total_train_Rows = len(df_train_cat.index)

print("Number of Rows in test data: {}".format(total_test_Rows))
print("Number of Rows in train data: {}".format(total_train_Rows))
print("Total Number of Rows : {}".format(total_test_Rows + total_train_Rows))

In [None]:
# Rebind both working frames to plain integer copies of the raw data. This
# replaces the category-typed frames built in the earlier conversion cell.
df_test_cat, df_train_cat = (
    _frame.astype('int').copy() for _frame in (df_test, df_train)
)

In [None]:
# Number of leading training rows used for fitting. The same limit has to be
# applied to the features AND the target: previously only X_train was cut to
# 100000 rows, so Y_train was longer whenever the training file held more
# rows, and fit() rejects inputs with inconsistent sample counts.
train_row_limit = 100000

# split training data into X (first 14 columns) and y (last column)
X_train = df_train_cat.iloc[0:train_row_limit, 0:14]
Y_train = df_train_cat.iloc[0:train_row_limit, -1]

# split test data into X (first 14 columns) and y (last column)
X_test = df_test_cat.iloc[:, 0:14]
Y_test = df_test_cat.iloc[:, -1]

In [None]:
#Split between data and class for training
#Y_train = df_train_cat[df_train_cat.columns[-1]]
#X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

#Y_test = df_test_cat[df_test_cat.columns[-1]]
#X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [None]:
# Show the distinct values present in the training target column.
print(Y_train.unique())

In [None]:
# Preview the first ten feature rows of the train and test splits.
for _features in (X_train, X_test):
    print(_features.head(10))

In [None]:
# Fit the XGBoost model on the training data and persist it to disk.
if enable_model_xgboost:
    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

    # The model is trained on plain NumPy arrays.
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)

    # NOTE(review): newer xgboost releases replace `silent` with `verbosity`;
    # confirm against the installed version.
    model = xgboost.XGBClassifier(silent=False, n_jobs=10)

    print(model)

    model.fit(nX_train, nY_train)

    # Save the model to file. The context manager flushes and closes the
    # handle even if pickling fails; the previous bare open() leaked it.
    with open(file_xgboost, "wb") as model_file:
        pickle.dump(model, model_file)

    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

In [None]:
# Per-column count of missing values in the test features (cell output).
X_test.isnull().sum()

### Predict XGBoost

In [None]:
# NumPy copies of the test features and labels, used by the array-based models.
nX_test, nY_test = np.array(X_test), np.array(Y_test)
print(nX_test)

In [None]:
# Evaluate the persisted XGBoost model on the train and test data.
if predict_xgboost:
    # Load the model from file. Only unpickle files this notebook wrote
    # itself: pickle can execute arbitrary code from untrusted data.
    with open(file_xgboost, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # Build the training arrays here instead of relying on the fit cell, which
    # only defines them when enable_model_xgboost is True.
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)

    # make predictions for test data
    y_pred = loaded_model.predict(nX_test)

    # Score the XGBoost model itself. The previous code called
    # ensemble.score(), but `ensemble` is the voting classifier created in a
    # later cell, so this either raised NameError or reported the wrong model.
    print('Accuracy of XGB classifier on train set: {:.2f}'.format(loaded_model.score(nX_train, nY_train)))
    print('Accuracy of XGB classifier on test set: {:.2f}'.format(loaded_model.score(nX_test, nY_test)))

    print()
    print("XGBoost: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(nY_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("XGBoost: Classification Report")
    print(classification_report(nY_test, y_pred, labels=labels))
    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(nY_test, y_pred, average='weighted')
    print(gmean)
    print()

In [None]:
# Display the container type of the test features (cell output).
type(X_test)

In [None]:
# Optional exhaustive grid search over MLP hyper-parameters (off by default).
Grid_mlp = False
if Grid_mlp:
    mlp = MLPClassifier(max_iter=100)

    param_grid = {
        'hidden_layer_sizes': [(25, 25, 25), (50, 50, 50), (50, 100, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive']
    }

    # Instantiate the grid search model (default cross-validation splitting).
    grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=-1, verbose=3)

    # Fit the grid search to the data
    grid_search.fit(X_train, Y_train)
    # Inside an `if` block a bare expression is not displayed; print it.
    print(grid_search.best_params_)

    # Baseline: the untuned estimator. GridSearchCV works on clones, so `mlp`
    # is still unfitted here. The previous code used `evaluate` and
    # `base_accuracy`, neither of which is defined in this notebook.
    base_accuracy = mlp.fit(X_train, Y_train).score(X_test, Y_test)

    best_grid = grid_search.best_estimator_
    grid_accuracy = best_grid.score(X_test, Y_test)

    print('Improvement of {:0.2f}%.'.format(100 * (grid_accuracy - base_accuracy) / base_accuracy))


In [None]:
# Optional randomized search over MLP hyper-parameters (off by default).
RamdomMLP = False
if RamdomMLP:
    # Create the random grid
    random_grid = {
        'hidden_layer_sizes': [(25, 25, 25), (50, 50, 50), (50, 100, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive']
    }
    print(random_grid)

    # Base model to tune. The grid above holds MLP hyper-parameters, so the
    # estimator must be an MLPClassifier; the previous RandomForestClassifier
    # rejects every one of these parameter names.
    mlp = MLPClassifier(max_iter=100)

    # Random search with the default cross-validation splitting, using all
    # available cores. The grid has 3*2*2*2*2 = 48 combinations, fewer than
    # n_iter.
    # NOTE(review): scikit-learn is expected to evaluate all 48 (with a
    # warning) rather than 100 -- confirm for the installed version.
    mlp_random = RandomizedSearchCV(estimator=mlp, param_distributions=random_grid, n_iter=100, verbose=3, random_state=42, n_jobs=-1)

    # Fit the random search model
    mlp_random.fit(X_train, Y_train)

In [None]:
# Optional exhaustive grid search over random-forest hyper-parameters
# (off by default).
Grid_RF = False
if Grid_RF:
    # Create the parameter grid based on the results of random search
    param_grid = {
        'bootstrap': [True],
        'max_depth': [80, 90, 100, 110],
        'max_features': [2, 3],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [8, 10, 12],
        'n_estimators': [100, 200, 300, 1000]
    }

    # Base model. The grid holds random-forest parameters, so the estimator
    # must be a RandomForestClassifier; the previous MLPClassifier rejects
    # every one of these parameter names.
    rf = RandomForestClassifier()

    # Instantiate the grid search model. cv=1 is not a valid k-fold setting
    # (at least 2 splits are required), so the default splitting is used.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, verbose=3)

    # Fit the grid search to the data
    grid_search.fit(X_train, Y_train)
    # Inside an `if` block a bare expression is not displayed; print it.
    print(grid_search.best_params_)

    # Baseline: the untuned estimator. GridSearchCV works on clones, so `rf`
    # is still unfitted here. The previous code used `evaluate` and
    # `base_accuracy`, neither of which is defined in this notebook.
    base_accuracy = rf.fit(X_train, Y_train).score(X_test, Y_test)

    best_grid = grid_search.best_estimator_
    grid_accuracy = best_grid.score(X_test, Y_test)

    print('Improvement of {:0.2f}%.'.format(100 * (grid_accuracy - base_accuracy) / base_accuracy))


In [None]:
#code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Optional randomized search over random-forest hyper-parameters
# (off by default).
HPTune = False
if HPTune:
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

    # Number of features to consider at every split
    # NOTE(review): recent scikit-learn releases no longer accept 'auto' for
    # max_features -- confirm against the installed version.
    max_features = ['auto', 'sqrt']

    # Maximum number of levels in tree (None = unlimited)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)

    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]

    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]

    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }
    print(random_grid)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores.
    # cv must be at least 2; the previous cv=1 is rejected by k-fold splitting
    # and contradicted the "3 fold" intent stated above.
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=3, random_state=42, n_jobs=-1)

    # Fit the random search model
    rf_random.fit(X_train, Y_train)

In [None]:
## Find the best params for random forest.
# rf_random only exists when the HPTune search actually ran, so guard on its
# flag instead of failing with NameError on a default run.
if HPTune:
    best_par = rf_random.best_params_
    print(best_par)

    # Score the refitted best estimator. best_params_ is a plain dict of
    # hyper-parameters and cannot make predictions itself, and the `evaluate`
    # helper used before is not defined in this notebook.
    rf_accuracy = rf_random.best_estimator_.score(X_test, Y_test)
    print('Accuracy of tuned RandomForest classifier on test set: {:.2f}'.format(rf_accuracy))

## Bagging Ensemble

### Random Forest

In [None]:
# convert to the correct type
# NOTE(review): X_train/X_test were sliced from the integer frames in an
# earlier cell and are not rebuilt after this conversion, so the random forest
# below still trains on the integer features -- confirm whether a re-split
# was intended.
#
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None".
df_test_cat = convert_type(df_test_cat)
df_test_cat.info()

df_train_cat = convert_type(df_train_cat)
df_train_cat.info()

In [None]:
# Fit the random forest on the training data and persist it to disk.
if enable_model_randomForest:
    print("Ensemble (Bagging): Random Forest: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    forest = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0, n_jobs=10, verbose=verbose_level)
    print()
    print(forest)
    print()
    print("Ensemble (Bagging): Random Forest: Fit")
    forest.fit(X_train, Y_train)

    # Save the fitted forest to file. The previous code pickled `model`, which
    # is the XGBoost classifier from the boosting section, so the random-forest
    # file never contained a random forest. The context manager also closes
    # the handle, which the bare open() leaked.
    with open(file_random_forest, "wb") as model_file:
        pickle.dump(forest, model_file)

    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    print("Ensemble (Bagging): Random Forest: End")

In [None]:
# Evaluate the persisted random forest on the train and test data.
if predict_randomForest:

    # Load the model from file. Only unpickle files this notebook wrote
    # itself: pickle can execute arbitrary code from untrusted data.
    with open(file_random_forest, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    # Fail loudly if the stored file does not hold a random forest (for
    # example a file written with the wrong estimator) instead of silently
    # reporting another model's results under this heading.
    if not isinstance(loaded_model, RandomForestClassifier):
        raise TypeError(
            "{} holds a {}, not a RandomForestClassifier; re-run the training cell".format(
                file_random_forest, type(loaded_model).__name__))

    # Use the loaded model throughout. The previous code loaded it and then
    # ignored it in favour of the in-memory `forest`, which does not exist
    # when training is disabled.
    print("Ensemble (Bagging): Random Forest: Predict")
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("Ensemble (Bagging): Random Forest: Confusion Matrix")
    cnf_matrix_rf = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_rf)

    print("Ensemble (Bagging): Random Forest: Classification Report")
    print(classification_report(Y_test, y_pred, labels=labels))

    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(Y_test, y_pred, average='weighted')
    print(gmean)
    print()

### Voting Classifier

In [None]:
# Voting Ensemble for Classification: MLP + XGBoost + Random Forest combined
# with VotingClassifier's default voting scheme.
voting = True
if voting:
    t_ = time.time()
    print(time.asctime(time.localtime(t_)))

    # The ensemble is trained and evaluated on plain NumPy arrays.
    nX_train = np.array(X_train)
    nY_train = np.array(Y_train)
    nX_test = np.array(X_test)
    nY_test = np.array(Y_test)

    model1 = MLPClassifier(hidden_layer_sizes=(12, 12, 12), verbose=verbose_level, max_iter=model_max_iter, tol=0.0001)
    model2 = xgboost.XGBClassifier(silent=False, n_jobs=10)
    model3 = RandomForestClassifier(criterion='entropy', n_estimators=100, random_state=0, n_jobs=10, verbose=verbose_level)
    estimators = [
        ('MLP', model1),
        ('XGB', model2),
        ('RandomForest', model3),
    ]

    print(model1)
    print(model2)
    print(model3)

    # create the ensemble model
    ensemble = VotingClassifier(estimators)
    ensemble.fit(nX_train, nY_train)
    y_pred = ensemble.predict(nX_test)

    print('Accuracy of Voting classifier on train set: {:.2f}'.format(ensemble.score(nX_train, nY_train)))
    # Reuse y_pred: score() would predict the whole test set a second time.
    print('Accuracy of Voting classifier on test set: {:.2f}'.format(accuracy_score(nY_test, y_pred)))

    # These headings previously read "XGBoost: ...", a copy-paste from the
    # boosting section that mislabelled the voting ensemble's results.
    print()
    print("Voting Ensemble: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(nY_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("Voting Ensemble: Classification Report")
    print(classification_report(nY_test, y_pred, labels=labels))
    print()
    print("Weighted Geometric Mean")
    gmean = geometric_mean_score(nY_test, y_pred, average='weighted')
    print(gmean)
    print()
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))