# Training

In [16]:
import os
import sys
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn.naive_bayes
import sklearn.model_selection
import sklearn.tree
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing as preprocessing
import sklearn.pipeline
import sklearn.decomposition
import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
from sklearn.utils import resample
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.datasets import make_classification
# need to pip install import_ipynb
import import_ipynb
# need to pip install -U imbalanced-learn
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
from data_preperation import features, labels

# at this point, data should be clean 
print(features.head())


   congress  bill sponsor_party sponsor_state  cosponsors  r_cosponsors  \
0       113     1             R            MI           0             0   
1       113     2             R            NE          15            15   
2       113     3             R            NE         134           132   
3       113     4             R            MI           4             4   
4       113     5             R            MN          12            12   

   d_cosponsors                       subject  withdrawn_cosponsors  \
0             0                      Taxation                     0   
1             0                        Energy                     0   
2             2                        Energy                     1   
3             0  Economics and Public Finance                     0   
4             0                     Education                     0   

   committees  subcommittees  actions  summary_words  
0           1              0        1           2854  
1           

# Feature Engineering 

In [17]:
# instantiate encoder
le = preprocessing.LabelEncoder()

# fit and transform to all columns
features = features.apply(le.fit_transform)
enc = preprocessing.OneHotEncoder()

# fit
enc.fit_transform(features)

features.head()

Unnamed: 0,congress,bill,sponsor_party,sponsor_state,cosponsors,r_cosponsors,d_cosponsors,subject,withdrawn_cosponsors,committees,subcommittees,actions,summary_words
0,0,0,1,24,0,0,0,30,0,1,0,0,933
1,0,1,1,32,15,15,0,11,0,5,2,23,1033
2,0,2,1,32,134,130,2,11,1,3,4,54,492
3,0,3,1,24,4,4,0,8,0,9,1,25,1020
4,0,4,1,25,12,12,0,9,0,3,0,56,1017


# Training

In [18]:
# 80%/20% split of the actual data
features, labels = resample(features, labels, n_samples=5000)
feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)
print(label_train.value_counts())

0    3816
1     184
Name: label, dtype: int64


# Decision Tree

In [19]:
# Decision Tree Classifier without SMOTE
def decision_tree_no_SMOTE(features):
    print("Decision Tree, no SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of decision tree: ", accuracy*100)

    # DOING CROSS VALIDATION 
    # outer loop for CV
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision tree
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10]}
    
    pipeline = Pipeline([('decision_tree', decision_tree)])
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # inner & outer loop
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)
    # prints the accuracy of your decision tree
    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    # Model Analysis: Confusion Matrix
    # running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Decision Tree classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    proba = decision_tree.predict_proba(feat_test)

    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



In [20]:
# Decision Tree Classifier with SMOTE
def decision_tree_SMOTE(features):
    print("Decision Tree, SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of decision tree: ", accuracy*100)

    smt = SMOTE()
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision tree
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10],
              'smt__k_neighbors': list(range(1, 16, 2))
             }

    pipeline = Pipeline([('smt', smt), ('decision_tree', decision_tree)])
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # inner & outer loop
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)

    # Model Analysis: Confusion Matrix
    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Decision Tree classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    proba = decision_tree.predict_proba(feat_test)

    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



# Naive Bayes

In [21]:
# Naive Bayes Classifier without SMOTE
# simple with CV:
def naive_bayes_no_SMOTE(features):
    print("Naive Bayes, no SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    print("Accuracy of simple Naive Bayes:", scores.mean()*100)

    naive_bayes = sk.naive_bayes.GaussianNB()
    # inner loop
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)

    pipeline = Pipeline([('naive_bayes', naive_bayes)])

    naive_bayes.fit(feat_train, label_train)
    
    # outer loop
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)
    # prints the accuracy of your naive bayes
    print("Accuracy of naive bayes with the best parameters and CV: ", decision_acc.mean()*100)

    # Model Analysis: Confusion Matrix
    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Naive Bayes classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    proba = naive_bayes.predict_proba(feat_test)
    
    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

In [22]:
# Naive Bayes Classifier with SMOTE
def naive_bayes_SMOTE(features):
    print("Naive Bayes, SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    print("Accuracy of simple Naive Bayes: ", scores.mean()*100)

    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    params = {
              'smt__k_neighbors': list(range(1, 16, 2))
            }

    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('naive_bayes', naive_bayes)]) 
    # inner loop
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
        
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    naive_bayes.fit(feat_train, label_train)
    
    # outer loop
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)
    # prints the accuracy of your naive bayes
    print("Accuracy of naive bayes with the best parameters and CV: ", decision_acc.mean()*100)

    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)
    
    # Model Analysis: Confusion Matrix
    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Naive Bayes classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    proba = naive_bayes.predict_proba(feat_test)
    
    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# SVM

In [23]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Support Vector Machines(SVM) Classifier without SMOTE
def svm_no_SMOTE(features):
    scaler_svm = StandardScaler()
    # scaling
    features = scaler_svm.fit_transform(features)
    pca_redux_svm = PCA()
    svm_obj = SVC(probability=True)
    sm = SMOTE()

    svm_obj.fit(feat_train, label_train) # train the model
    label_predict = svm_obj.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_svm = Pipeline([('pca', pca_redux_svm), ('svm', svm_obj)])
    
    # parameter-grid
    param_grid = {
        'pca__n_components': list(range(3, 13)),
        'svm__kernel': ['linear', 'rbf','poly']
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_svm = GridSearchCV(pipe_svm, param_grid, cv=5, scoring='accuracy')

    grid_svm.fit(features, labels)

    print("Best params: ", grid_svm.best_params_)

    pred_svm = cross_val_score(grid_svm, features, labels, cv=5)
    # prints the accuracy of your neural net
    print("Accuracy of svm with the best parameters and CV: ", pred_svm.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 10-fold CV for the outer loop.
    pred_svm = cross_val_predict(grid_svm, features, labels, cv=10)

    conf_mat = sklearn.metrics.confusion_matrix(labels, pred_svm)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using an SVM classifier on this data.
    report_svm = classification_report(labels, pred_svm)
    print("\nClassification report:\n", report_svm)

    svm_obj.fit(feat_train, label_train)
    proba = svm_obj.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)


In [24]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline

# Support Vector Machines(SVM) Classifier with SMOTE
def svm_SMOTE(features):
    scaler_svm = StandardScaler()
    # scaling
    features = scaler_svm.fit_transform(features)
    pca_redux_svm = PCA()
    svm_obj = SVC(probability=True)
    sm = SMOTE()

    svm_obj.fit(feat_train, label_train)  # train the model
    label_predict = svm_obj.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_svm = Pipeline([('smote', sm), ('pca', pca_redux_svm), ('svm', svm_obj)])

    # parameter-grid
    param_grid = {
        'pca__n_components': list(range(3, 13)),
        'svm__kernel': ['linear', 'rbf', 'poly'],
        'smote__k_neighbors': list(range(1, 10, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_svm = GridSearchCV(pipe_svm, param_grid, cv=5, scoring='accuracy')

    grid_svm.fit(features, labels)
    print("Best params: ", grid_svm.best_params_)

    pred_svm = cross_val_score(grid_svm, features, labels, cv=5)
    # prints the accuracy of your neural net
    print("Accuracy of svm with the best parameters and CV: ", pred_svm.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 10-fold CV for the outer loop.
    pred_svm = cross_val_predict(grid_svm, features, labels, cv=10)

    conf_mat = sklearn.metrics.confusion_matrix(labels, pred_svm)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using an SVM classifier on this data.
    report_svm = classification_report(labels, pred_svm)
    print("\nClassification report:\n", report_svm)

    svm_obj.fit(feat_train, label_train)
    proba = svm_obj.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)


# Nearest Neighbors

In [25]:
# Nearest Neighbors Classifier without SMOTE
def nearest_neighbors_no_SMOTE(features):
    print("Nearest Neighbors, no SMOTE")
    standard_scaler = sk.preprocessing.StandardScaler()
    # scaling
    features = standard_scaler.fit_transform(features)
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)  # train the model
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier()
    pipeline = Pipeline([('pca', pca), ('knn', knn)])
    # inner loop
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)

    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)
    # Prints the accuracy of your knn
    print("Accuracy of knn with the best parameters and CV: ", scores.mean()*100)

    # Model Analysis: Confusion Matrix
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a nearest neighbor classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

In [26]:
from imblearn.pipeline import Pipeline

# Nearest Neighbors Classifier with SMOTE
def nearest_neighbors_SMOTE(features):
    print("Nearest Neighbors, SMOTE")

    standard_scaler = sk.preprocessing.StandardScaler()
    # scaling
    features = standard_scaler.fit_transform(features)
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)  # train the model
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier()
    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
    # inner loop
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)
    
    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2)),
        'smt__k_neighbors': list(range(1, 26, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)
    # Prints the accuracy of your knn
    print("Accuracy of knn with the best parameters and CV: ", scores.mean()*100)

    # Model Analysis: Confusion Matrix
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a nearest neighbor classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # Model Analysis: ROC Curve
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# Neural Network

In [27]:
from sklearn.neural_network import MLPClassifier

# Neural Network Classifier without SMOTE
def neural_network_no_SMOTE(features):
    scaler_nn = StandardScaler()
    mlp_nn = MLPClassifier()
    # scaling
    features = scaler_nn.fit_transform(features)

    mlp_nn.fit(feat_train, label_train)  # train the model
    label_predict = mlp_nn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_nn = Pipeline([('nn', mlp_nn)])
    # Try values of hidden_layer_sizes ranging from (30,) to (60,) by increments of 10.
    param_grid_nn = {
        'nn__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'nn__activation': ['logistic', 'tanh', 'relu']
    }

    # Use GridSearchCV with 5 fold cross validation to find the best hidden layer size and the best activation function.
    grid_nn = GridSearchCV(pipe_nn, param_grid_nn, cv=5, scoring='accuracy')

    grid_nn.fit(features, labels)
    print("Best params: ", grid_nn.best_params_)
    # Wrapping the GridSearchCV in a 5-fold cross_val_score.
    pred_nn = cross_val_score(grid_nn, features, labels, cv=5)
    # Prints the accuracy of your neural net
    print("Accuracy of neural network with the best parameters and CV: ", pred_nn.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = cross_val_predict(grid_nn, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)
    
    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    mlp_nn.fit(feat_train, label_train)
    proba = mlp_nn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

In [28]:
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline

# Neural Network Classifier with SMOTE
def neural_network_SMOTE(features):
    scaler_nn = StandardScaler()
    mlp_nn = MLPClassifier()
    smt = SMOTE()
    # scaling
    features = scaler_nn.fit_transform(features)

    mlp_nn.fit(feat_train, label_train)  # train the model
    label_predict = mlp_nn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_nn = Pipeline([('smt', smt), ('nn', mlp_nn)])
    # Try values of hidden_layer_sizes ranging from (30,) to (60,) by increments of 10.
    param_grid_nn = {
        'nn__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'nn__activation': ['logistic', 'tanh', 'relu'],
        'smt__k_neighbors': list(range(1, 18, 2))
    }

    # Use GridSearchCV with 5 fold cross validation to find the best hidden layer size and the best activation function.
    grid_nn = GridSearchCV(pipe_nn, param_grid_nn, cv=5, scoring='accuracy')

    grid_nn.fit(features, labels)
    print("Best params: ", grid_nn.best_params_)
    # Wrapping the GridSearchCV in a 5-fold cross_val_score.
    pred_nn = cross_val_score(grid_nn, features, labels, cv=5)
    # Prints the accuracy of your neural net
    print("Accuracy of neural network with the best parameters and CV: ", pred_nn.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = cross_val_predict(grid_nn, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    mlp_nn.fit(feat_train, label_train)
    proba = mlp_nn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# Ensemble Method 

In [29]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline

# using RandomForestClassifier
# Ensemble Classifier with SMOTE
def ensemble_SMOTE(features):
    rf = sklearn.ensemble.RandomForestClassifier()

    rf.fit(feat_train, label_train) # train the model
    label_predict = rf.predict(feat_test) # predict labels of test data
    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple ensemble: ", accuracy*100)

    smt = SMOTE()
    pipe_ensemble = Pipeline([('smt', smt), ('rf', rf)])
    # parameter-grid
    params_rf = {'rf__max_depth': list(range(35,56)), 
                 'rf__min_samples_leaf': [8,10,12], 
                 'rf__max_features': ['sqrt','log2'],
                 'smt__k_neighbors': list(range(1, 16, 2)),
    }

    # Using GridSearchCV with a 5-fold CV to tune the hyperparameters to get the best results.
    grid_search_rf = GridSearchCV(pipe_ensemble, params_rf, cv=5, scoring='accuracy')


    grid_search_rf.fit(features, labels)
    print("Best params: ", grid_search_rf.best_params_)
    # Wrapping the GridSearchCV in a cross_val_score with 5-fold CV to report the accuracy of the model.
    pred_rf = sk.model_selection.cross_val_score(grid_search_rf, features, labels, cv=5)
    print("Accuracy of ensemble with the best parameters and CV: ", pred_rf.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search_rf, features, labels, cv=5)

    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    rf.fit(feat_train, label_train)
    proba = rf.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# Predicting

In [30]:
# set of classifiers without SMOTE
decision_tree_no_SMOTE(features)
naive_bayes_no_SMOTE(features)
svm_no_SMOTE(features)
nearest_neighbors_no_SMOTE(features)
neural_network_no_SMOTE(features)
ensemble_no_SMOTE(features)

# set of classifiers with SMOTE
decision_tree_SMOTE(features)
naive_bayes_SMOTE(features)
svm_SMOTE(features)
nearest_neighbors_SMOTE(features)
neural_network_SMOTE(features)
ensemble_SMOTE(features)


Decision Tree, no SMOTE
Accuracy of decision tree:  97.3
Best params:  {'decision_tree__max_depth': 5, 'decision_tree__max_features': 10, 'decision_tree__min_samples_leaf': 15}
Accuracy of decision tree with the best parameters and CV:  97.02027888027887
Confusion matrix: 
 [[4719   50]
 [  89  142]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4769
           1       0.74      0.61      0.67       231

    accuracy                           0.97      5000
   macro avg       0.86      0.80      0.83      5000
weighted avg       0.97      0.97      0.97      5000

ROC AUC score, how good is this model?:  0.8538322430845482
Naive Bayes, no SMOTE
Accuracy of simple Naive Bayes: 93.49987479949921
Accuracy of naive bayes with the best parameters and CV:  93.22001742001743
Accuracy of decision tree with the best parameters and CV:  93.22001742001743
Confusion matrix: 
 [[4504  265]
 [  60  171]]

Classificati