# Training

In [31]:
# decision tree? robust with noise (especially if pruned), can handle irrelevant data
# Naive bayes? not too good because of independence assumption
# SVM? widely used, need to find the best kernel
# nearest neighbors? data must be scaled, not too good with irrelevant features
# neural net? requires a lot of time and a lot of data, can deal with irrelevant features, can overfit, local minima issues
# ensemble? ensemble classifiers combine the predictions of multiple base estimators to improve the accuracy of the predictions. One of the key assumptions that ensemble classifiers make is that the base estimators are built independently (so they are diverse)

# decision tree, svm, nearest neigh, ensemble

In [32]:
# get clean data
import os
import sys
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn.naive_bayes
import sklearn.model_selection
import sklearn.tree
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing as preprocessing
import sklearn.pipeline
import sklearn.decomposition
import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
from sklearn.utils import resample
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.datasets import make_classification
# need to pip install import_ipynb
import import_ipynb
# need to pip install -U imbalanced-learn
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
from data_preperation import features, labels

# at this point, data should be clean 
print(features.head())
#print(labels)


   congress  bill sponsor_party sponsor_state  cosponsors  r_cosponsors  \
0       113     1             R            MI           0             0   
1       113     2             R            NE          15            15   
2       113     3             R            NE         134           132   
3       113     4             R            MI           4             4   
4       113     5             R            MN          12            12   

   d_cosponsors                       subject  withdrawn_cosponsors  \
0             0                      Taxation                     0   
1             0                        Energy                     0   
2             2                        Energy                     1   
3             0  Economics and Public Finance                     0   
4             0                     Education                     0   

   committees  subcommittees  actions  summary_words  
0           1              0        1           2854  
1           

# Feature Engineering 

In [33]:
# maybe do PCA?
# need to transform categorical data
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features = features.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit_transform(features)

features.head()

Unnamed: 0,congress,bill,sponsor_party,sponsor_state,cosponsors,r_cosponsors,d_cosponsors,subject,withdrawn_cosponsors,committees,subcommittees,actions,summary_words
0,0,0,1,24,0,0,0,30,0,1,0,0,933
1,0,1,1,32,15,15,0,11,0,5,2,23,1033
2,0,2,1,32,134,130,2,11,1,3,4,54,492
3,0,3,1,24,4,4,0,8,0,9,1,25,1020
4,0,4,1,25,12,12,0,9,0,3,0,56,1017


# Training

In [34]:
# 80%/20% split of the actual data
features, labels = resample(features, labels, n_samples=5000)
feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)

print(label_train.value_counts())


0    3792
1     208
Name: label, dtype: int64


# Decision Tree

In [39]:
# Decision Tree Classifier without SMOTE
def decision_tree_no_SMOTE():
    print("Decision Tree, no SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of decision tree: ", accuracy*100)

    # DOING CROSS VALIDATION 
    # outer loop for CV
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision trees manually or using grid search, INNER CV LOOOP
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10]
             }

    pipeline = Pipeline([('decision_tree', decision_tree)])
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # inner & outer loop
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)
    # prints the accuracy of your decision tree
    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Decision Tree classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = decision_tree.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



Decision Tree, no SMOTE
Accuracy of decision tree:  95.8
Best params:  {'decision_tree__max_depth': 20, 'decision_tree__max_features': 10, 'decision_tree__min_samples_leaf': 20}
Accuracy of decision tree with the best parameters and CV:  95.98023836023837
Confusion matrix: 
 [[4661   78]
 [ 128  133]]

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      4739
           1       0.63      0.51      0.56       261

    accuracy                           0.96      5000
   macro avg       0.80      0.75      0.77      5000
weighted avg       0.96      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.7997051264170868


In [40]:
from imblearn.pipeline import Pipeline

# Decision Tree Classifier with SMOTE
def decision_tree_SMOTE():
    print("Decision Tree, SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of decision tree: ", accuracy*100)

    # DOING CROSS VALIDATION 
    smt = SMOTE()
    # outer loop for CV
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision trees manually or using grid search, INNER CV LOOOP
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10],
              'smt__k_neighbors': list(range(1, 16, 2))
             }

    pipeline = Pipeline([('smt', smt), ('decision_tree', decision_tree)])
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # inner & outer loop
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)
    # prints the accuracy of your decision tree
    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Decision Tree classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    proba = decision_tree.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



Decision Tree, SMOTE
Accuracy of decision tree:  95.5
Best params:  {'decision_tree__max_depth': 15, 'decision_tree__max_features': 10, 'decision_tree__min_samples_leaf': 5, 'smt__k_neighbors': 5}
Accuracy of decision tree with the best parameters and CV:  96.18027832027832
Confusion matrix: 
 [[4604  135]
 [  53  208]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      4739
           1       0.61      0.80      0.69       261

    accuracy                           0.96      5000
   macro avg       0.80      0.88      0.83      5000
weighted avg       0.97      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.7614412942559423


# Naive Bayes

In [43]:
# Naive Bayes Classifier without SMOTE
# TODO: maybe do confusion matrix??? Just to analyze model more, maybe roc curve is enough?
# simple with CV:
def naive_bayes_no_SMOTE():
    print("Naive Bayes, no SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    print("Accuracy of simple Naive Bayes:", scores.mean()*100)

    naive_bayes = sk.naive_bayes.GaussianNB()
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)

    pipeline = Pipeline([('naive_bayes', naive_bayes)])

    naive_bayes.fit(feat_train, label_train)
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)
    # prints the accuracy of your naive bayes
    print("Accuracy of naive bayes with the best parameters and CV: ", decision_acc.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 10-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Naive Bayes classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = naive_bayes.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Naive Bayes, no SMOTE
Accuracy of simple Naive Bayes: 92.60071408285633
Accuracy of naive bayes with the best parameters and CV:  92.7003172003172
Confusion matrix: 
 [[4459  280]
 [  90  171]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96      4739
           1       0.38      0.66      0.48       261

    accuracy                           0.93      5000
   macro avg       0.68      0.80      0.72      5000
weighted avg       0.95      0.93      0.94      5000

ROC AUC score, how good is this model?:  0.9296288179155626


In [44]:
# Naive Bayes Classifier with SMOTE
def naive_bayes_SMOTE():
    print("Naive Bayes, SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    print("Accuracy of simple Naive Bayes: ", scores.mean()*100)

    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    params = {
              'smt__k_neighbors': list(range(1, 16, 2))
            }

    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('naive_bayes', naive_bayes)]) 
    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    naive_bayes.fit(feat_train, label_train)
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)
    # prints the accuracy of your naive bayes
    print("Accuracy of naive bayes with the best parameters and CV: ", decision_acc.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 10-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a Naive Bayes classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = naive_bayes.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Naive Bayes, SMOTE
Accuracy of simple Naive Bayes:  92.60071408285633
Best params:  {'smt__k_neighbors': 1}
Accuracy of naive bayes with the best parameters and CV:  92.7003172003172
Confusion matrix: 
 [[4459  280]
 [  90  171]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96      4739
           1       0.38      0.66      0.48       261

    accuracy                           0.93      5000
   macro avg       0.68      0.80      0.72      5000
weighted avg       0.95      0.93      0.94      5000

ROC AUC score, how good is this model?:  0.9296288179155626


# SVM

In [47]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Support Vector Machines(SVM) Classifier without SMOTE
def svm_no_SMOTE():
    scaler_svm = StandardScaler()
    # scaling
    features = scaler_svm.fit_transform(features)
    pca_redux_svm = PCA()
    svm_obj = SVC(probability=True)
    sm = SMOTE()

    svm_obj.fit(feat_train, label_train) # train the model
    label_predict = svm_obj.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_svm = Pipeline([('pca', pca_redux_svm), ('svm', svm_obj)])
    
    # parameter-grid
    param_grid = {
        'pca__n_components': list(range(3, 13)),
        'svm__kernel': ['linear', 'rbf','poly']
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_svm = GridSearchCV(pipe_svm, param_grid, cv=5, scoring='accuracy')

    grid_svm.fit(features, labels)

    print("Best params: ", grid_svm.best_params_)

    pred_svm = cross_val_score(grid_svm, features, labels, cv=5)
    # prints the accuracy of your neural net
    print("Accuracy of svm with the best parameters and CV: ", pred_svm.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # running a cross_val_predict with a 10-fold CV for the outer loop.
    pred_svm = cross_val_predict(grid_svm, features, labels, cv=10)

    conf_mat = sklearn.metrics.confusion_matrix(labels, pred_svm)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using an SVM classifier on this data.
    report_svm = classification_report(labels, pred_svm)
    print("\nClassification report:\n", report_svm)

    svm_obj.fit(feat_train, label_train)
    proba = svm_obj.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)


Accuracy of simple neural network:  95.6
Best params:  {'pca__n_components': 11, 'svm__kernel': 'linear'}
Accuracy of svm with the best parameters and CV:  96.68015884015884
Confusion matrix: 
 [[4706   33]
 [ 135  126]]

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      4739
           1       0.79      0.48      0.60       261

    accuracy                           0.97      5000
   macro avg       0.88      0.74      0.79      5000
weighted avg       0.96      0.97      0.96      5000

ROC AUC score, how good is this model?:  0.7141419776453946


In [46]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline

# Support Vector Machines(SVM) Classifier with SMOTE
def svm_SMOTE():
    scaler_svm = StandardScaler()
    # scaling
    features = scaler_svm.fit_transform(features)
    pca_redux_svm = PCA()
    svm_obj = SVC(probability=True)
    sm = SMOTE()

    svm_obj.fit(feat_train, label_train)  # train the model
    label_predict = svm_obj.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_svm = Pipeline([('smote', sm), ('pca', pca_redux_svm), ('svm', svm_obj)])

    # parameter-grid
    param_grid = {
        'pca__n_components': list(range(3, 13)),
        'svm__kernel': ['linear', 'rbf', 'poly'],
        'smote__k_neighbors': list(range(1, 10, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold 
    grid_svm = GridSearchCV(pipe_svm, param_grid, cv=5, scoring='accuracy')

    grid_svm.fit(features, labels)
    print("Best params: ", grid_svm.best_params_)

    pred_svm = cross_val_score(grid_svm, features, labels, cv=5)
    # prints the accuracy of your neural net
    print("Accuracy of svm with the best parameters and CV: ", pred_svm.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 10-fold CV for the outer loop.
    pred_svm = cross_val_predict(grid_svm, features, labels, cv=10)

    conf_mat = sklearn.metrics.confusion_matrix(labels, pred_svm)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using an SVM classifier on this data.
    report_svm = classification_report(labels, pred_svm)
    print("\nClassification report:\n", report_svm)

    svm_obj.fit(feat_train, label_train)
    proba = svm_obj.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)


Accuracy of simple neural network:  95.6
Best params:  {'pca__n_components': 9, 'smote__k_neighbors': 1, 'svm__kernel': 'rbf'}
Accuracy of svm with the best parameters and CV:  94.8601578001578
Confusion matrix: 
 [[4514  225]
 [  26  235]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      4739
           1       0.51      0.90      0.65       261

    accuracy                           0.95      5000
   macro avg       0.75      0.93      0.81      5000
weighted avg       0.97      0.95      0.96      5000

ROC AUC score, how good is this model?:  0.717160447092108


# Nearest Neighbors

In [48]:
# Nearest Neighbors Classifier without SMOTE
def nearest_neighbors_no_SMOTE():
    standard_scaler = sk.preprocessing.StandardScaler()
    # scaling
    features = standard_scaler.fit_transform(features)
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)  # train the model
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)
    pipeline = Pipeline([('pca', pca), ('knn', knn)])
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)

    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)
    # Prints the accuracy of your knn
    print("Accuracy of knn with the best parameters and CV: ", scores.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a nearest neighbor classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Accuracy of simple KNN:  94.8
Best params:  {'knn__n_neighbors': 1, 'pca__n_components': 12}
Accuracy of knn with the best parameters and CV:  96.35995855995854
Confusion matrix: 
 [[4650   89]
 [  93  168]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4739
           1       0.65      0.64      0.65       261

    accuracy                           0.96      5000
   macro avg       0.82      0.81      0.81      5000
weighted avg       0.96      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.6975553386065232


In [49]:
from imblearn.pipeline import Pipeline

# Nearest Neighbors Classifier with SMOTE
def nearest_neighbors_SMOTE():

    standard_scaler = sk.preprocessing.StandardScaler()
    # scaling
    features = standard_scaler.fit_transform(features)
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)  # train the model
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)
    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('pca', pca), ('knn', knn)])
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)
    
    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2)),
        'smt__k_neighbors': list(range(1, 26, 2))
    }

    # creating a GridSearchCV for the inner CV loop with 5-fold
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("Best params: ", grid_search.best_params_)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)
    # Prints the accuracy of your knn
    print("Accuracy of knn with the best parameters and CV: ", scores.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a nearest neighbor classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Accuracy of simple KNN:  94.8
Best params:  {'knn__n_neighbors': 1, 'pca__n_components': 13, 'smt__k_neighbors': 1}
Accuracy of knn with the best parameters and CV:  96.36001858001858
Confusion matrix: 
 [[4641   98]
 [  85  176]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4739
           1       0.64      0.67      0.66       261

    accuracy                           0.96      5000
   macro avg       0.81      0.83      0.82      5000
weighted avg       0.96      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.6975553386065232


# Neural Network

In [50]:
from sklearn.neural_network import MLPClassifier

# Neural Network Classifier without SMOTE
def neural_network_no_SMOTE():
    scaler_nn = StandardScaler()
    mlp_nn = MLPClassifier()
    # scaling
    features = scaler_nn.fit_transform(features)

    mlp_nn.fit(feat_train, label_train)  # train the model
    label_predict = mlp_nn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_nn = Pipeline([('nn', mlp_nn)])
    # Try values of hidden_layer_sizes ranging from (30,) to (60,) by increments of 10.
    param_grid_nn = {
        'nn__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'nn__activation': ['logistic', 'tanh', 'relu']
    }

    # Use GridSearchCV with 5 fold cross validation to find the best hidden layer size and the best activation function.
    grid_nn = GridSearchCV(pipe_nn, param_grid_nn, cv=5, scoring='accuracy')

    grid_nn.fit(features, labels)
    print("Best params: ", grid_nn.best_params_)
    # Wrapping the GridSearchCV in a 5-fold cross_val_score.
    pred_nn = cross_val_score(grid_nn, features, labels, cv=5)
    # Prints the accuracy of your neural net
    print("Accuracy of neural network with the best parameters and CV: ", pred_nn.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = cross_val_predict(grid_nn, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)
    
    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    mlp_nn.fit(feat_train, label_train)
    proba = mlp_nn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Accuracy of simple neural network:  94.89999999999999
Best params:  {'nn__activation': 'tanh', 'nn__hidden_layer_sizes': (40,)}
Accuracy of neural network with the best parameters and CV:  96.74005858005857
Confusion matrix: 
 [[4676   63]
 [  91  170]]

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4739
           1       0.73      0.65      0.69       261

    accuracy                           0.97      5000
   macro avg       0.86      0.82      0.84      5000
weighted avg       0.97      0.97      0.97      5000

ROC AUC score, how good is this model?:  0.9609093263732542


In [51]:
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline

# Neural Network Classifier with SMOTE
def neural_network_SMOTE():
    scaler_nn = StandardScaler()
    mlp_nn = MLPClassifier()
    smt = SMOTE()
    # scaling
    features = scaler_nn.fit_transform(features)

    mlp_nn.fit(feat_train, label_train)  # train the model
    label_predict = mlp_nn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple neural network: ", accuracy*100)

    pipe_nn = Pipeline([('smt', smt), ('nn', mlp_nn)])
    # Try values of hidden_layer_sizes ranging from (30,) to (60,) by increments of 10.
    param_grid_nn = {
        'nn__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'nn__activation': ['logistic', 'tanh', 'relu'],
        'smt__k_neighbors': list(range(1, 18, 2))
    }

    # Use GridSearchCV with 5 fold cross validation to find the best hidden layer size and the best activation function.
    grid_nn = GridSearchCV(pipe_nn, param_grid_nn, cv=5, scoring='accuracy')

    grid_nn.fit(features, labels)
    print("Best params: ", grid_nn.best_params_)
    # Wrapping the GridSearchCV in a 5-fold cross_val_score.
    pred_nn = cross_val_score(grid_nn, features, labels, cv=5)
    # Prints the accuracy of your neural net
    print("Accuracy of neural network with the best parameters and CV: ", pred_nn.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = cross_val_predict(grid_nn, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    mlp_nn.fit(feat_train, label_train)
    proba = mlp_nn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Accuracy of simple neural network:  96.1
Best params:  {'nn__activation': 'tanh', 'nn__hidden_layer_sizes': (50,), 'smt__k_neighbors': 3}
Accuracy of neural network with the best parameters and CV:  96.08019832019832
Confusion matrix: 
 [[4601  138]
 [  39  222]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      4739
           1       0.62      0.85      0.71       261

    accuracy                           0.96      5000
   macro avg       0.80      0.91      0.85      5000
weighted avg       0.97      0.96      0.97      5000

ROC AUC score, how good is this model?:  0.939112589906557


# Ensemble Method 

In [52]:
from sklearn.ensemble import RandomForestClassifier

# using RandomForestClassifier
# Ensemble Classifier without SMOTE
def ensemble_no_SMOTE():
    rf = sklearn.ensemble.RandomForestClassifier()
    
    rf.fit(feat_train, label_train)  # train the model
    label_predict = rf.predict(feat_test)  # predict labels of test data
    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple ensemble: ", accuracy*100)
    
    pipe_ensemble = Pipeline([('rf', rf)])
    # parameter-grid
    params_rf = {'rf__max_depth': list(range(35,56)), 
                 'rf__min_samples_leaf': [8,10,12], 
                 'rf__max_features': ['sqrt','log2']
    }


    # Using GridSearchCV with a 5-fold CV to tune the hyperparameters to get the best results.
    grid_search_rf = GridSearchCV(pipe_ensemble, params_rf, cv=5, scoring='accuracy')

    grid_search_rf.fit(features, labels)
    print("Best params: ", grid_search_rf.best_params_)
    # Wrapping the GridSearchCV in a cross_val_score with 5-fold CV to report the accuracy of the model.
    pred_rf = sk.model_selection.cross_val_score(grid_search_rf, features, labels, cv=5)
    print("Accuracy of ensemble with the best parameters and CV: ", pred_rf.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search_rf, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    rf.fit(feat_train, label_train)
    proba = rf.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)


Accuracy of simple ensemble:  96.0
Best params:  {'rf__max_depth': 40, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 8}
Accuracy of ensemble with the best parameters and CV:  96.53987871987871
Confusion matrix: 
 [[4706   33]
 [ 136  125]]

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      4739
           1       0.79      0.48      0.60       261

    accuracy                           0.97      5000
   macro avg       0.88      0.74      0.79      5000
weighted avg       0.96      0.97      0.96      5000

ROC AUC score, how good is this model?:  0.9715885318084915


In [53]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline

# using RandomForestClassifier
# Ensemble Classifier with SMOTE
def ensemble_SMOTE():
    rf = sklearn.ensemble.RandomForestClassifier()

    rf.fit(feat_train, label_train) # train the model
    label_predict = rf.predict(feat_test) # predict labels of test data
    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple ensemble: ", accuracy*100)

    smt = SMOTE()
    pipe_ensemble = Pipeline([('smt', smt), ('rf', rf)])
    # parameter-grid
    params_rf = {'rf__max_depth': list(range(35,56)), 
                 'rf__min_samples_leaf': [8,10,12], 
                 'rf__max_features': ['sqrt','log2'],
                 'smt__k_neighbors': list(range(1, 16, 2)),
    }

    # Using GridSearchCV with a 5-fold CV to tune the hyperparameters to get the best results.
    grid_search_rf = GridSearchCV(pipe_ensemble, params_rf, cv=5, scoring='accuracy')


    grid_search_rf.fit(features, labels)
    print("Best params: ", grid_search_rf.best_params_)
    # Wrapping the GridSearchCV in a cross_val_score with 5-fold CV to report the accuracy of the model.
    pred_rf = sk.model_selection.cross_val_score(grid_search_rf, features, labels, cv=5)
    print("Accuracy of ensemble with the best parameters and CV: ", pred_rf.mean()*100)

    # CONFUSION MATRIX EVALUATION
    # Running a cross_val_predict with a 5-fold CV for the outer loop.
    labels_predict = sk.model_selection.cross_val_predict(grid_search_rf, features, labels, cv=5)

    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("Confusion matrix: \n", conf_mat)

    # CLASSIFICATION REPORT of using a neural network classifier on this data.
    report = classification_report(labels, labels_predict)
    print("\nClassification report:\n", report)

    rf.fit(feat_train, label_train)
    proba = rf.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

Accuracy of simple ensemble:  96.0
Best params:  {'rf__max_depth': 46, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 8, 'smt__k_neighbors': 1}
Accuracy of ensemble with the best parameters and CV:  95.48011824011824
Confusion matrix: 
 [[4569  170]
 [  44  217]]

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98      4739
           1       0.56      0.83      0.67       261

    accuracy                           0.96      5000
   macro avg       0.78      0.90      0.82      5000
weighted avg       0.97      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.9826662150584766


# Predicting

In [54]:
# set of classifiers without SMOTE
decision_tree_no_SMOTE()
naive_bayes_no_SMOTE()
svm_no_SMOTE()
nearest_neighbors_no_SMOTE()
neural_network_no_SMOTE()
ensemble_no_SMOTE()

# set of classifiers with SMOTE
decision_tree_SMOTE()
naive_bayes_SMOTE()
svm_SMOTE()
nearest_neighbors_SMOTE()
neural_network_SMOTE()
ensemble_SMOTE()
