# Training

In [31]:
# decision tree? robust with noise (especially if pruned), can handle irrelevant data
# Naive bayes? not too good because of independence assumption
# SVM? widely used, need to find the best kernel
# nearest neighbors? data must be scaled, not too good with irrelevant features
# neural net? requires a lot of time and a lot of data, can deal with irrelevant features, can overfit, local minima issues
# ensemble?

# decision tree, svm, nearest neigh, ensemble

In [32]:
# get clean data
import os
import sys
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn.naive_bayes
import sklearn.model_selection
import sklearn.tree
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing as preprocessing
import sklearn.pipeline
import sklearn.decomposition
import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
from sklearn.utils import resample
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
import time

# need to pip install import_ipynb
import import_ipynb
from data_preperation import features, labels


# at this point, data should be clean 
print(features.head())
#print(labels)


   congress  bill sponsor_party sponsor_state  cosponsors  r_cosponsors  \
0       113     1             R            MI           0             0   
1       113     2             R            NE          15            15   
2       113     3             R            NE         134           132   
3       113     4             R            MI           4             4   
4       113     5             R            MN          12            12   

   d_cosponsors                       subject  withdrawn_cosponsors  \
0             0                      Taxation                     0   
1             0                        Energy                     0   
2             2                        Energy                     1   
3             0  Economics and Public Finance                     0   
4             0                     Education                     0   

   committees  subcommittees  actions  summary_words  
0           1              0        1           2854  
1           

# Feature Engineering 

In [33]:
# maybe do PCA?
# need to transform categorical data
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features = features.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit_transform(features)

features.head()

Unnamed: 0,congress,bill,sponsor_party,sponsor_state,cosponsors,r_cosponsors,d_cosponsors,subject,withdrawn_cosponsors,committees,subcommittees,actions,summary_words
0,0,0,1,24,0,0,0,30,0,1,0,0,933
1,0,1,1,32,15,15,0,11,0,5,2,23,1033
2,0,2,1,32,134,130,2,11,1,3,4,54,492
3,0,3,1,24,4,4,0,8,0,9,1,25,1020
4,0,4,1,25,12,12,0,9,0,3,0,56,1017


# Training

In [34]:
# 80%/20% split
features, labels = resample(features, labels, n_samples=5000)
feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)
#print(feat_train.columns)
#print(label_train.value_counts())
#print(type(label_train))
#print(type(feat_train))
#sm = SMOTE()
#feat_train, label_train = sm.fit_sample(feat_train, label_train)
#feat_train, label_train = resample(feat_train, label_train, n_samples=5000)
#sm = SMOTENC(random_state=42, categorical_features=[18, 19])
#feat_train, label_train = SMOTE().fit_resample(feat_train, label_train)
#x = np.linspace(label_train)
#print(label_train)

#y = pd.DataFrame.from_records(label_train)
#print(pd.value_counts(y.values.flatten()))
#print(y.value_counts())
#print(len(feat_train))
#pd.value_counts(df.values.flatten())

print(label_train.value_counts())
#smt = SMOTE(random_state=0)
#feat_resampled, label_resampled = smote.fit_sample(feat_train, label_train)

#feat_train = pd.DataFrame(feat_resampled, columns=feat_train.columns)
#label_train = pd.Series(label_resampled)
#print(label_resampled.head())

#print(label_train.value_counts())

0    3816
1     184
Name: label, dtype: int64


# Decision Tree

In [35]:
# Decision Tree
def decision_tree_no_SMOTE():
    print("Decision Tree, no SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple decision tree: ", accuracy*100)

    # DOING CROSS VALIDATION 
    #smt = SMOTE()
    # outer loop for CV
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision trees manually or using grid search, INNER CV LOOOP
    #best_params = find_best_params(feat_train, label_train)
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10]}
              #'smt__k_neighbors': list(range(1, 16, 2))


    #smt = SMOTE(ratio=.5)
    #pipeline = sk.pipeline.Pipeline(steps=[('smt', smt), ('decision_tree', decision_tree)])
    #pipeline = Pipeline([('smt', smt), ('decision_tree', decision_tree)])
    pipeline = Pipeline([('decision_tree', decision_tree)])
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("best params: ", grid_search.best_params_)

    # make model with the best parameters, inner loop of CV

    #decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy', 
    #                                               splitter=best_params['splitter'], 
    #                                               max_depth=best_params['max_depth'], 
    #                                               min_impurity_decrease=best_params['min_impurity_decrease'], 
    #                                               min_samples_leaf=best_params['min_samples_leaf'], 
    #                                               min_samples_split=best_params['min_samples_split'])

    #decision_tree.fit(feat_train, label_train)
    #label_predict = decision_tree.predict(feat_test)
    #accuracy = sk.metrics.accuracy_score(label_test, label_predict)

    # inner & outer l
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)

    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = decision_tree.predict_proba(feat_test)

    #print(label_test)
    #print(proba[:, 1])

    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



In [36]:
# Decision Tree
def decision_tree_SMOTE():
    print("Decision Tree, SMOTE")
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
    decision_tree.fit(feat_train, label_train) # train model

    label_predict = decision_tree.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple decision tree: ", accuracy*100)

    # DOING CROSS VALIDATION 
    smt = SMOTE()
    # outer loop for CV
    decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

    scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

    # find the best parameters for decision trees manually or using grid search, INNER CV LOOOP
    #best_params = find_best_params(feat_train, label_train)
    params = {"decision_tree__max_depth": [5,10,15,20],  
              "decision_tree__min_samples_leaf": [5,10,15,20], 
              "decision_tree__max_features": [5,10],
              'smt__k_neighbors': list(range(1, 16, 2))}


    #smt = SMOTE(ratio=.5)
    #pipeline = sk.pipeline.Pipeline(steps=[('smt', smt), ('decision_tree', decision_tree)])
    pipeline = Pipeline([('smt', smt), ('decision_tree', decision_tree)])
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("best params: ", grid_search.best_params_)

    # inner & outer l
    decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)

    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    decision_tree.fit(feat_train, label_train)
    proba = decision_tree.predict_proba(feat_test)

    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)



# Naive Bayes

In [37]:
# naive bayes
# TODO: maybe do confusion matrix??? Just to analyze model more, maybe roc curve is enough?
# simple with CV:
def naive_bayes_no_SMOTE():
    print("Naive Bayes, no SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)

    print("simple model Accuracy:", scores.mean()*100)

    #feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)
    naive_bayes = sk.naive_bayes.GaussianNB()


    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    #params = {
    #          'smt__k_neighbors': list(range(1, 16, 2))}


    #smt = SMOTE()
    #pipeline = Pipeline([('smt', smt), ('naive_bayes', naive_bayes)])
    pipeline = Pipeline([('naive_bayes', naive_bayes)])
    #grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    #grid_search = GridSearchCV(pipeline, cv=5, scoring='accuracy')
    #grid_search.fit(features, labels)
    #print("best params: ", grid_search.best_params_)

    naive_bayes.fit(feat_train, label_train)
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)

    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = naive_bayes.predict_proba(feat_test)

    #print(label_test)
    #print(proba[:, 1])

    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

In [38]:
# naive bayes
# TODO: maybe do confusion matrix??? Just to analyze model more, maybe roc curve is enough?
# simple with CV:
def naive_bayes_SMOTE():
    print("Naive Bayes, SMOTE")
    naive_bayes = sk.naive_bayes.GaussianNB()
    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)

    print("simple model Accuracy:", scores.mean()*100)

    #feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)
    naive_bayes = sk.naive_bayes.GaussianNB()


    scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)
    params = {
              'smt__k_neighbors': list(range(1, 16, 2))}


    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('naive_bayes', naive_bayes)])
    grid_search = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    print("best params: ", grid_search.best_params_)

    naive_bayes.fit(feat_train, label_train)
    decision_acc = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=5)

    print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)

    labels_predict = sk.model_selection.cross_val_predict(naive_bayes, features, labels, cv=10)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    naive_bayes.fit(feat_train, label_train)
    # This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
    # Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
    # probability that this datapoint is in class 1.
    proba = naive_bayes.predict_proba(feat_test)

    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# SVM

# Nearest Neighbors

In [39]:
# scaling
def nearest_neighbors_no_SMOTE():
    print("scaling...")
    standard_scaler = sk.preprocessing.StandardScaler()
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    #smt = SMOTE()
    #pipeline = Pipeline([('smt', smt), ('standard_scaler', standard_scaler), ('pca', pca), ('knn', knn)])
    pipeline = Pipeline([('standard_scaler', standard_scaler), ('pca', pca), ('knn', knn)])
    #pipeline = sk.pipeline.Pipeline(steps=[('standard_scaler', standard_scaler), ('pca', pca), ('knn', knn)])
    print("finished scaling.")
    # inner loop
    print("starting inner loop...")
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)
    #print("Accuracy:", scores.mean()*100)

    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2))}
        #'smt__k_neighbors': list(range(1, 26, 2))


    # param_grid=  {'knn__n_neighbors': [1], 'pca__n_components': [13], 'smt__k_neighbors': [1]}

    #param_grid = {
    #    'pca__n_components': [12],
    #    'knn__n_neighbors': [15]
    #}

    print("starting grid search...")
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("best params: ", grid_search.best_params_)
    #print("Accuracy: ", grid_search.best_score_*100)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)

    print("Accuracy with best params:", scores.mean()*100)

    # CONFUSION MATRIX EVALUATION
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

In [40]:
# scaling
def nearest_neighbors_SMOTE():
    print("Nearest Neighbors, SMOTE")
    print("scaling...")
    standard_scaler = sk.preprocessing.StandardScaler()
    pca = sk.decomposition.PCA()
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    knn.fit(feat_train, label_train)
    label_predict = knn.predict(feat_test) # predict labels of test data

    accuracy = sk.metrics.accuracy_score(label_test, label_predict)
    print("Accuracy of simple KNN: ", accuracy*100)

    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)

    smt = SMOTE()
    pipeline = Pipeline([('smt', smt), ('standard_scaler', standard_scaler), ('pca', pca), ('knn', knn)])
    print("finished scaling.")
    # inner loop
    print("starting inner loop...")
    scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)
    #print("Accuracy:", scores.mean()*100)

    param_grid = {
        'pca__n_components': list(range(1, 14)),
        'knn__n_neighbors': list(range(1, 26, 2)),
        'smt__k_neighbors': list(range(1, 26, 2))}

    print("starting grid search...")
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)
    grid_search.fit(features, labels)
    print("best params: ", grid_search.best_params_)
    #print("Accuracy: ", grid_search.best_score_*100)

    # this does the nested loop
    scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5, n_jobs=-1)

    print("Accuracy with best params:", scores.mean()*100)

    # CONFUSION MATRIX EVALUATION
    labels_predict = sk.model_selection.cross_val_predict(grid_search, features, labels, cv=5, n_jobs=-1)
    conf_mat = sklearn.metrics.confusion_matrix(labels, labels_predict)
    print("confusion matrix: \n", conf_mat)

    report = sklearn.metrics.classification_report(labels, labels_predict)
    print("\nclassification report:\n", report)

    knn.fit(feat_train, label_train)
    proba = knn.predict_proba(feat_test)

    # ROC CURVE EVALUATION
    roc = sk.metrics.roc_curve(label_test, proba[:, 1])
    roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])

    print("ROC AUC score, how good is this model?: ", roc_auc)

# Neural Network

# Ensemble Method 

# Predicting

In [None]:
decision_tree_no_SMOTE()
naive_bayes_no_SMOTE()
nearest_neighbors_no_SMOTE()

decision_tree_SMOTE()
naive_bayes_SMOTE()
nearest_neighbors_SMOTE()

Decision Tree, no SMOTE
Accuracy of simple decision tree:  96.3
best params:  {'decision_tree__max_depth': 5, 'decision_tree__max_features': 10, 'decision_tree__min_samples_leaf': 15}
Accuracy of decision tree with the best parameters and CV:  96.85963739963739
confusion matrix: 
 [[4713   70]
 [ 115  102]]

classification report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4783
           1       0.59      0.47      0.52       217

    accuracy                           0.96      5000
   macro avg       0.78      0.73      0.75      5000
weighted avg       0.96      0.96      0.96      5000

ROC AUC score, how good is this model?:  0.7179655918021999
Naive Bayes, no SMOTE
simple model Accuracy: 93.47978407913632
Accuracy of decision tree with the best parameters and CV:  93.39973485973486
confusion matrix: 
 [[4522  261]
 [  65  152]]

classification report:
               precision    recall  f1-score   support

           