# Training

In [3]:
# decision tree? robust with noise (especially if pruned), can handle irrelevant data
# Naive bayes? not too good because of independence assumption
# SVM? widely used, need to find the best kernel
# nearest neighbors? data must be scaled, not too good with irrelevant features
# neural net? requires a lot of time and a lot of data, can deal with irrelevant features, can overfit, local minima issues
# ensemble?

# decision tree, svm, nearest neigh, ensemble

In [4]:
# get clean data
import os
import sys
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import sklearn as sk
import sklearn.naive_bayes
import sklearn.model_selection
import sklearn.tree
from sklearn.model_selection import GridSearchCV
import sklearn.preprocessing as preprocessing
import sklearn.pipeline
import sklearn.decomposition
import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# need to pip install import_ipynb
import import_ipynb
from data_preperation import features, labels

# at this point, data should be clean 
print(features.head())
print(labels)


importing Jupyter notebook from data_preperation.ipynb
   congress  bill sponsor_party sponsor_state  cosponsors  r_cosponsors  \
0       113     1             R            MI           0             0   
1       113     2             R            NE          15            15   
2       113     3             R            NE         134           132   
3       113     4             R            MI           4             4   
4       113     5             R            MN          12            12   

   d_cosponsors                       subject  withdrawn_cosponsors  \
0             0                      Taxation                     0   
1             0                        Energy                     0   
2             2                        Energy                     1   
3             0  Economics and Public Finance                     0   
4             0                     Education                     0   

   committees  subcommittees  actions  summary_words  
0           

# Feature Engineering 

In [5]:
# maybe do PCA?
# need to transform categorical data
# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
features = features.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit_transform(features)

features.head()

Unnamed: 0,congress,bill,sponsor_party,sponsor_state,cosponsors,r_cosponsors,d_cosponsors,subject,withdrawn_cosponsors,committees,subcommittees,actions,summary_words
0,0,0,1,24,0,0,0,30,0,1,0,0,933
1,0,1,1,32,15,15,0,11,0,5,2,23,1033
2,0,2,1,32,134,130,2,11,1,3,4,54,492
3,0,3,1,24,4,4,0,8,0,9,1,25,1020
4,0,4,1,25,12,12,0,9,0,3,0,56,1017


# Training

In [6]:
# 80%/20% split
feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)


# Decision Tree

In [7]:
# finds the best parameters for decision trees
# TODO: don't need this function

def find_best_params(feat_train, label_train):
    best_acc = 0
    best_crit = ''
    best_depth = 1
    best_imp_dec = 0
    best_min_samples_leaf = 0
    best_min_samples_split = 0
    """
    for split_criterion in ["best", "random"]:
        for max_depth in [1, 5, 10, 20]:
            for min_impurity_decrease in [.0, .05, .1, .15]:
                for min_samples_leaf in [1, 10, 50, 100]:
                    for min_samples_split in [2, 4, 8, 10]:
                        decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy', splitter=split_criterion, max_depth=max_depth, min_impurity_decrease=min_impurity_decrease, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split) # make model
                        decision_tree.fit(feat_train, label_train) # train model

                        label_predict = decision_tree.predict(feat_test) # predict labels of test data

                        accuracy = sk.metrics.accuracy_score(label_test, label_predict)
                        if accuracy > best_acc:
                            best_acc = accuracy
                            best_crit = split_criterion
                            best_depth = max_depth
                            best_imp_dec = min_impurity_decrease
                            best_min_samples_leaf =  min_samples_leaf
                            best_min_samples_split = min_samples_split
                        #print("Accuracy of the classifier on the test set with splitter={}, max_depth={}, min_impurity_decrease={}, min_samples_leaf={}, min_samples_split={}: {}".format(split_criterion, max_depth, min_impurity_decrease, min_samples_leaf, min_samples_split, accuracy*100))
    
    """
    params = {"max_depth": [5,10,15,20],  "min_samples_leaf": [5,10,15,20], "max_features": [5,10,15], "splitter": ["best", "random"], "min_impurity_decrease":[.0, .05, .1, .15], "min_samples_split":[2, 4, 8, 10]}

    grid_search = GridSearchCV(decision_tree, params, cv=5, scoring='accuracy')
    grid_search.fit(features, labels)
    # returns a map?
    return grid_search.best_params_

    #print(grid_search.best_params_)
    #print("Accuracy of best params:", grid_search.best_score_*100)
    #return best_acc, best_crit, best_depth, best_imp_dec, best_min_samples_leaf, best_min_samples_split



In [8]:
# Decision Tree

decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model
decision_tree.fit(feat_train, label_train) # train model

label_predict = decision_tree.predict(feat_test) # predict labels of test data

accuracy = sk.metrics.accuracy_score(label_test, label_predict)
print("Accuracy of simple decision tree: ", accuracy*100)

# DOING CROSS VALIDATION 

# outer loop for CV
decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy') # make model

scores = sk.model_selection.cross_val_score(decision_tree, features, labels, cv=10) 

# find the best parameters for decision trees manually or using grid search, INNER CV LOOOP
#best_params = find_best_params(feat_train, label_train)
params = {"max_depth": [5,10,15,20],  "min_samples_leaf": [5,10,15,20], "max_features": [5,10]}
grid_search = GridSearchCV(decision_tree, params, cv=5, scoring='accuracy')
grid_search.fit(features, labels)

# make model with the best parameters, inner loop of CV

#decision_tree = sk.tree.DecisionTreeClassifier(criterion='entropy', 
#                                               splitter=best_params['splitter'], 
#                                               max_depth=best_params['max_depth'], 
#                                               min_impurity_decrease=best_params['min_impurity_decrease'], 
#                                               min_samples_leaf=best_params['min_samples_leaf'], 
#                                               min_samples_split=best_params['min_samples_split'])

#decision_tree.fit(feat_train, label_train)
#label_predict = decision_tree.predict(feat_test)
#accuracy = sk.metrics.accuracy_score(label_test, label_predict)

# inner & outer l
decision_acc = sk.model_selection.cross_val_score(grid_search, features, labels, cv=10)

print("Accuracy of decision tree with the best parameters and CV: ", decision_acc.mean()*100)



Accuracy of simple decision tree:  96.24556062912227
Accuracy of decision tree with the best parameters and CV:  96.49362456121374


# Naive Bayes

In [9]:
# naive bayes
# TODO: maybe do confusion matrix??? Just to analyze model more, maybe roc curve is enough?
# simple with CV:

naive_bayes = sk.naive_bayes.GaussianNB()
scores = sk.model_selection.cross_val_score(naive_bayes, features, labels, cv=10)

print("Accuracy:", scores.mean()*100)

feat_train, feat_test, label_train, label_test = sk.model_selection.train_test_split(features, labels, test_size=0.2)
naive_bayes = sk.naive_bayes.GaussianNB()
naive_bayes.fit(feat_train, label_train)
# This will return a 2D numpy array with one row for each datapoint in the test set and 2 columns. 
# Column index 0 is the probability that this datapoint is in class 0, and column index 1 is the 
# probability that this datapoint is in class 1.
proba = naive_bayes.predict_proba(feat_test)

roc = sk.metrics.roc_curve(label_test, proba[:, 1])

roc_auc = sk.metrics.roc_auc_score(label_test, proba[:, 1])
acc = naive_bayes.score(feat_train, label_train)

print("ROC AUC score, how good is this model?: ", roc_auc)
print("Accuracy of this model: ", acc*100)


Accuracy: 92.86071127799798
ROC AUC score, how good is this model?:  0.9495163940116158
Accuracy of this model:  92.97811607992388


# SVM

In [None]:
from sklearn.svm import SVC

scaler_svm = StandardScaler()
pca_redux_svm = PCA()
svm_obj = SVC()
pipe_svm = Pipeline([('scale', scaler_svm), ('pca', pca_redux_svm), ('svm', svm_obj)])
print("start")
param_grid = {
    'pca__n_components': list(range(5, 13)),
    'svm__kernel': ['linear', 'rbf', 'poly']
}
print("before grid")
grid_svm = GridSearchCV(pipe_svm, param_grid, cv=5)
print("before predict")
pred_svm = cross_val_predict(grid_svm, features, labels, cv=5)
print("done")
print("Accuracy:", pred_svm.mean()*100)

report_svm = classification_report(labels, pred_svm)
print(report_svm)



here0
here1
here2


# Nearest Neighbors

In [1]:
# scaling

standard_scaler = sk.preprocessing.StandardScaler()
pca = sk.decomposition.PCA()
knn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)
pipeline = sk.pipeline.Pipeline(steps=[('standard_scaler', standard_scaler), ('pca', pca), ('knn', knn)])

# inner loop
scores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)
#print("Accuracy:", scores.mean()*100)

param_grid = {
    'pca__n_components': list(range(1, 14)),
    'knn__n_neighbors': list(range(1, 25))
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(features, labels)
print("best params: ", grid_search.best_params_)
#print("Accuracy: ", grid_search.best_score_*100)

# this does the nested loop
scores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)

print("Accuracy:", scores.mean()*100)


'\nstandard_scaler = sk.preprocessing.StandardScaler()\npca = sk.decomposition.PCA()\nknn = sk.neighbors.KNeighborsClassifier(n_neighbors=7)\npipeline = sk.pipeline.Pipeline(steps=[(\'standard_scaler\', standard_scaler), (\'pca\', pca), (\'knn\', knn)])\n\n# inner loop\nscores = sk.model_selection.cross_val_score(pipeline, features, labels, cv=5)\n#print("Accuracy:", scores.mean()*100)\n\nparam_grid = {\n    \'pca__n_components\': list(range(1, 14)),\n    \'knn__n_neighbors\': list(range(1, 25))\n}\n\ngrid_search = GridSearchCV(pipeline, param_grid, cv=5)\ngrid_search.fit(features, labels)\nprint("best params: ", grid_search.best_params_)\n#print("Accuracy: ", grid_search.best_score_*100)\n\n# this does the nested loop\nscores = sk.model_selection.cross_val_score(grid_search, features, labels, cv=5)\n\nprint("Accuracy:", scores.mean()*100)\n\n'

# Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

scaler_nn = sk.preprocessing.StandardScaler()
mlp_nn = MLPClassifier()
pipe_nn = sk.pipeline.Pipeline([('scale', scaler_nn), ('nn', mlp_nn)])
param_grid_nn = {
    'nn__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
    'nn__activation': ['logistic', 'tanh', 'relu']
}
grid_nn = GridSearchCV(pipe_nn, param_grid_nn, cv=5)
pred_nn = cross_val_score(grid_nn, features, labels, cv=5)
print("Accuracy:", pred_nn.mean()*100)

# Ensemble Method 

In [None]:
# your code goes here
from sklearn.ensemble import RandomForestClassifier

params_rf = {"max_depth": list(range(35,56)), "min_samples_leaf": [8,10,12], "max_features": ['sqrt','log2']}

rf = sklearn.ensemble.RandomForestClassifier()
grid_search_rf = GridSearchCV(rf, params_rf, cv=5)
pred_rf = sk.model_selection.cross_val_score(grid_search_rf, features, labels, cv=5)

print("Accuracy:", pred_rf.mean()*100)



from sklearn.ensemble import AdaBoostClassifier

boost_clf = AdaBoostClassifier(n_estimators = 150)

pred_boost = cross_val_score(boost_clf, dataX, dataY, cv=5)

print("Accuracy:", pred_boost.mean()*100)

# Predicting