In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import random as rand
import sys
import collections
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.decomposition import PCA
#from sklearn.metrics.cluster import fowlkes_mallows_score
import seaborn as sn
import joblib
import re
import csv
import datetime
import time

In [2]:
def split_train_test(data, pca=False):
    '''
    Standardize the features, optionally reduce them with PCA, split the rows
    70/30 into train and test sets, and cache the four arrays as .npy files.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column except the last is used as a feature; the last column
        is used as the label.
    pca : bool, default False
        When True the standardized features are passed through pca_data()
        and the saved file names get a "_PCA" suffix.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Side effects
    ------------
    Writes X_train<suffix>.npy, X_test<suffix>.npy, y_train<suffix>.npy and
    y_test<suffix>.npy into the current working directory.

    Author's note on the sizes obtained for the original data set
    (without PCA; with PCA the second dimension of X is whatever
    pca_data() keeps):
        X_train : (752494, 41)
        y_train : (752494, )

        X_test : (322498, 41)
        y_test : (322498, )
    '''

    # Features: take the float ndarray directly instead of round-tripping
    # every value through nested Python lists.
    X = data.iloc[:, :-1].to_numpy(dtype=np.float64)

    # NOTE(review): the scaler (and the optional PCA below) are fit on ALL
    # rows before the split, so test rows influence the transform applied to
    # the training rows. Consider fitting on the training split only.
    X = StandardScaler().fit_transform(X)  # mean of ~0 and variance of 1

    # Labels: rebuilt from a Python list so NumPy infers a native dtype
    # (e.g. string labels become a fixed-width unicode array rather than an
    # object array), which keeps the saved file loadable by a plain np.load.
    y = np.array(data.iloc[:, -1].tolist())

    # pca is optional
    desc = ""
    if pca:
        X = pca_data(X)
        desc = "_PCA"

    # Fixed seed so the same rows land in train/test on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

    # Cache the split next to the notebook.
    np.save("X_train%s.npy" % desc, X_train)
    np.save("X_test%s.npy" % desc, X_test)
    np.save("y_train%s.npy" % desc, y_train)
    np.save("y_test%s.npy" % desc, y_test)
    return X_train, X_test, y_train, y_test

In [3]:
def pca_data(X, desired_var=0.9):
    '''
    Project X onto the smallest number of leading principal components whose
    cumulative explained-variance ratio exceeds ``desired_var``.

    Parameters
    ----------
    X : array-like, 2-D (samples x features)
        Data to reduce. The caller in this notebook passes standardized
        features.
    desired_var : float, default 0.9
        Fraction of the total variance to retain. If the cumulative ratio
        never exceeds this value (e.g. ``desired_var >= 1``), all components
        are kept; at least one component is always kept.

    Returns
    -------
    numpy.ndarray
        The PCA scores of X restricted to the selected leading components,
        shape (n_samples, n_kept).
    '''
    pca = PCA()
    scores = pca.fit_transform(X)

    # Cumulative share of variance explained by the first 1, 2, ... components.
    cumsum = np.cumsum(pca.explained_variance_ratio_)

    # side="right" gives the first 0-based index whose cumulative value is
    # strictly greater than desired_var (cumsum is non-decreasing).
    first_over = int(np.searchsorted(cumsum, desired_var, side="right"))

    # +1 turns that 0-based index into a component COUNT; the min() covers a
    # threshold that is never crossed.
    n_keep = min(first_over + 1, cumsum.size)

    # Components come out ordered by explained variance, so the leading
    # columns of the full projection are the reduced projection; no second
    # fit is needed.
    return scores[:, :n_keep]

In [15]:
def main(pca=True):
    '''
    Load (or build and cache) the train/test split, then train and time the
    SGD and Linear SVC classifiers, printing each weighted F1 score and the
    elapsed wall-clock time.

    Parameters
    ----------
    pca : bool, default True
        Selects which cached split to use: the "_PCA"-suffixed files when
        True, the unsuffixed files otherwise. Passed through to
        split_train_test() when the cache has to be rebuilt.
    '''
    desc = "_PCA" if pca else ""
    # Same file names that split_train_test() writes (note the capital X).
    cache_files = ["%s%s.npy" % (stem, desc)
                   for stem in ("X_train", "X_test", "y_train", "y_test")]

    # Only trust the cache when all four arrays are present; otherwise rebuild.
    if all(os.path.exists(path) for path in cache_files):
        X_train, X_test, y_train, y_test = (np.load(path) for path in cache_files)
        print("Loaded in data")
    else:
        data = pd.read_csv('../data/train_nodup.csv')
        X_train, X_test, y_train, y_test = split_train_test(data, pca=pca)
        print("Transformed data")

    # The other classifiers defined in this notebook (knn, dt, mlp, gnb, rf)
    # take the same four arguments and return a weighted F1 score, so they
    # can be evaluated the same way as the two below.
    start = time.time()
    result = sgd(X_train,X_test,y_train,y_test)
    print("SGD Classifier (SVM w/ SGD training) F1-Score: %lf" %(result))
    print("SGD Time: %.3lf" % (time.time() - start))

    start = time.time()
    result = linSVC(X_train,X_test,y_train,y_test)
    print("Linear SVC F1-Score: %lf" %(result))
    print("Linear SVC Time: %.3lf" % (time.time() - start))

In [None]:
# Run the experiment defined in main().
# NOTE: main() calls sgd() and linSVC(), which are defined in cells further
# down this notebook; those cells must have been executed before this one.
main()

Loaded in data
SGD Classifier (SVM w/ SGD training) F1-Score: 0.987635
SGD Time: 30.137


## Decision Tree

In [5]:
def dt(X_train, X_test, y_train, y_test):
    '''
    Fit a decision tree (random_state=0) on the training split and return
    the weighted F1 score of its predictions on the test split.
    '''
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X_train, y_train)
    predicted = tree.predict(X_test)
    return metrics.f1_score(y_test, predicted, average='weighted')

------------------------

## Random Forest (Original Data)

In [6]:
def rf(X_train, X_test, y_train, y_test):
    '''
    Fit a 100-tree random forest (max depth 10, random_state=0) on the
    training split and return the weighted F1 score on the test split.
    '''
    forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)
    return metrics.f1_score(y_test, predicted, average='weighted')

-----------

## SGDClassifier (Linear SVC with SGD training)

In [5]:
def sgd(X_train, X_test, y_train, y_test, random_state=0):
    '''
    Fit an SGDClassifier on the training split and return the weighted F1
    score of its predictions on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for prediction.
    y_train, y_test : array-like
        Labels for fitting and for scoring.
    random_state : int or None, default 0
        Seed forwarded to SGDClassifier so repeated runs give the same
        score, matching the fixed seed used by the other classifiers in
        this notebook. Pass None for an unseeded run.

    Returns
    -------
    float
        Weighted F1 score on the test split.
    '''
    clf = SGDClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return metrics.f1_score(y_test, y_pred, average='weighted')

----------

## MLP (NN) (Original Data) -- Note: `np.abs()` needs to be applied to the predictions because the model sometimes predicts negative values.

In [8]:
def mlp(X_train, X_test, y_train, y_test, random_state=0):
    '''
    Fit a default-architecture MLPClassifier (max_iter=50000) on the training
    split and return the weighted F1 score of its predictions on the test
    split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for prediction.
    y_train, y_test : array-like
        Labels for fitting and for scoring.
    random_state : int or None, default 0
        Seed forwarded to MLPClassifier so repeated runs give the same
        score, matching the fixed seed used by the other classifiers in
        this notebook. Pass None for an unseeded run.

    Returns
    -------
    float
        Weighted F1 score on the test split.
    '''
    bpnn = MLPClassifier(max_iter=50000, random_state=random_state)  # very basic BPNN/MLP
    bpnn.fit(X_train, y_train)
    y_pred = bpnn.predict(X_test)
    return metrics.f1_score(y_test, y_pred, average='weighted')

-------

## kNN 

In [9]:
def knn(X_train, X_test, y_train, y_test):
    '''
    Fit a 1-nearest-neighbour classifier on the training split and return
    the weighted F1 score of its predictions on the test split.
    '''
    nearest = KNeighborsClassifier(n_neighbors=1)
    nearest.fit(X_train, y_train)
    predicted = nearest.predict(X_test)
    return metrics.f1_score(y_test, predicted, average='weighted')

--------

## Linear SVC

In [4]:
def linSVC(X_train, X_test, y_train, y_test):
    '''
    Fit a LinearSVC (random_state=0) on the training split and return the
    weighted F1 score of its predictions on the test split.
    '''
    model = LinearSVC(random_state=0)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return metrics.f1_score(y_test, predicted, average='weighted')

--------

## Gaussian Naive-Bayes

In [11]:
def gnb(X_train, X_test, y_train, y_test):
    '''
    Fit a Gaussian Naive-Bayes classifier on the training split and return
    the weighted F1 score of its predictions on the test split.
    '''
    bayes = GaussianNB()
    bayes.fit(X_train, y_train)
    predicted = bayes.predict(X_test)
    return metrics.f1_score(y_test, predicted, average='weighted')