In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import random as rand
import sys
import collections
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.decomposition import PCA
#from sklearn.metrics.cluster import fowlkes_mallows_score
import seaborn as sn
import joblib
import re
import csv
import datetime
import time

In [2]:
def split_train_test(train_data,test_data, pca=False):
    '''
    Will return arrays of these sizes:
        X_train : (752494, 41)
        y_train : (752494, )

        X_test : (322498, 41)
        y_test : (322498, )
    '''

    # convert all but last column to list of lists for data
    X_train = np.array(train_data.iloc[:,:-1].values.tolist())
    X_train = StandardScaler().fit_transform(X_train)  # mean of ~0 and variance of 1
    # convert last column to list for labels
    y_train = np.array(train_data.iloc[:,-1].values.tolist())

    # convert all but last column to list of lists for data
    X_test = np.array(test_data.iloc[:,:-1].values.tolist())
    X_test = StandardScaler().fit_transform(X_test)  # mean of ~0 and variance of 1
    # convert last column to list for labels
    y_test = np.array(test_data.iloc[:,-1].values.tolist())
    
    
    # pca is optional
    desc = ""
    if pca:
        X_train,X_test = pca_data(X_train,X_test)
        desc = "_PCA"

    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
    np.save("X_test%s.npy" % desc, X_test)
   # np.save("X_test%s.npy" % desc, X_test)
    np.save("y_test%s.npy" % desc, y_test)
   # np.save("y_test%s.npy" % desc, y_test)
    return X_train, y_train, X_test, y_test

In [10]:
def pca_data(X_train,X_test):
    pca = PCA()
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    #get variance explained
    explained_variance = pca.explained_variance_ratio_

    #select what percent var to keep
    desired_var = 0.9  # try out different values for this, make graph
    #select how many eigenvalues to keep
    cumsum = np.cumsum(explained_variance)
    k = np.argwhere(cumsum > desired_var)[0]

    pca = PCA(n_components=int(k))
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    return X_train, X_test

In [11]:
def main():
    # assume that if x_train doesn't exist, then none of the sets exist
    pca = True
    if (not os.path.exists('x_test.npy') and not pca) or (not os.path.exists("X_test_PCA.npy") and pca):
        train_data = pd.read_csv('../data/train_nodup.csv')
        test_data = pd.read_csv('../data/test_nodup.csv')
        X_train, y_train, X_test, y_test = split_train_test(train_data,test_data, pca=pca)
        print("Transformed data")
    else:
        desc = ""
        if pca:
            desc = "_PCA"
        X_train = np.load("X_train%s.npy" % desc)
        X_test = np.load("X_test%s.npy" % desc)
        y_train = np.load("y_train%s.npy" % desc)
        y_test = np.load("y_test%s.npy" % desc)
        print("Loaded in data")

        
    print(len(X_train))
    print(len(y_train))
    print(X_train[0])
    print(y_train[0])
    
    print(len(X_test))
    print(len(y_test))
    print(X_test[0])
    print(y_test[0])
#    result = knn(X_train,X_test,y_train,y_test)
#    print("KNN F1-Score: %lf" %(result))
#    result = dt(X_train,X_test,y_train,y_test)
#    print("Decision Tree F1-Score: %lf" %(result))  
#    start = time.time()
#    result = sgd(X_train,X_test,y_train,y_test)
#    print("SGD Classifier (SVM w/ SGD training) F1-Score: %lf" %(result))
#    print("SGD Time: %.3lf" % (time.time() - start))
#    result = mlp(X_train,X_test,y_train,y_test)
#    print("MLP F1-Score: %lf" %(result))
#    result = gnb(X_train,X_test,y_train,y_test)
#    print("Naive-Bayes F1-Score: %lf" %(result))
#    result = rf(X_train,X_test,y_train,y_test)
#    print("Random Forest F1-Score: %lf" %(result))
#    start = time.time()
#    result = linSVC(X_train,X_test,y_train,y_test)
#    print("Linear SVC F1-Score: %lf" %(result))
#    print("SGD Time: %.3lf" % (time.time() - start))

In [12]:
main()

Transformed data
1074992
1074992
[ 5.75126242e+00 -1.70509318e+00 -5.33800714e-01  4.09963237e-01
 -8.41918805e-02 -9.81417699e-03  2.83175681e-01 -2.55221073e-01
 -7.92219459e-02 -4.84546833e-02  1.83756603e-02  6.50642274e-03
  9.54654281e-03 -5.26766418e-03 -3.27437041e-02 -6.56413875e-02
 -1.92198816e-02  1.26038534e-01  2.43348913e-01 -8.01061798e-02]
neptune
77291
77291
[-0.8918304  -0.93719466  1.09077414 -1.00292607 -0.47380975  1.39790852
 -0.25914386  0.35980159  0.46438818 -0.49207928  0.20906435 -0.063602
 -0.1026246   0.03688954  0.12224632  0.09845588  0.12020273  1.12370159
 -1.33548419  0.26763392]
normal


## Decision Tree

In [5]:
def dt(X_train,X_test,y_train,y_test):
    classifier = DecisionTreeClassifier(random_state=0)
    classifier.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

------------------------

## Random Forest (Original Data)

In [6]:
def rf(X_train,X_test,y_train,y_test):
    RFclas = RandomForestClassifier(max_depth=10, random_state=0,n_estimators=100)
    RFclas.fit(X_train,y_train)
    y_pred = RFclas.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

-----------

## SGDClassifier (Linear SVC with SGD training)

In [5]:
def sgd(X_train,X_test,y_train,y_test):
    clf = SGDClassifier()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

----------

## MLP(NN) (Original Data) -- Note: np.abs() needs to be used on prediction because it guesses negative values sometimes... oh well

In [8]:
def mlp(X_train,X_test,y_train,y_test):
    bpnn = MLPClassifier(max_iter = 50000) #Very basic BPNN/MLP
    bpnn.fit(X_train,y_train)
    y_pred = bpnn.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

-------

## kNN 

In [9]:
def knn(X_train,X_test,y_train,y_test):
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

--------

## Linear SVC

In [4]:
def linSVC(X_train,X_test,y_train,y_test):    
    svc = LinearSVC(random_state=0)
    svc.fit(X_train,y_train)
    y_pred = svc.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score

--------

## Gaussian Naive-Bayes

In [11]:
def gnb(X_train,X_test,y_train,y_test): 
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
    return f1_score