In [1]:
import os
data_directory = "java_data_pickled/"
files = os.listdir(data_directory)
print(files[:1])
print(len(files))

['feature_99764.pkl']
107


In [2]:
# Original source from following
# Edited by Young Seok Kim

# http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
# 

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
def run_models(X, y):
    h = .02  # step size in the mesh
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
             "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
             "Naive Bayes", "QDA"]
    
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]    
    
    # Split test / train sets
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.3, random_state=42)
        
    # Normalize datasets only based on train set
    Normalizer = StandardScaler()
    Normalizer.fit(X_train)
    X_train = Normalizer.transform(X_train)
    X_test = Normalizer.transform(X_test)

    accuracies = {} # classifier_name : accuracy
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(name)
        print('accuracy : %.4f' % score)
        accuracies[name] = score
    
    return accuracies

In [4]:
%%time
import pickle

best_model = ""
best_accuracy = 0.0
for file in files:
    print('-------------------------------------------------')
    with open(data_directory + file, "rb") as f:
        print(file)
        X, y = pickle.load(f)
        accuracies = run_models(X,y)
        for model_name, accuracy in accuracies.items():
            if best_accuracy < accuracy :
                best_accuracy = accuracy
                best_model = file + " " + model_name
print("Done!")

-------------------------------------------------
feature_99764.pkl
Nearest Neighbors
accuracy : 0.8444
Linear SVM
accuracy : 0.4967
RBF SVM
accuracy : 0.8411
Gaussian Process
accuracy : 0.8510
Decision Tree
accuracy : 0.7500
Random Forest
accuracy : 0.6937
Neural Net
accuracy : 0.5811
AdaBoost
accuracy : 0.6871
Naive Bayes
accuracy : 0.5199
QDA
accuracy : 0.5513
-------------------------------------------------
feature_9474.pkl
Nearest Neighbors
accuracy : 0.8411
Linear SVM
accuracy : 0.5265
RBF SVM
accuracy : 0.8212
Gaussian Process
accuracy : 0.8228
Decision Tree
accuracy : 0.7417
Random Forest
accuracy : 0.6887
Neural Net
accuracy : 0.6424
AdaBoost
accuracy : 0.6474
Naive Bayes
accuracy : 0.5017
QDA
accuracy : 0.5861
-------------------------------------------------
feature_93083.pkl
Nearest Neighbors
accuracy : 0.8146
Linear SVM
accuracy : 0.5844
RBF SVM
accuracy : 0.8113
Gaussian Process
accuracy : 0.8228
Decision Tree
accuracy : 0.7152
Random Forest
accuracy : 0.7500
Neural Net


Gaussian Process
accuracy : 0.8990
Decision Tree
accuracy : 0.7351
Random Forest
accuracy : 0.7599
Neural Net
accuracy : 0.7119
AdaBoost
accuracy : 0.7649
Naive Bayes
accuracy : 0.6060
QDA
accuracy : 0.6457
-------------------------------------------------
feature_95922.pkl
Nearest Neighbors
accuracy : 0.8990
Linear SVM
accuracy : 0.6805
RBF SVM
accuracy : 0.8990
Gaussian Process
accuracy : 0.9007
Decision Tree
accuracy : 0.7550
Random Forest
accuracy : 0.7599
Neural Net
accuracy : 0.7136
AdaBoost
accuracy : 0.7583
Naive Bayes
accuracy : 0.6291
QDA
accuracy : 0.6722
-------------------------------------------------
feature_14784.pkl
Nearest Neighbors
accuracy : 0.8609
Linear SVM
accuracy : 0.6639
RBF SVM
accuracy : 0.8609
Gaussian Process
accuracy : 0.8692
Decision Tree
accuracy : 0.7732
Random Forest
accuracy : 0.7467
Neural Net
accuracy : 0.7401
AdaBoost
accuracy : 0.6970
Naive Bayes
accuracy : 0.6672
QDA
accuracy : 0.6987
-------------------------------------------------
feature_688

Gaussian Process
accuracy : 0.8891
Decision Tree
accuracy : 0.6904
Random Forest
accuracy : 0.7450
Neural Net
accuracy : 0.6954
AdaBoost
accuracy : 0.6755
Naive Bayes
accuracy : 0.6026
QDA
accuracy : 0.6325
-------------------------------------------------
feature_5245.pkl
Nearest Neighbors
accuracy : 0.8593
Linear SVM
accuracy : 0.5960
RBF SVM
accuracy : 0.8709
Gaussian Process
accuracy : 0.8692
Decision Tree
accuracy : 0.7632
Random Forest
accuracy : 0.7897
Neural Net
accuracy : 0.7384
AdaBoost
accuracy : 0.7285
Naive Bayes
accuracy : 0.5944
QDA
accuracy : 0.6589
-------------------------------------------------
feature_55127.pkl
Nearest Neighbors
accuracy : 0.9172
Linear SVM
accuracy : 0.6954
RBF SVM
accuracy : 0.9007
Gaussian Process
accuracy : 0.9172
Decision Tree
accuracy : 0.7666
Random Forest
accuracy : 0.8344
Neural Net
accuracy : 0.7616
AdaBoost
accuracy : 0.6175
Naive Bayes
accuracy : 0.6689
QDA
accuracy : 0.6921
-------------------------------------------------
feature_4231

Gaussian Process
accuracy : 0.9139
Decision Tree
accuracy : 0.7566
Random Forest
accuracy : 0.7715
Neural Net
accuracy : 0.7517
AdaBoost
accuracy : 0.7583
Naive Bayes
accuracy : 0.6805
QDA
accuracy : 0.7169
-------------------------------------------------
feature_23944.pkl
Nearest Neighbors
accuracy : 0.8228
Linear SVM
accuracy : 0.5116
RBF SVM
accuracy : 0.8013
Gaussian Process
accuracy : 0.8212
Decision Tree
accuracy : 0.7070
Random Forest
accuracy : 0.7434
Neural Net
accuracy : 0.5480
AdaBoost
accuracy : 0.4768
Naive Bayes
accuracy : 0.5265
QDA
accuracy : 0.5497
-------------------------------------------------
feature_44538.pkl
Nearest Neighbors
accuracy : 0.8560
Linear SVM
accuracy : 0.6275
RBF SVM
accuracy : 0.8493
Gaussian Process
accuracy : 0.8609
Decision Tree
accuracy : 0.7020
Random Forest
accuracy : 0.7384
Neural Net
accuracy : 0.6755
AdaBoost
accuracy : 0.6623
Naive Bayes
accuracy : 0.6076
QDA
accuracy : 0.6391
-------------------------------------------------
feature_598

Gaussian Process
accuracy : 0.8328
Decision Tree
accuracy : 0.6987
Random Forest
accuracy : 0.7550
Neural Net
accuracy : 0.6871
AdaBoost
accuracy : 0.6805
Naive Bayes
accuracy : 0.6225
QDA
accuracy : 0.6457
-------------------------------------------------
feature_35905.pkl
Nearest Neighbors
accuracy : 0.9023
Linear SVM
accuracy : 0.6424
RBF SVM
accuracy : 0.9056
Gaussian Process
accuracy : 0.9073
Decision Tree
accuracy : 0.7632
Random Forest
accuracy : 0.7831
Neural Net
accuracy : 0.7252
AdaBoost
accuracy : 0.7384
Naive Bayes
accuracy : 0.6109
QDA
accuracy : 0.6507
-------------------------------------------------
feature_98691.pkl
Nearest Neighbors
accuracy : 0.8924
Linear SVM
accuracy : 0.6921
RBF SVM
accuracy : 0.8825
Gaussian Process
accuracy : 0.8974
Decision Tree
accuracy : 0.7219
Random Forest
accuracy : 0.7914
Neural Net
accuracy : 0.7715
AdaBoost
accuracy : 0.7003
Naive Bayes
accuracy : 0.6639
QDA
accuracy : 0.6871
-------------------------------------------------
feature_204

  " state: %s" % convergence_dict)


Gaussian Process
accuracy : 0.2682
Decision Tree
accuracy : 0.7583
Random Forest
accuracy : 0.7384
Neural Net
accuracy : 0.6937
AdaBoost
accuracy : 0.5033
Naive Bayes
accuracy : 0.6175
QDA
accuracy : 0.6374
-------------------------------------------------
feature_81693.pkl
Nearest Neighbors
accuracy : 0.8907
Linear SVM
accuracy : 0.6838
RBF SVM
accuracy : 0.8874
Gaussian Process
accuracy : 0.8957
Decision Tree
accuracy : 0.7351
Random Forest
accuracy : 0.8096
Neural Net
accuracy : 0.7467
AdaBoost
accuracy : 0.6043
Naive Bayes
accuracy : 0.6639
QDA
accuracy : 0.6838
-------------------------------------------------
feature_93052.pkl
Nearest Neighbors
accuracy : 0.9089
Linear SVM
accuracy : 0.7103
RBF SVM
accuracy : 0.9056
Gaussian Process
accuracy : 0.9139
Decision Tree
accuracy : 0.7997
Random Forest
accuracy : 0.8046
Neural Net
accuracy : 0.7566
AdaBoost
accuracy : 0.5099
Naive Bayes
accuracy : 0.6821
QDA
accuracy : 0.6970
-------------------------------------------------
feature_342

In [6]:
print(best_accuracy)
print(best_model)

0.9172185430463576
feature_87086.pkl Nearest Neighbors
