In [None]:
import pickle
import numpy as np
import math
from confusion_matrix import generate_confusion_matrix
def load_problem(file_name = "data.pickle"):
    """Load the train/test split stored in a pickle file.

    The file must unpickle to a mapping that contains the keys
    "x_train", "y_train", "x_test" and "y_test".

    Args:
        file_name: Path of the pickle file to read.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)`` taken from the mapping.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the four expected keys is missing.
    """
    # The context manager closes the file even when unpickling raises.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(file_name, 'rb') as f_myfile:
        data = pickle.load(f_myfile)
    return data["x_train"], data["y_train"], data["x_test"], data["y_test"]

# Element-wise label -> class-index encoder. The lambda resolves `label2one`
# when it is called, so the mapping only has to exist before the first call.
vfunc = np.vectorize(lambda label: label2one[label])
def accuracy(pred, y):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        pred: Array of per-class scores; the class axis is axis 1.
        y: Array of integer class indices (one per row of ``pred``), or None.

    Returns:
        0 when ``y`` is None, otherwise the number of rows whose argmax
        matches ``y`` divided by ``y.shape[0]``.
    """
    predicted_classes = np.argmax(pred, axis=1)
    if y is not None:
        n_correct = np.sum(predicted_classes == y)
        return n_correct / y.shape[0]
    return 0
def criterion(pred, y, eps=None):
    """Mean negative log of the probability assigned to each true class.

    Args:
        pred: Array of shape (n_samples, n_classes) with predicted
            probabilities.
        y: Integer class indices, one per row of ``pred``.
        eps: Optional lower bound applied to the selected probabilities
            before taking the log. With the default ``None`` nothing is
            clipped, so a probability of exactly 0 yields an infinite loss.

    Returns:
        The sum of ``-log(pred[i, y[i]])`` over all samples divided by the
        number of samples.

    Raises:
        ValueError: If ``y`` is empty or its length differs from the number
            of rows in ``pred``.
    """
    pred = np.asarray(pred)
    y = np.asarray(y)
    n_samples = y.shape[0]
    if n_samples == 0:
        raise ValueError("criterion requires at least one sample")
    if pred.shape[0] != n_samples:
        raise ValueError(
            "pred has {0} rows but y has {1} labels".format(pred.shape[0], n_samples)
        )

    # Pick, for every row, the probability of that row's true class.
    picked = pred[np.arange(n_samples), y]
    if eps is not None:
        # Keeps the loss finite when a model predicts probability 0.
        picked = np.clip(picked, eps, None)
    return -np.sum(np.log(picked)) / n_samples

# Location of the pickled dataset; the loading cell reads base_dir + filename.
base_dir, filename = "Data/", "save.pickle"

# Class label <-> integer class index lookups; one2label inverts label2one.
label2one = dict(B=0, S=1, X=2)
one2label = {index: label for label, index in label2one.items()}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load Data

In [None]:
# Read the pickled train/test split, then map the class labels to integer
# class indices with the vectorised label2one lookup.
data_path = base_dir + filename
x_train, y_train_raw, x_test, y_test_raw = load_problem(data_path)
total_row, n_features = x_train.shape  # presumably (samples, features) - confirm
y_train, y_test = vfunc(y_train_raw), vfunc(y_test_raw)

In [None]:
# Model zoo compared in this notebook. Hyper-parameters are spelled out so the
# configuration does not drift when library defaults change.
#
# The arguments `loss='deviance'`, `min_impurity_split=None` and
# `presort='auto'` were dropped: each only restated the library default of the
# releases that accepted it, and newer scikit-learn releases reject them.

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases; it
# is kept because it explicitly selects one-vs-rest - confirm against the
# installed version.
LR = LogisticRegression(penalty='l2',
                       dual=False,
                       tol=0.0001,
                       C=1.0,
                       fit_intercept=True,
                       intercept_scaling=1,
                       class_weight=None,
                       random_state=None,
                       solver='saga',
                       max_iter=100,
                       multi_class='ovr',
                       verbose=0,
                       warm_start=False,
                       n_jobs=1)
GBM = GradientBoostingClassifier(learning_rate=0.1,
                                 n_estimators=100,
                                 subsample=1.0,
                                 criterion='friedman_mse',
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_depth=3,
                                 min_impurity_decrease=0.0,
                                 init=None, random_state=None,
                                 max_features=None,
                                 max_leaf_nodes=None)
RF = RandomForestClassifier(n_estimators=15,
                            criterion='entropy',
                            max_depth=13,
                            min_samples_split=3,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0,
                            max_features=None,
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            bootstrap=True,
                            oob_score=False,
                            random_state=None)

# Parallel lists: names_of_est[i] is the display name of estimaters[i].
estimaters = [LR,GBM,RF]
names_of_est = ["Logistic Reg","GBM","Random Forest"]

In [None]:
# Fit every configured estimator on the training split, then report its
# confusion matrix, loss and accuracy on the test split.
for est_name, est in zip(names_of_est, estimaters):
    est.fit(x_train, y_train)
    # `prob` and `pred` stay bound to the last estimator's output after the loop.
    prob = est.predict_proba(x_test)
    pred = prob.argmax(axis=1)

    generate_confusion_matrix(y_test, pred)
    print("[{0}] loss: {1}".format(est_name, criterion(prob, y_test)))
    print("[{0}] Accuracy: {1}".format(est_name, accuracy(prob, y_test)))

In [None]:
# Run this cell only if the Random Forest loss printed above is inf: shifting
# every probability by 1e-4 keeps the log finite and gives an approximate loss.
# `prob` is whatever the training loop left behind (its last estimator).
rf_name = names_of_est[2]
print("[{0}] loss: {1}".format(rf_name, criterion(prob + 1e-4, y_test)))
print("[{0}] Accuracy: {1}".format(rf_name, accuracy(prob, y_test)))

# Plot Box

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_box(list_of_data,labels,title = "Accuracy"):
    """Show a box plot with one box per data series.

    Args:
        list_of_data: Sequence of equal-length 1-D series; each one becomes a
            column of the plotted matrix.
        labels: Labels for the boxes, one per series.
        title: Title drawn above the plot.
    """
    columns = np.column_stack(list_of_data)
    _, ax = plt.subplots(figsize=(8, 7))
    # showfliers=False hides the outlier markers.
    ax.boxplot(columns, showfliers=False, labels=labels)
    ax.set_title(title)
    plt.show()

# Synthetic accuracy samples for the box-plot demo: each model gets
# num_of_trials draws from a normal distribution with its own mean and spread.
num_of_trials = 20
title = "Accuracy"
labels = ["GBM","RNN","Deep","Logistic"]

# A fixed seed makes the figure identical on every Restart & Run All.
PLOT_SEED = 0
_plot_rng = np.random.RandomState(PLOT_SEED)
accu_GBM = _plot_rng.normal(size=(num_of_trials), loc=0.46, scale=0.01)
accu_RNN = _plot_rng.normal(size=(num_of_trials), loc=0.47, scale=0.005)
accu_Deep = _plot_rng.normal(size=(num_of_trials), loc=0.47, scale=0.007)
accu_Logit = _plot_rng.normal(size=(num_of_trials), loc=0.467, scale=0.001)

In [None]:
# One box per model, in the same order as `labels`; `title` is "Accuracy".
plot_box(
    [accu_GBM, accu_RNN, accu_Deep, accu_Logit],
    labels=labels,
    title=title,
)