## Import libraries and get data as letters:

In [None]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

x_train = pd.read_csv('train.csv')

#x_train = x_train[0:10000]

letters = x_train['Sequence'].apply(lambda x: pd.Series(list(x)))
x_train = x_train.drop('Sequence', axis = 1)
x_train = x_train.join(letters)
print(x_train)

x_test = pd.read_csv('test.csv')
letters = x_test['Sequence'].apply(lambda x: pd.Series(list(x)))
x_test = x_test.drop('Sequence', axis = 1)
x_test = x_test.join(letters)
print(x_test)

## One hot encoding:

In [None]:
x_train_oh = pd.get_dummies(data=x_train, columns=[0, 1, 2, 3], sparse=True)
print(x_train_oh)

x_test_oh = pd.get_dummies(data=x_test, columns=[0, 1, 2, 3], sparse=True)
print(x_test_oh)

In [90]:
# TODO: fix unbalanced data for better performance. Easiest is to oversample the 1's. Its worth a shot at least

112000

In [None]:
# Implement Neural network <-- there is documentation in scikit
l1_size = int(0.006 * x_train_oh.shape[0])
l2_size = int(0.008 * x_train_oh.shape[0])
clf = MLPClassifier(solver='adam', alpha=1e-3, hidden_layer_sizes=(l1_size, l2_size), random_state=1, activation='tanh', max_iter=200)
#clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
#pred = clf.predict(x_test_oh)
#print(pred)

In [83]:
# Maybe split into training and validating sets?
#df = pd.DataFrame(pred[1:], columns=[1]) 
#print(df)
#df.to_csv('out.csv', index=False)

### CV F1 score:

In [None]:
kf = KFold(n_splits=5)
f1_scores = []
for train_indices, test_indices in kf.split(x_train):
    clf.fit(x_train_oh.iloc[train_indices, 1:], x_train_oh.iloc[train_indices, 0])
    pred = clf.predict(x_train_oh.iloc[test_indices, 1:])
    #print(clf.score(x_train_oh.iloc[test_indices, 1:], x_train_oh.iloc[test_indices, 0]))
    f1_scores.append(f1_score(pred, x_train_oh.iloc[test_indices, 0]))
    print(f1_score(pred, x_train_oh.iloc[test_indices, 0]))
print("Mean F1 score:", np.array(f1_scores).mean())

In [None]:
# TODO: plot convergence of solver

### Search to find parameters that give best F1 score:

In [53]:
mlp = MLPClassifier(random_state=1, alpha=1e-3, max_iter=100, solver='adam', activation='tanh')

# All parameters we want to try:
parameter_space = {
    'hidden_layer_sizes': [(60, 80), (50, 80), (70, 80), (60, 70), (60, 90)],
    #'activation': ['tanh', 'relu', 'logistic'],
    #'solver': ['sgd', 'adam'],
    #'alpha': [1e-3],
}

# 10000 * 0.006

# Do grid search over all parameter options:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=4, scoring='f1')
clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
print('Parameters', clf.best_params_, "gives best score:", clf.best_score_)
print(clf.cv_results_.mean_test_score)

Parameters {'hidden_layer_sizes': (60, 80)} gives best score: 0.7540308323707317
{'mean_fit_time': array([19.00803643, 19.31100678, 15.98581493, 17.03055614, 22.3140865 ]), 'std_fit_time': array([0.28902677, 0.14915076, 0.19945387, 0.10223015, 0.12321142]), 'mean_score_time': array([0.15239179, 0.03189653, 0.0651148 , 0.03495723, 0.04025084]), 'std_score_time': array([0.13857244, 0.00214542, 0.01033127, 0.00980135, 0.02924387]), 'param_hidden_layer_sizes': masked_array(data=[(60, 80), (50, 80), (70, 80), (60, 70), (60, 90)],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'hidden_layer_sizes': (60, 80)}, {'hidden_layer_sizes': (50, 80)}, {'hidden_layer_sizes': (70, 80)}, {'hidden_layer_sizes': (60, 70)}, {'hidden_layer_sizes': (60, 90)}], 'split0_test_score': array([0.74725275, 0.63783784, 0.68478261, 0.73142857, 0.73195876]), 'split1_test_score': array([0.78651685, 0.74576271, 0.74576271, 0.75449102, 0.77837838]), 's



In [None]:
#  "the optimal size of the hidden layer is usually between the size of the input and size of the output layers"
#  "number of neurons = 0.005 * number of samples?" 2/3 size of input is also usual

# Easy: 0.607427055703
# Medium: 0.852643419573
# Hard: 0.89591280654