## Import libraries and get data as letters:

In [5]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

# Load the training table and keep only its first 10,000 rows.
x_train = pd.read_csv('train.csv').iloc[:10000]

# Break every sequence string into single characters, one column per position
# (the new columns are labelled 0, 1, 2, ...).
letters = x_train['Sequence'].apply(lambda seq: pd.Series(list(seq)))

# Replace the raw 'Sequence' column with the per-position letter columns.
x_train = x_train.drop(columns='Sequence').join(letters)
print(x_train)

#x_test = pd.read_csv('test.csv')
#letters = x_test['Sequence'].apply(lambda x: pd.Series(list(x)))
#x_test = x_test.drop('Sequence', axis = 1)
#x_test = x_test.join(letters)

      Active  0  1  2  3
0          0  D  K  W  L
1          0  F  C  H  N
2          0  K  D  Q  P
3          0  F  N  W  I
4          0  N  K  R  M
...      ... .. .. .. ..
9995       0  C  P  M  Q
9996       0  W  T  P  Q
9997       0  N  T  E  S
9998       0  F  A  L  D
9999       0  F  S  G  Q

[10000 rows x 5 columns]


## One-hot encoding:

In [10]:
# Expand the four letter-position columns (0..3) into sparse indicator columns
# named '<position>_<letter>'; the 'Active' column is carried over unchanged.
x_train_oh = pd.get_dummies(x_train, columns=list(range(4)), sparse=True)
print(x_train_oh)

#x_test_oh = pd.get_dummies(data=x_test, columns=[0, 1, 2, 3], sparse=True)

      Active  0_A  0_C  0_D  0_E  0_F  0_G  0_H  0_I  0_K  ...  3_M  3_N  3_P  \
0          0    0    0    1    0    0    0    0    0    0  ...    0    0    0   
1          0    0    0    0    0    1    0    0    0    0  ...    0    1    0   
2          0    0    0    0    0    0    0    0    0    1  ...    0    0    1   
3          0    0    0    0    0    1    0    0    0    0  ...    0    0    0   
4          0    0    0    0    0    0    0    0    0    0  ...    1    0    0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
9995       0    0    1    0    0    0    0    0    0    0  ...    0    0    0   
9996       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9997       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9998       0    0    0    0    0    1    0    0    0    0  ...    0    0    0   
9999       0    0    0    0    0    1    0    0    0    0  ...    0    0    0   

      3_Q  3_R  3_S  3_T  3

In [23]:
# Baseline neural network: three tanh hidden layers of 90 units each, trained
# with Adam (parameters are described in the scikit-learn MLPClassifier docs).
# NOTE(review): per those docs `learning_rate` is only used when solver='sgd',
# so 'adaptive' presumably has no effect here -- confirm before relying on it.
clf = MLPClassifier(
    hidden_layer_sizes=(90, 90, 90),
    activation='tanh',
    solver='adam',
    alpha=1e-4,
    learning_rate='adaptive',
    max_iter=200,
    random_state=1,
)
#clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
#pred = clf.predict(x_train_oh.iloc[:, 1:])
#print(pred)



In [24]:
# Maybe split into training and validation sets?

### CV F1 score:

In [25]:
# 5-fold cross-validation of `clf`: refit on each training split and record the
# F1 score obtained on the corresponding held-out split.
kf = KFold(n_splits=5)
features = x_train_oh.iloc[:, 1:]  # every column after the first (the one-hot indicators)
labels = x_train_oh.iloc[:, 0]     # first column, i.e. the 'Active' target

f1_scores = []
# Split the same frame that is indexed inside the loop, so the positional
# indices are guaranteed to line up with the rows being selected.
for train_indices, test_indices in kf.split(features):
    clf.fit(features.iloc[train_indices], labels.iloc[train_indices])
    pred = clf.predict(features.iloc[test_indices])
    # f1_score's signature is (y_true, y_pred); score once and reuse the value
    # for both the running list and the per-fold printout.
    fold_f1 = f1_score(labels.iloc[test_indices], pred)
    f1_scores.append(fold_f1)
    print(fold_f1)
print("Mean F1 score:", np.array(f1_scores).mean())

0.7007299270072992
0.7234042553191489
0.7101449275362318
0.7887323943661971
0.6967741935483872
Mean F1 score: 0.7239571395554528


In [None]:
# TODO: plot convergence of solver

### Search to find parameters that give best F1 score:

In [28]:
# Base estimator for the search: solver, activation, iteration cap and seed are
# fixed here; only the values listed in `parameter_space` are varied.
mlp = MLPClassifier(
    solver='adam',
    activation='tanh',
    max_iter=100,
    random_state=1,
)

# Candidate values for each tuned hyper-parameter:
parameter_space = dict(
    hidden_layer_sizes=[(100, 100), (100, 100, 100), (1000, 1000)],
    #activation=['tanh', 'relu', 'logistic'],
    #solver=['sgd', 'adam'],
    alpha=[1e-5, 1e-4, 1e-3],
)

# Exhaustive search over the grid with 4-fold CV, ranked by F1, n_jobs=-1:
clf = GridSearchCV(
    estimator=mlp,
    param_grid=parameter_space,
    scoring='f1',
    cv=4,
    n_jobs=-1,
)
clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
print(
    'Parameters', clf.best_params_,
    "gives best score:", clf.best_score_,
)
print(clf.cv_results_)

KeyboardInterrupt: 

In [None]:
#  "the optimal size of the hidden layer is usually between the size of the input and size of the output layers"
#  "number of neurons = 0.005 * number of samples?"

# Easy: 0.607427055703
# Medium: 0.852643419573
# Hard: 0.89591280654