## Import libraries and get data as letters:

In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

test = False

x_train = pd.read_csv('train.csv')

x_train = x_train[0:60000]

letters = x_train['Sequence'].apply(lambda x: pd.Series(list(x)))
x_train = x_train.drop('Sequence', axis = 1)
x_train = x_train.join(letters)
print(x_train)

if test:
    x_test = pd.read_csv('test.csv')
    letters = x_test['Sequence'].apply(lambda x: pd.Series(list(x)))
    x_test = x_test.drop('Sequence', axis = 1)
    x_test = x_test.join(letters)
    print(x_test)

       Active  0  1  2  3
0           0  D  K  W  L
1           0  F  C  H  N
2           0  K  D  Q  P
3           0  F  N  W  I
4           0  N  K  R  M
...       ... .. .. .. ..
59995       0  A  H  S  P
59996       0  T  G  E  L
59997       0  H  C  N  C
59998       0  G  D  E  E
59999       0  F  I  N  M

[60000 rows x 5 columns]


## One hot encoding:

In [2]:
x_train_oh = pd.get_dummies(data=x_train, columns=[0, 1, 2, 3], sparse=True)
print(x_train_oh)

if test:
    x_test_oh = pd.get_dummies(data=x_test, columns=[0, 1, 2, 3], sparse=True)
    print(x_test_oh)

       Active  0_A  0_C  0_D  0_E  0_F  0_G  0_H  0_I  0_K  ...  3_M  3_N  \
0           0    0    0    1    0    0    0    0    0    0  ...    0    0   
1           0    0    0    0    0    1    0    0    0    0  ...    0    1   
2           0    0    0    0    0    0    0    0    0    1  ...    0    0   
3           0    0    0    0    0    1    0    0    0    0  ...    0    0   
4           0    0    0    0    0    0    0    0    0    0  ...    1    0   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
59995       0    1    0    0    0    0    0    0    0    0  ...    0    0   
59996       0    0    0    0    0    0    0    0    0    0  ...    0    0   
59997       0    0    0    0    0    0    0    1    0    0  ...    0    0   
59998       0    0    0    0    0    0    1    0    0    0  ...    0    0   
59999       0    0    0    0    0    1    0    0    0    0  ...    1    0   

       3_P  3_Q  3_R  3_S  3_T  3_V  3_W  3_Y  
0        0    0    0    0  

In [4]:
# TODO: fix unbalanced data for better performance. Easiest is to oversample the 1's. Its worth a shot at least

In [3]:
# Implement Neural network <-- there is documentation in scikit
l1_size = int(0.003 * x_train_oh.shape[0])
l2_size = int(0.005 * x_train_oh.shape[0])
clf = MLPClassifier(solver='adam', alpha=8e-4, verbose=10, hidden_layer_sizes=(l1_size,l2_size), random_state=1, activation='tanh', max_iter=400)


### CV F1 score:

In [4]:
kf = KFold(n_splits=5)
f1_scores = []
for train_indices, test_indices in kf.split(x_train):
    clf.fit(x_train_oh.iloc[train_indices, 1:], x_train_oh.iloc[train_indices, 0])
    pred = clf.predict(x_train_oh.iloc[test_indices, 1:])
    #print(clf.score(x_train_oh.iloc[test_indices, 1:], x_train_oh.iloc[test_indices, 0]))
    f1_scores.append(f1_score(pred, x_train_oh.iloc[test_indices, 0]))
    print(f1_score(pred, x_train_oh.iloc[test_indices, 0]))
print("Mean F1 score:", np.array(f1_scores).mean())

Iteration 1, loss = 0.09410011
Iteration 2, loss = 0.06921685
Iteration 3, loss = 0.06852413
Iteration 4, loss = 0.06713904
Iteration 5, loss = 0.06482222
Iteration 6, loss = 0.05972665
Iteration 7, loss = 0.05309142
Iteration 8, loss = 0.04733377
Iteration 9, loss = 0.04381315
Iteration 10, loss = 0.04171600
Iteration 11, loss = 0.04034404
Iteration 12, loss = 0.03908368
Iteration 13, loss = 0.03804557
Iteration 14, loss = 0.03647994
Iteration 15, loss = 0.03511197
Iteration 16, loss = 0.03407346
Iteration 17, loss = 0.03227091
Iteration 18, loss = 0.03136344
Iteration 19, loss = 0.02979605
Iteration 20, loss = 0.02928689
Iteration 21, loss = 0.02808438
Iteration 22, loss = 0.02701508
Iteration 23, loss = 0.02606442
Iteration 24, loss = 0.02488144
Iteration 25, loss = 0.02378730
Iteration 26, loss = 0.02311449
Iteration 27, loss = 0.02252202
Iteration 28, loss = 0.02146177
Iteration 29, loss = 0.02036859
Iteration 30, loss = 0.01992482
Iteration 31, loss = 0.01902458
Iteration 32, los

In [None]:
# TODO: plot convergence of solver

#(0.005, 0.005) -> 0.86497
#(0.005, 0.006) -> 0.87049
#(0.005, 0.007) -> 0.86912
#(6, 6) -> 0.86409
#(4,6) -> 0.86950
#(4,5) -> 0.87112

######

#(3,5) -> 0.8667

### Predict on test data:

In [54]:
clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
pred = clf.predict(x_test_oh)
print(pred)

[0 0 0 ... 0 0 0]


In [55]:
df = pd.DataFrame(pred) 
print(df)
df.to_csv('out.csv', index=False, header=False)

       0
0      0
1      0
2      0
3      0
4      0
...   ..
47995  0
47996  0
47997  0
47998  0
47999  0

[48000 rows x 1 columns]


In [14]:
print(df[0:10])
print(pred[0:10])

   0
0  0
1  0
2  0
3  0
4  0
5  1
6  0
7  0
8  0
9  0
[0 0 0 0 0 1 0 0 0 0]


### Search to find parameters that give best F1 score:

In [None]:
l1_size = int(0.005 * x_train_oh.shape[0])
l2_size = int(0.005 * x_train_oh.shape[0])

mlp = MLPClassifier(random_state=1, alpha=8e-4, max_iter=200, solver='adam', activation='tanh', verbose=10)

# All parameters we want to try:
parameter_space = {
    'hidden_layer_sizes': [(int(0.003 * x_train_oh.shape[0]), int(0.003 * x_train_oh.shape[0])),
                           (int(0.002 * x_train_oh.shape[0]), int(0.003 * x_train_oh.shape[0])),
                           (int(0.002 * x_train_oh.shape[0]), int(0.002 * x_train_oh.shape[0])),
                           (int(0.003 * x_train_oh.shape[0]), int(0.002 * x_train_oh.shape[0]))],
    #'activation': ['tanh', 'relu', 'logistic'],
    #'solver': ['sgd', 'adam'],
    #'alpha': [1e-3, 6e-4, 8e-4],
}

# Do grid search over all parameter options:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5, scoring='f1', verbose=10)
clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
print('Parameters', clf.best_params_, "gives best score:", clf.best_score_)
print(clf.cv_results_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  9.4min


In [46]:
#  "the optimal size of the hidden layer is usually between the size of the input and size of the output layers"
#  "number of neurons = 0.005 * number of samples?" 2/3 size of input is also usual

# Easy: 0.607427055703
# Medium: 0.852643419573
# Hard: 0.89591280654

In [None]:
l1_size = int(0.005 * x_train_oh.shape[0])
l2_size = int(0.005 * x_train_oh.shape[0])

mlp = MLPClassifier(random_state=1, alpha=8e-4, max_iter=200, solver='adam', activation='tanh', verbose=10)

# All parameters we want to try:
parameter_space = {
    'hidden_layer_sizes': [(int(0.004 * x_train_oh.shape[0]), int(0.006 * x_train_oh.shape[0])),
                           (int(0.006 * x_train_oh.shape[0]), int(0.008 * x_train_oh.shape[0])),],
    #'activation': ['tanh', 'relu', 'logistic'],
    #'solver': ['sgd', 'adam'],
    #'alpha': [1e-3, 6e-4, 8e-4],
}

# Do grid search over all parameter options:
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5, scoring='f1', verbose=10)
clf.fit(x_train_oh.iloc[:, 1:], x_train_oh['Active'])
print('Parameters', clf.best_params_, "gives best score:", clf.best_score_)
print(clf.cv_results_)