### Imports

In [11]:
# general imports
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from collections import Counter

# Keras specific imports
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import SGD, Adadelta, Adagrad, Adam
from keras.layers.convolutional import Convolution2D
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import SReLU, ELU

### Data

In [2]:
# Print the kernel's working directory via a shell escape; the relative '../'
# paths used when the CSV files are read are resolved against it.
! pwd

/home/ubuntu/Protein_data/Neural Networks


In [45]:
# Alternative input: the basic 5-spacing data (disabled, kept for reference).
# data = pd.read_csv('../fixed_five.csv')
# labels = pd.read_csv('../one_hot_labels.csv')

# Active input: the label table plus the 'cleaned_ord_19' feature table.
# The feature CSV carries an 'Unnamed: 0' column, which is removed right
# after reading.
labels = pd.read_csv('../one_hot_labels.csv')
data = pd.read_csv('../csv_data/cleaned_ord_19.csv')
data = data.drop('Unnamed: 0', axis=1)

In [46]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,pos,A-9,C-9,D-9,E-9,F-9,G-9,H-9,I-9,K-9,...,V9,W9,Y9,-9,H,E,T,S,B,Unnamed: 21
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [47]:
# Show (rows, columns) of the feature table and of the label table,
# separated by a single space on one line.
print('%s %s' % (data.shape, labels.shape))

(134814, 406) (134815, 6)


In [48]:
# Preview the first rows of the label table (one 0/1 column per class).
labels.head()

Unnamed: 0,H,E,T,S,B,U
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,1,0,0,0,0,0
4,1,0,0,0,0,0


In [51]:
data = data.values
labels = labels.values[1:]

X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify=labels)

In [52]:
print X_train.shape, X_test.shape, y_train.shape, y_test.shape

(606660, 406) (202224, 406) (606660, 6) (202224, 6)


### Network Architecture

In [53]:
model = Sequential()

In [54]:
# first layer, 200 nodes, BatchNormalized, ELU and Dropout
model.add(Dense(output_dim=600, input_dim=406))
model.add(BatchNormalization())
model.add(ELU(alpha=0.9))
model.add(Dropout(0.5))

# second layer, 200 nodes, BatchNormalized, ELU and Dropout
model.add(Dense(output_dim=600, input_dim=100))
model.add(BatchNormalization())
model.add(ELU(alpha=0.9))
model.add(Dropout(0.5))

# third layer, 6 nodes, BatchNormalized, SoftMax
model.add(Dense(input_dim=100, output_dim=6))
model.add(BatchNormalization())
model.add(Activation("softmax"))

In [55]:
# Two optimizers are instantiated, but only Adam (default hyper-parameters) is
# handed to compile(); the Nesterov-momentum SGD instance is left unused here.
opt = Adam()
sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(optimizer=opt, loss='categorical_crossentropy')

In [56]:
# Train for 5 epochs in mini-batches of 32 rows. The call is the last
# expression of the cell, so the returned History object is shown as output.
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f347bf94910>

In [57]:
# Score the trained model on the held-out split. compile() was not given any
# metrics, so despite the variable name the result is the loss alone.
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=32)



In [58]:
# Held-out categorical cross-entropy recorded so far:
#   loss for 5-spacing  = 0.856   (the disabled '../fixed_five.csv' input)
#   loss for 19-spacing = 0.002   (presumably the cleaned_ord_19 input -- confirm)
# NOTE(review): 0.002 looks too good to be true. The data.head() preview shows
# columns H, E, T, S, B and 'Unnamed: 21' inside the feature table that appear
# to be the one-hot targets themselves, and the printed split sizes
# (606660 + 202224 rows) exceed the 134814 rows that were loaded, which points
# to repeated rows shared by the training and test sets. Verify both before
# relying on this number.
loss_and_metrics

0.0019896404137279616

In [59]:
# Class scores for every held-out row, taken from the softmax output layer.
preds = model.predict(X_test)

In [60]:
# Number of prediction rows; expected to equal the number of rows in X_test.
len(preds)

202224

In [61]:
# Accuracy: fraction of held-out rows whose highest-scoring predicted class
# matches the class marked in the one-hot target row.
true_class = np.argmax(y_test, axis=1)
pred_class = np.argmax(preds, axis=1)
print(np.mean(true_class == pred_class))

0.973381003244
