In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.preprocessing import text

Using TensorFlow backend.


In [2]:
# load data
df_train = pd.read_csv("train.csv.gz")
df_test = pd.read_csv("test.csv.gz")
test_idx = df_train.shape[0]
df_train_gap = df_train.gap
df_test_ids = df_test.Id
df_train = df_train.drop(['gap'], axis=1)
df_test = df_test.drop(['Id'], axis=1)
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# convert SMILES strings to integer arrays
smiles = list(df_train.smiles) + list(df_test.smiles)
tokenizer = text.Tokenizer(filters='', lower=False, char_level=True)
tokenizer.fit_on_texts(smiles)
smile_nums = tokenizer.texts_to_sequences(smiles)

In [8]:
# print SMILES sizes
lens = [len(a) for a in smile_nums]
print "Min Length: ", min(lens), "\nAvg Length: ", round(np.mean(lens),1), "\nMax Length: ", max(lens)

Min Length:  22 
Avg Length:  50.7 
Max Length:  81


In [19]:
# train-test split
smile_nums = smile_nums[:test_idx]
smile_test = smile_nums[test_idx:]

In [6]:
# generator for data
def batch_generator(batch_samples, num_batches, window_size=20):
    for batch_num in range(num_batches):
        X_batch, y_batch = [], []
        for e,smile_num in enumerate(smile_nums[batch_num*batch_samples:(batch_num+1)*batch_samples]):
            for i in range(len(smile_num)-window_size):
                X_batch.append(smile_num[i:i+window_size])
                y_batch.append(df_train_gap[e])
        X_batch, y_batch = np.array(X_batch), np.array(y_batch)
        X_batch = np_utils.to_categorical(X_batch)
        yield X_batch, y_batch

In [7]:
# define the LSTM model
window_size = 20
num_chars = len(chars2idx.keys())

model = Sequential()
model.add(LSTM(256, input_shape=(window_size, num_chars)))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               284672    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                4112      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 288,801
Trainable params: 288,801
Non-trainable params: 0
_________________________________________________________________


In [9]:
# train
nb_epoch = 1
loss, val_loss = [], []
for e in range(nb_epoch):
    print "\n\nEPOCH: ", e+1, "/", nb_epoch
    for X_batch, y_batch in batch_generator(batch_samples=1000, num_batches=20): 
        history = model.fit(X_batch, y_batch, batch_size=32, epochs=1, validation_split=.25)
        loss.append(history.history['loss'])
        val_loss.append(history.history['val_loss'])
model.save('lstm.h5')



EPOCH:  1 / 1
Train on 23148 samples, validate on 7716 samples
Epoch 1/1
Train on 23022 samples, validate on 7674 samples
Epoch 1/1
Train on 22804 samples, validate on 7602 samples
Epoch 1/1
Train on 22859 samples, validate on 7620 samples
Epoch 1/1
Train on 23196 samples, validate on 7733 samples
Epoch 1/1
Train on 23313 samples, validate on 7772 samples
Epoch 1/1
Train on 22879 samples, validate on 7627 samples
Epoch 1/1
Train on 23000 samples, validate on 7667 samples
Epoch 1/1
Train on 23166 samples, validate on 7723 samples
Epoch 1/1
Train on 22907 samples, validate on 7636 samples
Epoch 1/1
Train on 22970 samples, validate on 7657 samples
Epoch 1/1
Train on 23004 samples, validate on 7668 samples
Epoch 1/1
Train on 22970 samples, validate on 7657 samples
Epoch 1/1
Train on 22981 samples, validate on 7661 samples
Epoch 1/1
Train on 22968 samples, validate on 7657 samples
Epoch 1/1
Train on 22812 samples, validate on 7605 samples
Epoch 1/1
Train on 23071 samples, validate on 7691

In [17]:
# test set generator
def test_generator(window_size=20):
    for smile in smile_test:
        dataX = [smile[i:i+window_size] for i in range(len(smile)-window_size)]
        yield dataX

In [18]:
# predict
predictions = [np.median(model.predict(smile)) for smile in test_generator()]

ValueError: Error when checking : expected lstm_1_input to have shape (None, 20, 21) but got array with shape (31, 20, 19)

In [None]:
# write to file
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")
            
write_to_file('Predictions/test.csv', predictions)