In [75]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [114]:
# load data
df_train = pd.read_csv("train.csv.gz")
df_test = pd.read_csv("test.csv.gz")
test_idx = df_train.shape[0]
df_train_gap = df_train.gap
df_test_ids = df_test.Id
df_train = df_train.drop(['gap'], axis=1)
df_test = df_test.drop(['Id'], axis=1)
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# convert SMILES strings to integer arrays
smiles = list(df_train.smiles) + list(df_test.smiles)
chars2idx, ctr = {}, 0
smile_nums = []
for smile in smiles:
    smile_num = []
    for char in smile:
        if char not in chars2idx:
            chars2idx[char] = ctr
            ctr += 1
        smile_num.append(chars2idx[char])
    smile_nums.append(smile_num)

In [109]:
# print SMILES sizes
lens = [len(a) for a in smile_nums]
print "Min Length: ", min(lens), "\nAvg Length: ", round(np.mean(lens),1), "\nMax Length: ", max(lens)

Min Length:  22 
Avg Length:  50.7 
Max Length:  81


In [110]:
# train-test split
smile_test = smile_nums[test_idx:]
smile_nums = smile_nums[:test_idx]

In [105]:
# generator for data
def batch_generator(batch_samples, num_batches, window_size=20):
    for batch_num in range(num_batches):
        X_batch, y_batch = [], []
        for e,smile_num in enumerate(smile_nums[batch_num*batch_samples:(batch_num+1)*batch_samples]):
            for i in range(len(smile_num)-window_size):
                X_batch.append(smile_num[i:i+window_size])
                y_batch.append(df_train_gap[e])
        X_batch, y_batch = np.array(X_batch), np.array(y_batch)
        X_batch = np_utils.to_categorical(X_batch)
        yield X_batch, y_batch

In [104]:
# define the LSTM model
window_size = 20
num_chars = len(chars2idx.keys())

model = Sequential()
model.add(LSTM(256, input_shape=(window_size, num_chars)))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 256)               284672    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                4112      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 288,801
Trainable params: 288,801
Non-trainable params: 0
_________________________________________________________________


In [106]:
# train
nb_epoch = 1
loss, val_loss = [], []
for e in range(nb_epoch):
    print "\n\nEPOCH: ", e+1, "/", nb_epoch
    for X_batch, y_batch in batch_generator(batch_samples=500, num_batches=5): 
        history = model.fit(X_batch, y_batch, batch_size=32, epochs=1, validation_split=.25)
        loss.append(history.history['loss'])
        val_loss.append(history.history['val_loss'])



EPOCH:  1 / 1
Train on 11469 samples, validate on 3824 samples
Epoch 1/1
Train on 11678 samples, validate on 3893 samples
Epoch 1/1
Train on 11520 samples, validate on 3840 samples
Epoch 1/1
Train on 11502 samples, validate on 3834 samples
Epoch 1/1
Train on 11488 samples, validate on 3830 samples
Epoch 1/1


In [117]:
# test set generator
def test_generator(window_size=20):
    idx = 0
    for smile in enumerate(smile_test):
        dataX = [smile[i:i+window_size] for i in range(len(smile)-window_size)]
        dataX = np_utils.to_categorical(np.array(dataX))
        id_curr = df_test_ids[idx]
        idx += 1
        yield dataX, id_curr

In [None]:
# predict
predictions = [np.median(model.predict(smile)) for smile in test_generator()]

In [None]:
# write to file
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")
            
write_to_file('Predictions/test.csv', predictions)