In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [None]:
# Scratch lookup of the CSV reader. pandas exposes it as `pd.read_csv`;
# there is no `pd.read` attribute, so the dotted spelling raises AttributeError.
pd.read_csv

In [3]:
# load data
df_train = pd.read_csv("train.csv.gz")
df_test = pd.read_csv("test.csv.gz")

# Number of training rows. Later cells concatenate train and test SMILES
# and use this value as the position where the test rows begin.
test_idx = len(df_train)

# Set the regression target ('gap') and the test ids aside, then remove
# those columns from the frames.
df_train_gap, df_test_ids = df_train['gap'], df_test['Id']
df_train = df_train.drop('gap', axis=1)
df_test = df_test.drop('Id', axis=1)

df_train.head()

KeyboardInterrupt: 

In [None]:
# convert SMILES strings to integer arrays
# Train and test strings are encoded together so both share one vocabulary.
smiles = list(df_train.smiles) + list(df_test.smiles)

# chars2idx maps each character to the order in which it was first seen
# (0, 1, 2, ...); smile_nums holds one list of those indices per SMILES string.
chars2idx = {}
smile_nums = []
for smile in smiles:
    # setdefault assigns the next free index (the current dict size) the
    # first time a character appears and returns the stored index otherwise.
    smile_nums.append([chars2idx.setdefault(char, len(chars2idx)) for char in smile])

# total number of distinct characters encountered
ctr = len(chars2idx)

In [None]:
# print SMILES sizes
lens = [len(a) for a in smile_nums]
# A single pre-formatted string is passed to print so the cell runs under both
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
# The spacing reproduces what the comma-separated print statement emitted.
print("Min Length:  %s \nAvg Length:  %s \nMax Length:  %s"
      % (min(lens), round(np.mean(lens), 1), max(lens)))

In [110]:
# train-test split
# smile_nums starts out as train + test encodings; everything from position
# test_idx onward belongs to the test set.
#
# The split is guarded so that re-running this cell is harmless: once
# smile_nums has been cut down to the training rows there is nothing left past
# test_idx, and an unguarded re-run would overwrite smile_test with an empty list.
if len(smile_nums) > test_idx:
    smile_test = smile_nums[test_idx:]
    smile_nums = smile_nums[:test_idx]

In [None]:
# generator for data
def batch_generator(batch_samples, num_batches, window_size=20, num_classes=None):
    """Yield one-hot encoded training windows, one chunk of molecules at a time.

    Chunk ``k`` covers ``smile_nums[k*batch_samples:(k+1)*batch_samples]``.
    Every molecule in the chunk is cut into all of its contiguous windows of
    ``window_size`` characters, and each window is labelled with the gap of the
    molecule it came from.

    Parameters
    ----------
    batch_samples : int
        Number of molecules per chunk.
    num_batches : int
        Number of chunks to walk through.
    window_size : int
        Number of characters per window.
    num_classes : int or None
        Width of the one-hot encoding. Defaults to the size of the global
        ``chars2idx`` vocabulary so every chunk has the same last dimension,
        even when a chunk happens not to contain the highest character index.

    Yields
    ------
    (X_batch, y_batch)
        ``X_batch`` is the one-hot encoding of the integer windows and
        ``y_batch`` holds one gap value per window. Chunks that produce no
        windows at all are skipped.
    """
    if num_classes is None:
        num_classes = len(chars2idx)

    for batch_num in range(num_batches):
        start = batch_num * batch_samples
        X_batch, y_batch = [], []
        for offset, smile_num in enumerate(smile_nums[start:start + batch_samples]):
            # `offset` counts from the start of the chunk, so it has to be
            # shifted by `start` to address the molecule's own label.
            gap = df_train_gap.iloc[start + offset]
            # +1 so the final full window (ending on the last character) is
            # included and a molecule of exactly window_size gives one window.
            for i in range(len(smile_num) - window_size + 1):
                X_batch.append(smile_num[i:i + window_size])
                y_batch.append(gap)

        if not X_batch:
            # No molecule in this chunk was long enough (or the chunk lies past
            # the end of the data): there is nothing to train on.
            continue

        X_batch, y_batch = np.array(X_batch), np.array(y_batch)
        X_batch = np_utils.to_categorical(X_batch, num_classes)
        yield X_batch, y_batch

In [None]:
# define the LSTM model
window_size = 20
num_chars = len(chars2idx)

# Input is one window of one-hot characters: (window_size, num_chars).
# LSTM -> dropout -> small ReLU layer -> single linear output, trained with
# mean squared error.
model = Sequential([
    LSTM(256, input_shape=(window_size, num_chars)),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='linear'),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

In [None]:
# train
nb_epoch = 1
# One entry is appended per fit call; each entry is the list that Keras
# stores under history.history[...] for that call.
loss, val_loss = [], []
for e in range(nb_epoch):
    # A single pre-formatted string is passed to print so the cell runs under
    # both Python 2 and Python 3; the text matches the former print statement.
    print("\n\nEPOCH:  %d / %d" % (e + 1, nb_epoch))
    # 5 chunks of 500 molecules each: only the first 2500 training molecules
    # are visited per epoch.
    for X_batch, y_batch in batch_generator(batch_samples=500, num_batches=5):
        # One pass over the chunk's windows, with validation_split=.25.
        history = model.fit(X_batch, y_batch, batch_size=32, epochs=1, validation_split=.25)
        loss.append(history.history['loss'])
        val_loss.append(history.history['val_loss'])

In [None]:
# test set generator
def test_generator(window_size=20, num_classes=None):
    """Yield the one-hot encoded windows of each test molecule with its id.

    Parameters
    ----------
    window_size : int
        Number of characters per window.
    num_classes : int or None
        Width of the one-hot encoding. Defaults to the size of the global
        ``chars2idx`` vocabulary so the encoding width does not depend on
        which characters a particular molecule contains.

    Yields
    ------
    (dataX, id_curr)
        ``dataX`` is the one-hot encoding of every contiguous window of
        ``window_size`` characters of one molecule in ``smile_test``;
        ``id_curr`` is the entry of ``df_test_ids`` at the same position.
    """
    if num_classes is None:
        num_classes = len(chars2idx)

    # enumerate yields (position, encoded molecule); unpack both so that
    # `smile` is the list of character indices rather than the pair itself.
    for idx, smile in enumerate(smile_test):
        # +1 so the final full window (ending on the last character) is kept.
        # NOTE(review): a molecule shorter than window_size produces no
        # windows; how the downstream predict step copes with that empty
        # array has not been verified.
        dataX = [smile[i:i + window_size] for i in range(len(smile) - window_size + 1)]
        dataX = np_utils.to_categorical(np.array(dataX), num_classes)
        id_curr = df_test_ids[idx]
        yield dataX, id_curr

In [None]:
# predict
# test_generator yields (windows, id) pairs; only the window array is fed to
# the model. Each molecule's prediction is the median over its windows.
predictions = [np.median(model.predict(dataX)) for dataX, id_curr in test_generator()]

In [None]:
# write to file
def write_to_file(filename, predictions):
    """Write predictions as a two-column CSV with header ``Id,Prediction``.

    Rows are numbered from 1 in the order the predictions are given. The
    directory part of ``filename`` is created first if it does not exist, so
    the results are not lost to a missing output folder.

    Parameters
    ----------
    filename : str
        Path of the CSV file to create (overwritten if present).
    predictions : iterable
        Values to write; each is rendered with ``str``.
    """
    import os

    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i, p in enumerate(predictions):
            f.write(str(i + 1) + "," + str(p) + "\n")
            
# dump the test-set predictions to the submission file
write_to_file(filename='Predictions/test.csv', predictions=predictions)