In [1]:
import numpy as np
import pandas as pd
import boto3
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# S3 client used below to list the objects stored in the bucket.
client = boto3.client('s3')

# Name of the bucket that is listed and read from in the following cells.
bucket = 'smilesmolecules'

In [3]:
def get_files(client, bucket, max_files=51):
    """Return up to ``max_files`` object keys in ``bucket`` containing '.smi'.

    Parameters
    ----------
    client : object
        S3 client exposing ``list_objects(Bucket=...)``.
    bucket : str
        Name of the bucket to list.
    max_files : int, optional
        Maximum number of keys to return. The default of 51 matches the
        cut-off that used to be hard-coded in the loop.

    Returns
    -------
    list of str
        Matching keys in listing order; empty when nothing is listed.
    """
    # .get('Contents') yields None when the response carries no 'Contents'
    # entry (e.g. nothing was listed); fall back to an empty list so the loop
    # does not fail on None.
    # NOTE: only this single list_objects response is inspected (no paging).
    content = client.list_objects(Bucket=bucket).get('Contents') or []

    csv_files = []
    for obj in content:
        if len(csv_files) >= max_files:
            break
        key = obj.get('Key')
        if '.smi' in key:
            csv_files.append(key)
    return csv_files

# List the matching keys once; the bare len() below is shown as the cell output.
test = get_files(client, bucket)
len(test)

51

In [4]:
# The whole S3 bucket contains more than 900 million molecules.
# We'll grab around 10 million as a starter.

# Only the first 50 listed files are loaded.
split1 = test[:50]

# One space-delimited frame per file, stacked into a single frame afterwards.
df = [pd.read_csv('s3://smilesmolecules/' + obj, delimiter=' ') for obj in split1]
df1 = pd.concat(df)

In [5]:
# Keep the first column of the first 1,000,000 rows (presumably the SMILES
# strings, since the values are joined and character-encoded below).
# The isinstance guard keeps the cell re-runnable: after the first run df1 is
# already the extracted Series, and indexing it with two axes would raise.
if isinstance(df1, pd.DataFrame):
    df1 = df1.iloc[:1000000, 0]
molecules = df1.tolist()

In [6]:
# Need to map each character to an int: start by collecting the sorted set of
# characters that occur in the first 500,000 molecules.
unique_chars = sorted(set(''.join(molecules[:500000])))

# A single pre-formatted string prints the same text under Python 2 and 3;
# print('...', n) displays a tuple when print is a statement (Python 2).
print('Number of unique characters in the molecules set: {}'.format(len(unique_chars)))

('Number of unique characters in the molecules set:', 34)


In [7]:
# Lookup table: character -> its position in the sorted unique_chars list.
char_to_int = dict((ch, idx) for idx, ch in enumerate(unique_chars))

In [8]:
# Training corpus: the first 500 molecules concatenated back-to-back.
# NOTE(review): no separator is inserted between molecules, so molecule
# boundaries are not represented in the text -- confirm this is intended.
text = ''.join(molecules[:500])

In [9]:
# Build (300-character window -> next character) training pairs.
# The text is encoded once and all windows are gathered with a single index
# matrix; the previous per-window np.append copied the whole accumulated
# array on every iteration, which is quadratic in the number of windows.
window = 300
encoded = np.array([char_to_int[char] for char in text], dtype=float)
n_windows = max(len(text) - window, 0)

# Row i of the index matrix is [i, i + 1, ..., i + window - 1].
window_idx = np.arange(n_windows)[:, None] + np.arange(window)

# X stays a flat float array (n_windows * window values) and y holds one
# target code per window -- the same layout the loop used to produce.
X = encoded[window_idx].ravel()
y = encoded[window:window + n_windows]

In [10]:
# Inspect the encoded target values (bare expression, so the cell displays it).
y

array([ 1., 16., 21., ..., 26.,  6., 21.])

In [11]:
# NOTE: this cell rescales X and re-encodes y in place, so it must be run
# exactly once after the windowing cell.

# Shape the flat window values as (samples, timesteps, features) for the LSTM.
X = np.reshape(X, (-1, 300, 1))

# Scale the integer character codes into [0, 1).
X = X / len(unique_chars)

# One-hot encode the targets. num_classes is pinned to the vocabulary size;
# otherwise the width is inferred from the largest code present in y and can
# come out smaller than the vocabulary.
y = np_utils.to_categorical(y, num_classes=len(unique_chars))

print(X.shape)
print(y.shape)

(18902, 300, 1)
(18902, 34)


In [12]:
# Create the LSTM model: five stacked LSTM layers, each followed by dropout,
# then a softmax layer with one unit per column of the one-hot targets.
model = Sequential([
    LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.25),
    LSTM(256, return_sequences=True),
    Dropout(.25),
    LSTM(512, return_sequences=True),
    Dropout(0.20),
    LSTM(256, return_sequences=True),
    Dropout(0.20),
    # The last recurrent layer does not return the full sequence.
    LSTM(128),
    Dropout(0.20),
    Dense(y.shape[1], activation='softmax'),
])

In [13]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 300, 128)          66560     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 300, 256)          394240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 256)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 300, 512)          1574912   
_________________________________________________________________
dropout_3 (Dropout)          (None, 300, 512)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 300, 256)         

In [14]:
# Multi-class objective over the one-hot encoded targets, optimised with Adam.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [None]:
# Checkpoint: save the model whenever the training loss reaches a new minimum.
# The epoch number and the loss value are embedded in the file name.
filepath = 'improved-weights-{epoch:02d}-{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1
)
callback = [checkpoint]

# Fitting: 6 epochs with mini-batches of 512 windows.
model.fit(X, y, batch_size=512, epochs=6, callbacks=callback)

Epoch 1/6

Epoch 00001: loss improved from inf to 2.69083, saving model to improved-weights-01-2.6908.hdf5
Epoch 2/6

Epoch 00002: loss improved from 2.69083 to 2.57679, saving model to improved-weights-02-2.5768.hdf5
Epoch 3/6

Epoch 00003: loss improved from 2.57679 to 2.56428, saving model to improved-weights-03-2.5643.hdf5
Epoch 4/6

Epoch 00004: loss improved from 2.56428 to 2.55076, saving model to improved-weights-04-2.5508.hdf5
Epoch 5/6

Epoch 00005: loss improved from 2.55076 to 2.54737, saving model to improved-weights-05-2.5474.hdf5
Epoch 6/6