# Coronavirus

Self-contained LSTM notebook for training on Kaggle.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorboard.plugins.hparams import api as hp

In [None]:
class SmilesTokenizer(object):
    """Greedy tokenizer for SMILES strings over a fixed symbol table.

    The vocabulary is built once in ``__init__`` and is made of three groups,
    in this order: element symbols (two-letter symbols first), structural /
    aromatic symbols, and three reserved single-letter symbols
    (``'G'``, ``'A'``, ``'E'``). A symbol's position in ``table`` is its
    integer id.
    """

    def __init__(self):
        atoms = ['Li', 'Na', 'Al', 'Si', 'Cl', 'Sc', 'Zn', 'As', 'Se', 'Br', 'Sn', 'Te', 'Cn', 'H', 'B', 'C', 'N', 'O', 'F', 'P', 'S', 'K', 'V', 'I']
        special = ['(', ')', '[', ']', '=', '#', '%', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', 'se', 'te', 'c', 'n', 'o', 's']
        padding = ['G', 'A', 'E']

        # sorted() is stable, so two-letter atoms keep their listed order and
        # come before the one-letter atoms.
        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        self.table_len = len(self.table)

        # Symbols split by width; tokenize() tries the two-character set first.
        self.table_2_chars = [symbol for symbol in self.table if len(symbol) == 2]
        self.table_1_chars = [symbol for symbol in self.table if len(symbol) == 1]

        # symbol -> float32 one-hot vector of length table_len
        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(self.table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        """Split *smiles* into a list of vocabulary symbols.

        At each position a two-character symbol is preferred over a
        one-character symbol. Characters that match no symbol are skipped
        silently.

        Args:
            smiles: the SMILES string to tokenize.

        Returns:
            A list of symbols from ``self.table``, in order of appearance.
        """
        # NOTE(review): longest-match means an unbracketed "Sc", "Sn" or "Cn"
        # is read as the two-letter element rather than S/C followed by an
        # aromatic atom — confirm this is acceptable for the input corpus.
        tokens = []
        n = len(smiles)
        i = 0

        while i < n:
            pair = smiles[i:i + 2]
            if pair in self.table_2_chars:
                tokens.append(pair)
                # Consume both characters; advancing by one would rescan the
                # second character and could emit it again as its own token.
                i += 2
                continue

            char = smiles[i]
            if char in self.table_1_chars:
                tokens.append(char)
            i += 1

        return tokens

    def one_hot_encode(self, tokenized_smiles):
        """One-hot encode a token sequence.

        Args:
            tokenized_smiles: iterable of symbols from ``self.table``.

        Returns:
            A float32 array of shape ``(1, n_tokens, table_len)``; an empty
            sequence gives shape ``(1, 0, table_len)``.

        Raises:
            KeyError: if a symbol is not in the vocabulary.
        """
        rows = [self.one_hot_dict[symbol] for symbol in tokenized_smiles]
        # Explicit target shape so that an empty token list is still valid.
        return np.array(rows, dtype=np.float32).reshape(
            1, len(rows), self.table_len)

    def embeddings(self, tokenized_smiles):
        """Map each symbol to its integer index in ``self.table``.

        Raises:
            ValueError: if a symbol is not in the vocabulary.
        """
        return [self.table.index(symbol) for symbol in tokenized_smiles]

In [None]:
# Each .smi file is read as a single-column frame named "smiles"
# (the files are read without a header row).
dataset = pd.read_csv("data/external/dataset.smi", names=["smiles"])
hiv_inhibitors = pd.read_csv("data/external/hiv_inhibitors.smi", names=["smiles"])
known_TRPM8_inhibitors = pd.read_csv("data/external/known_TRPM8-inhibitors.smi", names=["smiles"])
# Loaded here but not part of the training frame assembled below.
manual_testing = pd.read_csv("data/external/manual_testing.smi", names=["smiles"])

# Training corpus: the three sources stacked row-wise.
df = pd.concat([dataset, hiv_inhibitors, known_TRPM8_inhibitors])

# Keep only strings of at most 200 characters.
df = df.loc[df['smiles'].str.len() <= 200]

display(df)

In [None]:
st = SmilesTokenizer()

# One list of integer token ids per SMILES string.
encoded_smiles = [st.embeddings(st.tokenize(s)) for s in df['smiles']]

# Index 0 of st.table is a real token ('Li'), so padding with 0 would make
# padded positions indistinguishable from lithium. Pad with a reserved symbol.
# NOTE(review): 'A' is assumed to be the pad symbol of the ['G', 'A', 'E']
# group — confirm against how the model's output is decoded.
pad_id = st.table.index('A')

# Left-pad every sequence to the length of the longest one.
dataset = pad_sequences(encoded_smiles, maxlen=None, dtype='float32', padding='pre', value=float(pad_id))
print(dataset.shape)

In [None]:
st = SmilesTokenizer()
vocab_size = st.table_len

# Fraction of rows held out at the tail of the shuffled array.
VAL_SPLIT = .10

# In-place row shuffle (unseeded, so the split differs between runs).
np.random.shuffle(dataset)

# Every column but the last is the input; the last column is the target id.
X = dataset[:, :-1]
labels = dataset[:, -1:]
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# X and y have the same number of rows, so one cut index serves both.
split_at = int(X.shape[0] * (1 - VAL_SPLIT))
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1]))

train_size = X_train.shape[0]
max_length = X_train.shape[1]

print('Vocabulary size: ', vocab_size)
print('Max length: ', max_length)
print('Train size: ', train_size)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
NUM_EPOCHS = 100

# Stop when val_loss has not improved for 50 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

# NOTE(review): this one seeded initializer instance is shared by both LSTM
# layers; depending on the Keras version that may give both kernels identical
# starting values — confirm.
weight_init = RandomNormal(mean=0.0, stddev=0.05, seed=71)

# Embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 128, input_length=max_length),
    # return_sequences=True so the second LSTM receives the full sequence.
    tf.keras.layers.LSTM(128, return_sequences=True, kernel_initializer=weight_init, dropout=.1),
    tf.keras.layers.LSTM(128, kernel_initializer=weight_init, dropout=.1),
    tf.keras.layers.Dense(vocab_size, activation="softmax")
])

model.compile(loss='categorical_crossentropy', optimizer="nadam", metrics=['mae', 'acc'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()

In [None]:
# Fit on the training split; the held-out split serves as validation data
# here and is also the set passed to evaluate() below.
history = model.fit(
    X_train,
    y_train,
    epochs=NUM_EPOCHS,
    batch_size=5000,
    shuffle=False,
    validation_data=(X_test, y_test),
    callbacks=[es],
)

scores = model.evaluate(X_test, y_test)

print(history)
print(scores)

# Training and validation loss per epoch, drawn on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])

# NOTE(review): the file name says "1000batch" while batch_size above is
# 5000 — confirm which is intended before relying on the name.
model.save('model_nadam_128_100epochs_1000batch.h5')