# Generate

Load the trained weights, synthesize new sequences, and save the validated SMILES to a plain text file.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorboard.plugins.hparams import api as hp

from rdkit import Chem

from src.features.smiles import SmilesTokenizer, cleanup_list_smiles, encode_list_smiles
from src.models.lstm_model import build_model

## Build model and load trained weights

In [None]:
# The vocabulary size is read from the tokenizer's table; it is passed to
# build_model here and reused later as the number of one-hot classes.
st = SmilesTokenizer()
vocab_size = st.table_len

# NOTE(review): 128 and .1 are presumably the layer size and dropout rate, and
# should match the configuration the checkpoint below was trained with --
# confirm against build_model's signature.
model = build_model(vocab_size, 128, .1, 'nadam')

# Keras' summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
model.summary()

model.load_weights('models/model_nadam_128_100epochs_1000batch.h5')

## Load, process dataset

In [None]:
# Read the saved training array from disk and show its dimensions.
dataset = np.load('data/interim/smiles_train.npy')
print(np.shape(dataset))

In [None]:
# Split every row into the model input (all columns but the last) and the
# label (the last column, kept as a column slice rather than flattened).
X, labels = dataset[:, :-1], dataset[:, -1:]

# One-hot encode the labels over the tokenizer vocabulary.
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# Run the loaded model on the inputs.
y_pred = model.predict(X)

In [None]:
# Cast the inputs to integers, then reduce both the one-hot targets and the
# model outputs to a single class index per row.
X = X.astype('int')
y, y_pred = (
    tf.argmax(scores, axis=1).numpy().astype('int')
    for scores in (y, y_pred)
)

# Report the shape of each array as a quick sanity check.
for array in (X, labels, y, y_pred):
    print(array.shape)

## Synthesize sequences

In [None]:
# Build one SMILES string per row by decoding the input prefix and appending
# the decoded final token: first using the ground-truth token (y), then using
# the model's predicted token (y_pred). Order matches the original two loops.
smiles = []
for next_tokens in (y, y_pred):
    smiles.extend(
        st.embeddings_to_smiles(prefix) + st.embeddings_to_smiles(token)
        for prefix, token in zip(X, next_tokens)
    )

print("# SMILES:", len(smiles))

# Drop exact duplicates (the resulting order is not defined).
smiles = list(set(smiles))

# Report the count; the original printed only the label here.
print("# SMILES (de-duplicated):", len(smiles))

valid_smiles = cleanup_list_smiles(smiles)

# Report the count rather than dumping the entire list to the output.
print("# SMILES (validated):", len(valid_smiles))

print('Generated:', len(smiles))
print('Valid:', len(valid_smiles))

# Write the validated SMILES, one per line.
with open('data/processed/valid_smiles.smi', 'w') as f:
    for s in valid_smiles:
        f.write("%s\n" % s)