In [5]:
!pip install rdkit



In [28]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rdkit import Chem
from tensorflow.keras.callbacks import EarlyStopping

In [13]:
# Load SMILES from a text file
with open('/content/100k_rndm_zinc_drugs_clean.txt', 'r') as file:
    smiles_list = file.read().splitlines()

In [27]:
# Basic preprocessing
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(smiles_list)
total_chars = len(tokenizer.word_index) + 1
max_length = max([len(s) for s in smiles_list])
sequences = tokenizer.texts_to_sequences(smiles_list)
X = pad_sequences(sequences, maxlen=max_length, padding='post')
y = tf.keras.utils.to_categorical(X, num_classes=total_chars)

In [29]:
model = Sequential([
    Embedding(input_dim=total_chars, output_dim=128, input_length=max_length),
    LSTM(256, return_sequences=True),
    LSTM(256, return_sequences=True),
    Dense(total_chars, activation='softmax')
])

In [30]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model

# Implementing early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X, y, epochs=3, batch_size=64, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/3
Epoch 2/3
Epoch 3/3
 272/1407 [====>.........................] - ETA: 22:18 - loss: 8.1150e-05 - accuracy: 1.0000

In [23]:
def generate_molecule(seed_text, tokenizer, model, max_length, generation_length=100):
    """
    Generate a molecule given a seed text.

    Parameters:
    - seed_text: The initial text to start generating from.
    - tokenizer: The tokenizer used for encoding and decoding texts.
    - model: The trained RNN model.
    - max_length: The maximum length of sequences used during training.
    - generation_length: The maximum length of the molecule to generate.

    Returns:
    A string representing the generated molecule.
    """

    generated_sequence = seed_text
    for _ in range(generation_length - len(seed_text)):
        sequence = tokenizer.texts_to_sequences([generated_sequence])[0]
        padded_sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        prediction = model.predict(padded_sequence, verbose=0)
        # Ensure we're getting a single integer index for the next character
        next_index = np.argmax(prediction[0, -1, :])  # Adjusted indexing here
        next_char = tokenizer.index_word.get(next_index + 1, '')  # Adjust index to match tokenizer's indexing
        if not next_char:
            break
        generated_sequence += next_char
    return generated_sequence



In [26]:
# Example of generating a molecule

generated_smiles = generate_molecule('CC', tokenizer, model, max_length,generation_length=10)
print(generated_smiles)
molecule = Chem.MolFromSmiles(generated_smiles)
if molecule:
    print(f"Valid molecule generated: {generated_smiles}")
else:
    print("Generated SMILES is not a valid molecule.")



CCcccccccc
Generated SMILES is not a valid molecule.


[23:36:36] non-ring atom 2 marked aromatic
