In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [33]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [34]:
#Step 2: Data Preparation
# Helper that tokenizes, encodes and pads the SMILES strings, followed by the data load
def preprocess_smiles(smiles_list, column='SMILE_Organic_linker_1'):
    """Character-tokenize, integer-encode and right-pad a column of SMILES strings.

    Args:
        smiles_list: Table-like object (the notebook passes the DataFrame read
            from the CSV) that can be indexed by ``column`` to obtain the
            SMILES strings.
        column: Name of the column holding the SMILES strings. Defaults to the
            column used throughout this notebook.

    Returns:
        A tuple ``(encoded_smiles, max_len)``: a 2-D integer array with one
        zero-padded row per input string, and the padded row width (the length
        of the longest encoded string).

    Raises:
        ValueError: If the column contains no strings to encode.
    """
    texts = smiles_list[column]

    # SMILES is case-sensitive ('C' is aliphatic carbon, 'c' aromatic carbon),
    # but Tokenizer lowercases by default; lower=False keeps them distinct.
    tokenizer = Tokenizer(char_level=True, lower=False)
    tokenizer.fit_on_texts(texts)

    # One list of integer character indices per SMILES string.
    sequences = tokenizer.texts_to_sequences(texts)
    if not sequences:
        raise ValueError(f"no SMILES strings found in column {column!r}")

    # Pad every sequence with trailing zeros up to the longest one.
    max_len = max(len(seq) for seq in sequences)
    encoded_smiles = pad_sequences(sequences, maxlen=max_len, padding='post')
    return np.array(encoded_smiles), max_len

# Load the SMILES table from disk.
smiles_data = pd.read_csv("/content/smiles.csv")

encoded_smiles, max_len = preprocess_smiles(smiles_data)

# Sequence length used to size the generator output and discriminator input.
# Iterating a DataFrame yields its column *names*, so the earlier
# max(len(s) for s in smiles_data) measured the longest header rather than
# the longest SMILES string. The padded width of the encoded data is the
# value the networks actually have to match.
max_smiles_length = max_len


In [35]:
#Step 3: Define GAN Architecture
def build_generator(latent_dim, output_dim):
    """Assemble the generator network.

    Args:
        latent_dim: Size of the noise vector the network takes as input.
        output_dim: Number of values the network emits per sample.

    Returns:
        An uncompiled Keras Sequential model whose final layer is a sigmoid
        Dense layer of width ``output_dim``.
    """
    net = tf.keras.Sequential()
    net.add(Dense(128, activation='relu', input_dim=latent_dim))
    # View the 128 dense features as a 32-step sequence of 4 features each.
    net.add(Reshape((32, 4)))
    net.add(LSTM(64, return_sequences=True))
    net.add(Flatten())
    net.add(Dense(output_dim, activation='sigmoid'))
    return net

def build_discriminator(input_dim):
    """Assemble the discriminator network.

    Args:
        input_dim: Number of time steps in each input sequence; every step
            carries a single feature.

    Returns:
        An uncompiled Keras Sequential model ending in a single sigmoid unit.
    """
    net = tf.keras.Sequential()
    net.add(LSTM(64, input_shape=(input_dim, 1), return_sequences=True))
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dense(1, activation='sigmoid'))
    return net

# Length of the random noise vector fed to the generator.
latent_dim = 10
generator = build_generator(latent_dim=latent_dim, output_dim=max_smiles_length)
discriminator = build_discriminator(input_dim=max_smiles_length)


In [36]:
#Step 4: Compile GAN
# The discriminator is compiled on its own first, before it is frozen below.
discriminator.compile(
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stacked model: noise -> generator -> discriminator -> validity score.
# The generator takes noise as input and produces sequences.
z = Input(shape=(latent_dim,))
smiles = generator(z)

# Inside the stacked model only the generator is meant to be trained.
discriminator.trainable = False

# The discriminator scores the generated sequences.
validity = discriminator(smiles)

combined = Model(inputs=z, outputs=validity)
combined.compile(
    optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
    loss='binary_crossentropy',
)


In [None]:
#Step 5: Training Loop
# numpy is already imported in the notebook's import cell; no re-import needed here.

# Training hyper-parameters.
epochs = 10
batch_size = 10
sample_interval = 20

# Ground-truth labels: 1 for real SMILES batches, 0 for generated ones.
valid = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))

# The generator ends in a sigmoid, so its outputs lie in (0, 1), whereas the
# encoded SMILES are raw integer token indices. Dividing by the largest index
# puts real samples in the same [0, 1] range; otherwise the discriminator can
# separate real from generated by magnitude alone.
# The max(..., 1.0) guard avoids dividing by zero if every entry is padding.
token_scale = max(float(encoded_smiles.max()), 1.0)

for epoch in range(epochs):

    # ---------------------
    #  Train Discriminator
    # ---------------------

    # Draw a random batch of real sequences (sampling with replacement).
    idx = np.random.randint(0, encoded_smiles.shape[0], batch_size)
    real_smiles = encoded_smiles[idx] / token_scale

    # Sample noise and generate a batch of new sequences.
    # verbose=0 suppresses the per-call progress bar inside the loop.
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    gen_smiles = generator.predict(noise, verbose=0)

    # Train the discriminator (real classified as ones and generated as zeros).
    d_loss_real = discriminator.train_on_batch(real_smiles, valid)
    d_loss_fake = discriminator.train_on_batch(gen_smiles, fake)
    # Element-wise mean of the [loss, accuracy] pairs from the two updates.
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # ---------------------
    #  Train Generator
    # ---------------------

    # Train the generator (wants the discriminator to label its output as real).
    g_loss = combined.train_on_batch(noise, valid)

    # Report progress.
    print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}] [G loss: {g_loss}]")

    # Hook for periodic inspection of generated samples.
    if epoch % sample_interval == 0:
        # TODO: save or visualize generated SMILES here.
        pass


In [None]:
#Step 6: Generate and Validate SMILES
def generate_smiles(generator, latent_dim):
    """Run the generator once on random noise and return a SMILES string.

    Args:
        generator: Object exposing ``predict``; it is called with a noise
            array of shape ``(1, latent_dim)``.
        latent_dim: Length of the noise vector.

    Returns:
        Currently always the placeholder string ``'CCO'``: the generator's
        numeric output is computed but not yet decoded back into characters.
    """
    latent_sample = np.random.normal(loc=0, scale=1, size=(1, latent_dim))
    # TODO: decode this output; that requires the inverse of the encoding
    # applied to the training data.
    _raw_output = generator.predict(latent_sample)
    return 'CCO'

generated_smiles = generate_smiles(generator, latent_dim)
print("Generated SMILES:", generated_smiles)

# Validate with RDKit: MolFromSmiles returns None when the string cannot be parsed.
mol = Chem.MolFromSmiles(generated_smiles)
if mol is not None:
    print("Valid SMILES")
    # A bare Draw.MolToImage(mol) inside an `if` is not the cell's final
    # expression, so the notebook never displayed it; render it explicitly.
    plt.imshow(Draw.MolToImage(mol))
    plt.axis('off')
    plt.show()
else:
    print("Invalid SMILES")
