# 🤠 MolGPT Generation - Cowboy Chronicle 🤠

This notebook demonstrates how to use the trained MolGPT model to generate novel molecules, either unconditionally or with property/scaffold conditioning.

## 1. Setup Environment

First, let's make sure we have all the necessary imports and set up our environment.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import QED
from rdkit.Chem import Crippen
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.rdMolDescriptors import CalcTPSA

# Import directly from the files instead of using package imports
sys.path.insert(0, '.')
from train.model import GPT, GPTConfig
from generate.utils import sample, check_novelty, canonic_smiles
from moses.utils import get_mol

# Fix the torch and NumPy seeds so repeated runs draw the same samples
torch.manual_seed(42)
np.random.seed(42)

# Run on the GPU when torch reports one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 2. Load Dataset and Vocabulary

Let's load the dataset and vocabulary for tokenization.

In [None]:
# Read the Moses CSV, drop rows containing missing values, renumber the index
# and lower-case the column names in a single chain.
data_name = 'moses2'
data = (
    pd.read_csv(f'datasets/{data_name}.csv')
    .dropna(axis=0)
    .reset_index(drop=True)
    .rename(columns=str.lower)
)

# Report the size, then show the first rows as the cell output
print(f"Dataset shape: {data.shape}")
data.head()

In [None]:
# Regex that splits a SMILES string into tokens: bracket atoms, the '<' padding
# symbol, one- and two-letter atoms, branch/bond symbols, two-digit ring
# closures ('%NN') and single-digit ring closures.
#
# The pattern must be a raw string. Written as a normal literal, "\\|\/" reaches
# the regex engine as "\|\/" (an escaped pipe followed by a slash), which fuses
# the backslash and slash alternatives into the literal text "|/" so neither
# '\' nor '/' is ever matched, and the remaining "\[", "\(", ... sequences are
# invalid string escapes that emit warnings on recent Python versions.
# With the raw string, '\' and '/' are returned as tokens instead of being
# silently skipped by findall().
pattern = r"(\[[^\]]+\]|<|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
regex = re.compile(pattern)

# Token vocabulary; a token's position in this list is its integer id
whole_string = [
    # bond / ring-closure / branch symbols, digits and the '<' padding token
    '#', '%10', '%11', '%12', '(', ')', '-',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '<', '=',
    # unbracketed atoms
    'B', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S',
    # bracketed atoms
    '[B-]', '[BH-]', '[BH2-]', '[BH3-]', '[B]',
    '[C+]', '[C-]', '[CH+]', '[CH-]', '[CH2+]', '[CH2]', '[CH]',
    '[F+]', '[H]', '[I+]', '[IH2]', '[IH]',
    '[N+]', '[N-]', '[NH+]', '[NH-]', '[NH2+]', '[NH3+]', '[N]',
    '[O+]', '[O-]', '[OH+]', '[O]',
    '[P+]', '[PH+]', '[PH2+]', '[PH]',
    '[S+]', '[S-]', '[SH+]', '[SH]',
    '[Se+]', '[SeH+]', '[SeH]', '[Se]',
    '[Si-]', '[SiH-]', '[SiH2]', '[SiH]', '[Si]',
    '[b-]', '[bH-]',
    '[c+]', '[c-]', '[cH+]', '[cH-]',
    '[n+]', '[n-]', '[nH+]', '[nH]',
    '[o+]', '[s+]', '[sH+]', '[se+]', '[se]',
    # lower-case (aromatic) atoms
    'b', 'c', 'n', 'o', 'p', 's',
]

# Token -> id and id -> token lookup tables
stoi = dict(zip(whole_string, range(len(whole_string))))
itos = dict(enumerate(whole_string))

# Write the token -> id table to '<data_name>_stoi.json' (itos is not saved)
with open(f'{data_name}_stoi.json', 'w') as vocab_file:
    json.dump(stoi, vocab_file)

print(f"Vocabulary size: {len(stoi)}")

## 3. Load Pre-trained Model

Now we'll load a pre-trained model for molecule generation.

In [None]:
# Model hyperparameters passed to GPTConfig below
vocab_size = len(stoi)
block_size = 54  # Maximum SMILES length
n_layer = 8
n_head = 8
n_embd = 256
scaffold_max_len = 48  # For Moses dataset

# Choose the model type
model_type = "qed"  # Options: qed, sas, logp, tpsa

# Directory holding the checkpoints. It can be overridden with the
# MOLGPT_WEIGHTS_DIR environment variable so the notebook is not tied to one
# machine's filesystem; the default keeps the original location.
weights_dir = os.environ.get("MOLGPT_WEIGHTS_DIR", "/home/ubuntu/molgpt/datasets/weights")
model_weight = os.path.join(weights_dir, f"moses_scaf_wholeseq_{model_type}.pt")

# Fail early with an actionable message instead of a bare error from torch.load
if not os.path.isfile(model_weight):
    raise FileNotFoundError(
        f"Checkpoint not found: {model_weight}. "
        "Set MOLGPT_WEIGHTS_DIR to the directory containing the weight files."
    )

# Model configuration: one conditioning property plus scaffold conditioning,
# no LSTM layers
mconf = GPTConfig(vocab_size, block_size,
                  num_props=1,  # Using 1 property for conditioning
                  n_layer=n_layer, n_head=n_head, n_embd=n_embd,
                  scaffold=True, scaffold_maxlen=scaffold_max_len,
                  lstm=False, lstm_layers=0)

# Create the model
model = GPT(mconf)

# Load pre-trained weights onto the selected device.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_weight, map_location=device))
model.to(device)
model.eval()  # switch to inference mode

print(f"Model loaded from {model_weight}")

## 4. Unconditional Molecule Generation

Let's start with unconditional molecule generation.

In [None]:
# Sampling settings for this section
context = "C"  # Starting with a carbon atom
batch_size = 5  # Generate 5 molecules at once
temperature = 1.0  # Temperature for sampling (higher = more diverse)
top_k = None  # No top-k sampling

# Encode the context as token ids and copy it once per sample in the batch
context_ids = [stoi[tok] for tok in regex.findall(context)]
x = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1).to(device)

# Sample continuations without tracking gradients.
# NOTE(review): the model is configured with num_props=1 and scaffold=True, yet
# prop=None and scaffold=None are passed here — confirm GPT/sample accept that.
with torch.no_grad():
    y = sample(model, x, block_size, temperature=temperature, sample=True, top_k=top_k, prop=None, scaffold=None)

# Map token ids back to text and strip the '<' padding tokens
generated_smiles = [
    ''.join(itos[int(tok_id)] for tok_id in token_row).replace('<', '')
    for token_row in y
]

# Keep only the strings that get_mol could parse, plus their RDKit SMILES
molecules = [parsed for parsed in map(get_mol, generated_smiles) if parsed]
valid_smiles = [Chem.MolToSmiles(parsed) for parsed in molecules]

# Report how many samples were valid and list them
print(f"Generated {len(generated_smiles)} molecules, {len(molecules)} are valid.")
print("\nGenerated SMILES:")
for rank, smi in enumerate(valid_smiles, start=1):
    print(f"{rank}. {smi}")

In [None]:
# Draw the valid molecules in a 3-column grid; nothing is shown when none parsed
if molecules:
    mol_labels = [f"Mol {idx}" for idx in range(1, len(molecules) + 1)]
    img = Draw.MolsToGridImage(molecules, molsPerRow=3, subImgSize=(300, 300), legends=mol_labels)
    display(img)

## 5. Property-Conditioned Generation

Now let's generate molecules conditioned on specific property values.

In [None]:
# Target values to condition on, keyed by the property the loaded model uses
prop_grid = {
    "qed": [0.6, 0.75, 0.9],        # QED values (drug-likeness)
    "logp": [1.0, 2.0, 3.0],        # LogP values (lipophilicity)
    "sas": [2.0, 3.0, 4.0],         # SAS values (synthetic accessibility)
    "tpsa": [40.0, 80.0, 120.0],    # TPSA values (polar surface area)
}
prop_values = prop_grid.get(model_type, [0.75])  # single default for other types

# Results accumulated over all target values
all_molecules = []
all_smiles = []
all_props = []

for prop_value in prop_values:
    print(f"\nGenerating molecules with {model_type} = {prop_value}")

    # Encode the context and copy it once per sample in the batch
    context_ids = [stoi[tok] for tok in regex.findall(context)]
    x = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1).to(device)

    # One row holding the target value for every sample in the batch
    p = torch.tensor([[prop_value]]).repeat(batch_size, 1).to(device)

    # Sample with the property condition only.
    # NOTE(review): the model is configured with scaffold=True but scaffold=None
    # is passed — confirm GPT/sample accept a missing scaffold.
    with torch.no_grad():
        y = sample(model, x, block_size, temperature=temperature, sample=True, top_k=top_k, prop=p, scaffold=None)

    # Map token ids back to text and strip the '<' padding tokens
    generated_smiles = [
        ''.join(itos[int(tok_id)] for tok_id in token_row).replace('<', '')
        for token_row in y
    ]

    # Keep only the strings that get_mol could parse, plus their RDKit SMILES
    molecules = [parsed for parsed in map(get_mol, generated_smiles) if parsed]
    valid_smiles = [Chem.MolToSmiles(parsed) for parsed in molecules]

    # Record this round's results together with the value that produced them
    all_molecules += molecules
    all_smiles += valid_smiles
    all_props += [prop_value] * len(molecules)

    # Summarise the round and print at most the first three valid SMILES
    print(f"Generated {len(generated_smiles)} molecules, {len(molecules)} are valid.")
    for rank, smi in enumerate(valid_smiles[:3], start=1):
        print(f"{rank}. {smi}")

In [None]:
# Draw up to nine property-conditioned molecules, each labelled with its target value
if all_molecules:
    display_mols = all_molecules[:9]
    display_props = all_props[:9]

    prop_labels = [f"{model_type}={value}" for value in display_props]
    img = Draw.MolsToGridImage(display_mols, molsPerRow=3, subImgSize=(300, 300),
                               legends=prop_labels)
    display(img)

## 6. Scaffold-Conditioned Generation

Now let's generate molecules conditioned on specific scaffolds.

In [None]:
# Scaffolds to condition on, given as SMILES
scaffolds = [
    'c1ccccc1',  # Benzene
    'c1ccncc1',  # Pyridine
    'c1ccccc1N'  # Aniline
]

# Results accumulated over all scaffolds
all_scaffold_molecules = []
all_scaffold_smiles = []
all_scaffold_conditions = []

for scaffold in scaffolds:
    print(f"\nGenerating molecules with scaffold: {scaffold}")

    # Encode the context and copy it once per sample in the batch
    context_ids = [stoi[tok] for tok in regex.findall(context)]
    x = torch.tensor(context_ids, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1).to(device)

    # Right-pad the scaffold with '<' so it tokenizes to scaffold_max_len tokens
    pad_count = scaffold_max_len - len(regex.findall(scaffold))
    padded_scaffold = scaffold + '<' * pad_count

    # Encode the padded scaffold and copy it once per sample in the batch
    scaffold_ids = [stoi[tok] for tok in regex.findall(padded_scaffold)]
    sca = torch.tensor(scaffold_ids, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1).to(device)

    # Sample with the scaffold condition only.
    # NOTE(review): the model is configured with num_props=1 but prop=None is
    # passed — confirm GPT/sample accept a missing property.
    with torch.no_grad():
        y = sample(model, x, block_size, temperature=temperature, sample=True, top_k=top_k, prop=None, scaffold=sca)

    # Map token ids back to text and strip the '<' padding tokens
    generated_smiles = [
        ''.join(itos[int(tok_id)] for tok_id in token_row).replace('<', '')
        for token_row in y
    ]

    # Keep only the strings that get_mol could parse, plus their RDKit SMILES
    molecules = [parsed for parsed in map(get_mol, generated_smiles) if parsed]
    valid_smiles = [Chem.MolToSmiles(parsed) for parsed in molecules]

    # Record this round's results together with the scaffold that produced them
    all_scaffold_molecules += molecules
    all_scaffold_smiles += valid_smiles
    all_scaffold_conditions += [scaffold] * len(molecules)

    # Summarise the round and print at most the first three valid SMILES
    print(f"Generated {len(generated_smiles)} molecules, {len(molecules)} are valid.")
    for rank, smi in enumerate(valid_smiles[:3], start=1):
        print(f"{rank}. {smi}")

In [None]:
# Draw up to nine scaffold-conditioned molecules, each labelled with its scaffold
if all_scaffold_molecules:
    display_mols = all_scaffold_molecules[:9]
    display_scaffolds = all_scaffold_conditions[:9]

    scaffold_labels = [f"Scaffold: {scaf}" for scaf in display_scaffolds]
    img = Draw.MolsToGridImage(display_mols, molsPerRow=3, subImgSize=(300, 300),
                               legends=scaffold_labels)
    display(img)

## 7. Summary

In this notebook, we've demonstrated how to use MolGPT for molecular generation with different conditioning strategies:

1. **Unconditional Generation**: Generate molecules without any constraints
2. **Property-Conditioned Generation**: Generate molecules with specific property values (QED, LogP, SAS, TPSA)
3. **Scaffold-Conditioned Generation**: Generate molecules containing specific molecular scaffolds

The MolGPT model provides a powerful and flexible approach to molecular generation, allowing for precise control over the generated structures through various conditioning mechanisms.