## Structure-constrained molecular generation using COMA

## 1. Import libraries

In [1]:
import os
import pandas as pd
import time
import tqdm
import torch
from torch.utils.data import DataLoader
from rdkit.Chem.rdmolfiles import MolFromSmiles

In [2]:
from coma.dataset import ValidationSmilesDataset
from coma.vae import SmilesAutoencoder
from coma.properties import drd2, qed, penalized_logp, similarity

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


## 2. Configure GPU (if available)

In [3]:
# Run on the first CUDA device when one is available; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
print(device)

cuda:0


## 3. Specify a target property

In [4]:
# Target property to optimize. Valid choices are the keys of SCORING_FUNCTIONS;
# the name also selects the data and checkpoint directories used below.
PROPERTY_NAME = "drd2"

# Scorer for each supported property. Both logP settings share the same
# penalized-logP scorer, so "logp04" and "logp06" differ only in PROPERTY_NAME.
SCORING_FUNCTIONS = {
    "drd2": drd2,
    "qed": qed,
    "logp04": penalized_logp,
    "logp06": penalized_logp,
}

# Deriving the scorer from the name keeps the two settings from drifting apart.
# An unsupported name fails right here with a KeyError.
SCORING_FT = SCORING_FUNCTIONS[PROPERTY_NAME]

## 4. Specify input file locations

In [5]:
# Data directory: ../data/<property>, resolved to an absolute path from the
# current working directory.
data_root = os.path.join(os.pardir, "data", PROPERTY_NAME)
input_data_dir = os.path.abspath(data_root)

# Checkpoint directory, kept relative to the current working directory.
input_ckpt_dir = f"outputs_2_finetuning_{PROPERTY_NAME}"

In [6]:
# Test-set file that is handed to ValidationSmilesDataset further down.
test_filename = "rdkit_test.txt"
filepath_test = os.path.join(input_data_dir, test_filename)

In [7]:
# Checkpoint, configuration and char-to-index files, all read from the
# checkpoint directory.
filepath_pretrain_ckpt, filepath_pretrain_configs, filepath_pretrain_char2idx = (
    os.path.join(input_ckpt_dir, filename)
    for filename in ("checkpoints.pt", "configs.csv", "char2idx.csv")
)

## 5. Specify output file locations

In [8]:
# Directory that receives the generation results for the selected property.
output_dir = f"outputs_4_generation_{PROPERTY_NAME}"

# makedirs with exist_ok=True is idempotent on re-runs, creates missing parent
# directories, and avoids the race between a separate exists() check and mkdir().
os.makedirs(output_dir, exist_ok=True)

In [9]:
# CSV file that the scored molecule pairs are written to in the final step.
output_filename = f"COMA_{PROPERTY_NAME}.csv"
filepath_output = os.path.join(output_dir, output_filename)

## 6. Load the test dataset

In [10]:
# Build the test dataset from the test file and the char-to-index file that
# belongs to the checkpoint.
dataset_test = ValidationSmilesDataset(
    filepath_test,
    filepath_pretrain_char2idx,
    device=device,
)

## 7. Load a pretrained generator of COMA

In [11]:
## Model configuration
# All architecture and vocabulary settings are passed as None; presumably the
# model fills them in from `filepath_config` (TODO confirm in SmilesAutoencoder).
model_configs = dict.fromkeys(
    (
        "hidden_size",
        "latent_size",
        "num_layers",
        "vocab_size",
        "sos_idx",
        "eos_idx",
        "pad_idx",
    )
)
model_configs.update(device=device, filepath_config=filepath_pretrain_configs)

## Model initialization
generator = SmilesAutoencoder(**model_configs)

## Load pretrained model
generator.load_model(filepath_pretrain_ckpt)

## 8. Generate molecules

In [12]:
K = 20  # number of times each source molecule is translated

generated = []  # (source SMILES, generated SMILES or "None") pairs

# batch_size must stay at 1: only the first decoded sequence of each batch is
# read inside the loop.
test_loader = DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    pin_memory=use_cuda,
)

for batch in tqdm.tqdm(test_loader):
    batch_length = batch["length_s"]
    batch_smiles = dataset_test.encode(batch["smiles_s"], batch_length.max())

    # The stored source string is recorded without its first and last character
    # (presumably start/end markers -- TODO confirm against the dataset class).
    source_smiles = batch["smiles_s"][0][1:-1]

    ## translation
    for _ in range(K):
        seq = generator.predict(batch_smiles, batch_length)
        smi = dataset_test.decode(seq)[0]
        # Outputs that RDKit cannot parse are recorded as the string "None".
        is_valid = MolFromSmiles(smi) is not None
        generated.append((source_smiles, smi if is_valid else "None"))

df_generated = pd.DataFrame.from_records(generated)

100%|██████████| 1000/1000 [06:36<00:00,  2.52it/s]


## 9. Evaluate the generated molecules

In [13]:
# One (source, generated, similarity, property score) tuple per generated pair.
scores = []
for source_smiles, generated_smiles in tqdm.tqdm(df_generated.values):
    # The generation step marks unparsable outputs with the string "None";
    # turn that marker back into a real None before scoring.
    target_smiles = None if generated_smiles == "None" else generated_smiles
    sim2D = similarity(source_smiles, target_smiles)

    # Best effort: any scorer failure is recorded as a score of 0.
    try:
        property_score = SCORING_FT(target_smiles)
    except Exception:
        property_score = 0.

    scores.append((source_smiles, target_smiles, sim2D, property_score))

100%|██████████| 20000/20000 [01:35<00:00, 209.55it/s]


In [14]:
# Collect the scored pairs into a table, report its size, and preview the first rows.
df_scores = pd.DataFrame.from_records(data=scores)
print(df_scores.shape)
df_scores.head(n=5)

(20000, 4)


Unnamed: 0,0,1,2,3
0,N#CC1=CC=CC=C1COC1=CC=CC(C(=O)N2CCN(C3=CC=C(Br...,N#CC1=CC=CC(CN2CCN(C3=CC=CC(C(F)(F)F)=C3)CC2)=C1,0.195122,0.902995
1,N#CC1=CC=CC=C1COC1=CC=CC(C(=O)N2CCN(C3=CC=C(Br...,N#CC1=CC=CC(CN2CCN(C3=CC=CC=N3)CC2)C=C1,0.246914,0.16025
2,N#CC1=CC=CC=C1COC1=CC=CC(C(=O)N2CCN(C3=CC=C(Br...,N#CC1=CC=CC(CN2CCN(C3=CC=CC(C(F)(F)F)=C3)CC2)=C1,0.195122,0.902995
3,N#CC1=CC=CC=C1COC1=CC=CC(C(=O)N2CCN(C3=CC=C(Br...,N#CC1=CC=CC(C(O)CN2CCN(C3=CC=CC=C3)CC2)=C1,0.230769,0.383795
4,N#CC1=CC=CC=C1COC1=CC=CC(C(=O)N2CCN(C3=CC=C(Br...,N#CC1=CC=CC=C1N1CCN(CCCCN2CCN(C3=CC=CC(C(F)(F)...,0.265823,0.989971


## 10. Save the results

In [15]:
# Write the score table without a header row or index column.
# header=False is the documented way to omit the header; the previous
# header=None only had that effect because None is falsy.
df_scores.to_csv(filepath_output, header=False, index=False)