In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [3]:
PROPERTY_NAME = "logp04"

df_train_pairs = pd.read_csv(os.path.join(PROPERTY_NAME, "rdkit_train_pairs.txt"), sep=" ", header=None); print(df_train_pairs.shape)
list_smi_src = df_train_pairs.iloc[:,0].values.tolist()
list_smi_tar = df_train_pairs.iloc[:,1].values.tolist()

df_valid = pd.read_csv(os.path.join(PROPERTY_NAME, "rdkit_valid.txt"), header=None); print(df_valid.shape)
list_smi_valid = df_valid.iloc[:,0].values.tolist()

df_test = pd.read_csv(os.path.join(PROPERTY_NAME, "rdkit_test.txt"), header=None); print(df_test.shape)
list_smi_test = df_test.iloc[:,0].values.tolist()

(98690, 2)
(200, 1)
(800, 1)


In [4]:
#df_chembl = pd.read_csv(os.path.join("chembl", "preprocessed_chembl_29_chemreps.csv")); print(df_chembl.shape)
#list_smi_chembl = df_chembl.iloc[:,0].values.tolist()

In [5]:
#list_chembl_unique = list(set(list_smi_chembl) - set(list_smi_src).union(set(list_smi_tar)).union(set(list_smi_valid)).union(set(list_smi_test)))
#print(len(list_chembl_unique))

In [6]:
list_chembl_unique = list(set(list_smi_src + list_smi_tar))
print(len(list_chembl_unique))

99066


In [7]:
list_triplet = []
K = 20

for i, (smi_src, smi_tar) in tqdm(enumerate(zip(list_smi_src, list_smi_tar)), total=len(list_smi_src)):
    batch_list_triplet = []
    ## mol
    mol_src = Chem.MolFromSmiles(smi_src)
    mol_tar = Chem.MolFromSmiles(smi_tar)
    ## fingerprint
    fp_src = AllChem.GetMorganFingerprintAsBitVect(mol_src, radius=2, nBits=2048, useChirality=False)
    fp_tar = AllChem.GetMorganFingerprintAsBitVect(mol_tar, radius=2, nBits=2048, useChirality=False)
    
    ## Shuffle
    random.shuffle(list_chembl_unique)
    
    for smi_chembl in list_chembl_unique:
        mol_chembl = Chem.MolFromSmiles(smi_chembl)
        fp_chembl = AllChem.GetMorganFingerprintAsBitVect(mol_chembl, radius=2, nBits=2048, useChirality=False)
        ## Tanimoto
        sim_src = DataStructs.TanimotoSimilarity(fp_src, fp_chembl)
        sim_tar = DataStructs.TanimotoSimilarity(fp_tar, fp_chembl)
        ## check
        if sim_src < 0.3 and sim_tar < 0.3:
            batch_list_triplet.append((smi_src, smi_tar, smi_chembl))
        ## stop
        if len(batch_list_triplet) == K:
            break
            
    if len(batch_list_triplet) < K:
        print(f"[WARNING] {i} has insufficient data ({len(batch_list_triplet)} < {K})")
        
    list_triplet.extend(batch_list_triplet)

100%|██████████| 98690/98690 [1:06:38<00:00, 24.68it/s]


In [8]:
df_triplet = pd.DataFrame(list_triplet)

df_triplet.to_csv(os.path.join(PROPERTY_NAME, "rdkit_train_triplet.txt"), sep=" ", header=None, index=False)