In [1]:
import os
import tqdm
import pandas as pd
import networkx as nx
from rdkit import Chem

In [2]:
# Load the raw table from the CSV file in the working directory.
df_raw = pd.read_csv("zinc15_canonical_to_bcl2family.csv")

# Filtering by the number of rings

In [3]:
# Drop rows whose 'smiles' value repeats an earlier row and renumber the index from 0.
df_raw = df_raw.drop_duplicates(subset=['smiles'], ignore_index=True)

In [4]:
def GetNumRings(smi):
    '''
    Count the rings of the molecule described by a SMILES string.

    The molecule's adjacency matrix is converted to an undirected networkx
    graph and the number of cycles in its cycle basis is returned.

    Reference: https://github.com/wengong-jin/iclr19-graph2graph/blob/master/props/properties.py

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.

    Returns
    -------
    int
        Number of cycles in the cycle basis of the molecular graph.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` into a molecule.
    '''
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None instead of raising on a parse failure;
        # fail here with a readable message rather than inside RDKit.
        raise ValueError(f"Invalid SMILES string: {smi!r}")
    mol_graph = nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol))
    return len(nx.cycle_basis(mol_graph))

In [5]:
# Compute the ring count of every molecule (tqdm shows progress) and attach
# the results to df_raw as the 'cnt_ring' column.
cnt_rings = [
    GetNumRings(smi)
    for smi in tqdm.tqdm(df_raw.loc[:, 'smiles'].values, total=len(df_raw))
]

df_raw.loc[:, 'cnt_ring'] = cnt_rings

100%|██████████| 600778/600778 [05:00<00:00, 1997.28it/s]


In [6]:
# Keep molecules with fewer than 10 rings. reset_index() is called without
# drop=True, so the previous row labels are kept in a new 'index' column.
df_filtered = df_raw.loc[df_raw['cnt_ring'].lt(10)].reset_index()

In [7]:
# Report how many molecules remain after the ring filter, then display the frame.
print("Total of molecules: {}".format(len(df_filtered)))
df_filtered

Total of molecules: 600119


Unnamed: 0,index,zinc_id,smiles,mwt,logp,length,ba_bcl2,ba_bclxl,ba_bclw,cnt_ring
0,0,100294498,CCCCCCc1cn(C2CC(O)C(COP(=O)(O)OP(=O)(O)OP(=O)(...,552.303,0.651,66,6.027321,5.739085,5.318719,2
1,1,267331246,CC(C)=CC1COC23CC4(CO2)C(CCC2C5(C)CCC(OC6OCC(O)...,899.081,0.463,112,5.350236,4.991594,4.677246,9
2,2,223705858,CC(=O)OCC1(C(=O)O)C(CC(=O)O)=C(C(=O)OC2=CCOC=C...,538.414,0.210,77,4.933928,4.714845,5.049777,2
3,3,15721567,CN(CCc1ccccc1)C(=O)CNC(=O)C(CCS(C)=O)NC(=O)C(N...,502.637,0.333,59,5.222250,5.273129,4.764635,2
4,4,575417981,CC(=O)NC1C(OC2C(COC(C)=O)OC(Oc3ccc4c(C)cc(=O)o...,792.744,0.246,102,4.905717,4.914623,4.521364,4
...,...,...,...,...,...,...,...,...,...,...
600114,600773,409268530,COc1cc(C=C2SC(=S)N(Cc3ccco3)C2=O)ccc1OCC(=O)Nc...,529.039,5.669,60,5.851861,5.564353,5.456973,4
600115,600774,409344691,CCOC(=O)COc1c(Br)cc(C=C2SC(=S)N(c3cccc(SC)c3)C...,554.509,5.527,55,6.331471,6.066432,5.528964,3
600116,600775,409352639,COc1cc(C=C2SC(=S)N(c3cccc(C(F)(F)F)c3)C2=O)cc(...,504.349,5.891,53,6.018271,5.518678,5.613740,3
600117,600776,409369685,COc1cc(C=C2SC(=S)N(NC(=O)Nc3ccc(Cl)c(Cl)c3)C2=...,636.451,6.549,87,6.204907,6.185677,6.270988,4


# Test data: three Bcl-2 family inhibitors

In [8]:
def get_canonical(smi):
    '''
    Return the RDKit SMILES for ``smi``, generated with ``isomericSmiles=False``.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.

    Returns
    -------
    str
        SMILES string produced by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` into a molecule.
    '''
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None instead of raising on a parse failure;
        # fail here with a readable message rather than inside MolToSmiles.
        raise ValueError(f"Invalid SMILES string: {smi!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=False)

In [9]:
## PubChem 24978538
# Each reference inhibitor is passed through get_canonical() so that it can be
# compared against the 'smiles' column of the dataset.

# Navitoclax -- PubChem 24978538
smi_navitoclax = get_canonical(
    'CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(=O)(=O)C4=CC(=C(C=C4)NC(CCN5CCOCC5)CSC6=CC=CC=C6)S(=O)(=O)C(F)(F)F)C7=CC=C(C=C7)Cl)C'
)

# ABT-737 -- PubChem 11228183
smi_abt737 = get_canonical(
    'CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O)NC(=O)C3=CC=C(C=C3)N4CCN(CC4)CC5=CC=CC=C5C6=CC=C(C=C6)Cl)[N+](=O)[O-]'
)

# Venetoclax -- PubChem 49846579
smi_venetoclax = get_canonical(
    'CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC(=C(C=C4)C(=O)NS(=O)(=O)C5=CC(=C(C=C5)NCC6CCOCC6)[N+](=O)[O-])OC7=CN=C8C(=C7)C=CN8)C'
)

In [10]:
# Is navitoclax present among the filtered molecules?
smi_navitoclax in df_filtered['smiles'].to_numpy()

True

In [11]:
# Is ABT-737 present among the filtered molecules?
smi_abt737 in df_filtered['smiles'].to_numpy()

True

In [12]:
# Is venetoclax present among the filtered molecules?
smi_venetoclax in df_filtered['smiles'].to_numpy()

True

# Splitting into Training and Test

In [13]:
# Boolean mask: True for rows whose SMILES is one of the three reference inhibitors.
idx_test = df_filtered['smiles'].isin([smi_navitoclax, smi_abt737, smi_venetoclax])
print(idx_test.value_counts())

False    600116
True          3
Name: smiles, dtype: int64


In [14]:
# Test set: the rows selected by idx_test, re-indexed from 0.
df_test = df_filtered.loc[idx_test].reset_index(drop=True).copy()
df_test

Unnamed: 0,index,zinc_id,smiles,mwt,logp,length,ba_bcl2,ba_bclxl,ba_bclw,cnt_ring
0,168705,150338726,CC1(C)CCC(c2ccc(Cl)cc2)=C(CN2CCN(c3ccc(C(=O)NS...,974.634,8.833,118,9.745277,7.524013,6.596804,7
1,456720,150368814,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,813.446,7.881,100,8.391831,9.042492,7.400223,6
2,457667,150338755,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,868.457,8.66,118,10.44794,8.814969,6.717071,8


In [15]:
# Training set: every row not selected by idx_test, re-indexed from 0.
df_train = df_filtered.loc[~idx_test].reset_index(drop=True).copy()
df_train

Unnamed: 0,index,zinc_id,smiles,mwt,logp,length,ba_bcl2,ba_bclxl,ba_bclw,cnt_ring
0,0,100294498,CCCCCCc1cn(C2CC(O)C(COP(=O)(O)OP(=O)(O)OP(=O)(...,552.303,0.651,66,6.027321,5.739085,5.318719,2
1,1,267331246,CC(C)=CC1COC23CC4(CO2)C(CCC2C5(C)CCC(OC6OCC(O)...,899.081,0.463,112,5.350236,4.991594,4.677246,9
2,2,223705858,CC(=O)OCC1(C(=O)O)C(CC(=O)O)=C(C(=O)OC2=CCOC=C...,538.414,0.210,77,4.933928,4.714845,5.049777,2
3,3,15721567,CN(CCc1ccccc1)C(=O)CNC(=O)C(CCS(C)=O)NC(=O)C(N...,502.637,0.333,59,5.222250,5.273129,4.764635,2
4,4,575417981,CC(=O)NC1C(OC2C(COC(C)=O)OC(Oc3ccc4c(C)cc(=O)o...,792.744,0.246,102,4.905717,4.914623,4.521364,4
...,...,...,...,...,...,...,...,...,...,...
600111,600773,409268530,COc1cc(C=C2SC(=S)N(Cc3ccco3)C2=O)ccc1OCC(=O)Nc...,529.039,5.669,60,5.851861,5.564353,5.456973,4
600112,600774,409344691,CCOC(=O)COc1c(Br)cc(C=C2SC(=S)N(c3cccc(SC)c3)C...,554.509,5.527,55,6.331471,6.066432,5.528964,3
600113,600775,409352639,COc1cc(C=C2SC(=S)N(c3cccc(C(F)(F)F)c3)C2=O)cc(...,504.349,5.891,53,6.018271,5.518678,5.613740,3
600114,600776,409369685,COc1cc(C=C2SC(=S)N(NC(=O)Nc3ccc(Cl)c(Cl)c3)C2=...,636.451,6.549,87,6.204907,6.185677,6.270988,4


# Vocabulary check

In [16]:
# Collect the distinct characters that occur in the training SMILES strings.
vocab_tr = set()
for smi in df_train["smiles"]:
    vocab_tr.update(smi)
print(len(vocab_tr))
print(sorted(vocab_tr))

32
['#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', 'B', 'C', 'F', 'H', 'N', 'O', 'P', 'S', '[', ']', 'c', 'l', 'n', 'o', 'p', 'r', 's']


In [17]:
# Collect the distinct characters that occur in the test SMILES strings.
vocab_te = set()
for smi in df_test["smiles"]:
    vocab_te.update(smi)
print(len(vocab_te))
print(sorted(vocab_te))

21
['(', ')', '+', '-', '1', '2', '3', '4', '5', '=', 'C', 'F', 'H', 'N', 'O', 'S', '[', ']', 'c', 'l', 'n']


In [18]:
# True when every character seen in the test SMILES also occurs in the training SMILES.
vocab_te <= vocab_tr

True

# Save

In [19]:
# Write the full training table, then the SMILES column alone
# (one string per line, no index and no header row).
df_train.to_csv("zinc15_train_full.csv", index=False)
df_train["smiles"].to_csv("zinc15_train.txt", index=False, header=False)

In [20]:
# Write the full test table, then the SMILES column alone
# (one string per line, no index and no header row).
df_test.to_csv("zinc15_test_full.csv", index=False)
df_test["smiles"].to_csv("zinc15_test.txt", index=False, header=False)