In [None]:
import os
import tqdm
import pandas as pd
import networkx as nx
from rdkit import Chem

In [None]:
# Load the input table from CSV; later cells rely on its 'smiles' column.
df_raw = pd.read_csv('zinc15_canonical_to_bcl2family.csv')

# Filtering by the number of rings

In [None]:
# Keep the first occurrence of each SMILES and renumber the rows from 0.
df_raw = df_raw.drop_duplicates(subset=['smiles'], ignore_index=True)

In [None]:
def GetNumRings(smi):
    '''
    Count the rings of the molecule described by a SMILES string.

    The molecule's adjacency matrix is loaded into an undirected networkx
    graph and the number of cycles in that graph's cycle basis is returned.

    Reference: https://github.com/wengong-jin/iclr19-graph2graph/blob/master/props/properties.py

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.

    Returns
    -------
    int
        Number of cycles in the cycle basis of the molecular graph.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` into a molecule.
    '''
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending SMILES instead of an opaque argument error from
        # GetAdjacencyMatrix further down.
        raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
    cycle_list = nx.cycle_basis(nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol)))
    return len(cycle_list)

In [None]:
# Compute the ring count of every SMILES (with a progress bar) and store the
# result as the 'cnt_ring' column of df_raw.
cnt_rings = [
    GetNumRings(smi)
    for smi in tqdm.tqdm(df_raw['smiles'].values, total=len(df_raw))
]

df_raw['cnt_ring'] = cnt_rings

In [None]:
# Keep only the molecules with fewer than 10 rings.
# NOTE: reset_index() is called without drop=True, so the previous row labels
# are retained as an extra 'index' column in df_filtered.
df_filtered = df_raw.loc[df_raw['cnt_ring'].lt(10)].reset_index()

In [None]:
# Report how many molecules passed the ring-count filter, then display the frame.
print(f"Total of molecules: {len(df_filtered)}")
df_filtered

# Test data: three Bcl-2 family inhibitors

In [None]:
def get_canonical(smi):
    '''
    Convert a SMILES string to RDKit's canonical SMILES without stereochemistry.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.

    Returns
    -------
    str
        Canonical SMILES produced by ``Chem.MolToSmiles`` with
        ``isomericSmiles=False``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` into a molecule.
    '''
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; raise a
        # clear error instead of letting MolToSmiles reject the None argument.
        raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=False)

In [None]:
# Each reference SMILES is passed through get_canonical so that it can be
# compared against the 'smiles' column of the dataset in the cells below.

## PubChem 24978538
smi_navitoclax = get_canonical(
    'CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(=O)(=O)C4=CC(=C(C=C4)NC(CCN5CCOCC5)CSC6=CC=CC=C6)S(=O)(=O)C(F)(F)F)C7=CC=C(C=C7)Cl)C'
)

## PubChem 11228183
smi_abt737 = get_canonical(
    'CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O)NC(=O)C3=CC=C(C=C3)N4CCN(CC4)CC5=CC=CC=C5C6=CC=C(C=C6)Cl)[N+](=O)[O-]'
)

## PubChem 49846579
smi_venetoclax = get_canonical(
    'CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC(=C(C=C4)C(=O)NS(=O)(=O)C5=CC(=C(C=C5)NCC6CCOCC6)[N+](=O)[O-])OC7=CN=C8C(=C7)C=CN8)C'
)

In [None]:
# Is the canonical navitoclax SMILES present in the filtered dataset?
smi_navitoclax in df_filtered['smiles'].to_numpy()

In [None]:
# Is the canonical ABT-737 SMILES present in the filtered dataset?
smi_abt737 in df_filtered['smiles'].to_numpy()

In [None]:
# Is the canonical venetoclax SMILES present in the filtered dataset?
smi_venetoclax in df_filtered['smiles'].to_numpy()

# Splitting into Training and Test

In [None]:
# Boolean mask marking the rows whose SMILES is one of the three reference
# inhibitors. Series.isin builds the lookup once (the previous per-row lambda
# re-created the set for every row) and always returns a boolean mask, even
# when df_filtered is empty.
idx_test = df_filtered['smiles'].isin([smi_navitoclax, smi_abt737, smi_venetoclax])
print(idx_test.value_counts())

In [None]:
# Test split: the rows flagged by idx_test, copied and renumbered from 0.
df_test = df_filtered.loc[idx_test].copy().reset_index(drop=True)
df_test

In [None]:
# Training split: every row not flagged by idx_test, copied and renumbered from 0.
df_train = df_filtered.loc[~idx_test].copy().reset_index(drop=True)
df_train

# Vocabulary check

In [None]:
# Character-level vocabulary of the training SMILES: every distinct character
# that appears in any training string. Print its size and its sorted contents.
vocab_tr = {ch for smi in df_train["smiles"] for ch in smi}
print(len(vocab_tr))
print(sorted(vocab_tr))

In [None]:
# Character-level vocabulary of the test SMILES: every distinct character
# that appears in any test string. Print its size and its sorted contents.
vocab_te = {ch for smi in df_test["smiles"] for ch in smi}
print(len(vocab_te))
print(sorted(vocab_te))

In [None]:
# True when every character used by the test SMILES also occurs in the training SMILES.
vocab_te <= vocab_tr

# Save

In [None]:
# Write the training split twice: the full table as CSV (no index column), and
# the 'smiles' column alone as a text file with neither header nor index.
df_train.to_csv("zinc15_train_full.csv", index=False)
df_train["smiles"].to_csv("zinc15_train.txt", header=False, index=False)

In [None]:
# Write the test split twice: the full table as CSV (no index column), and
# the 'smiles' column alone as a text file with neither header nor index.
df_test.to_csv("zinc15_test_full.csv", index=False)
df_test["smiles"].to_csv("zinc15_test.txt", header=False, index=False)