In [1]:
import os
import tqdm
import pandas as pd

In [2]:
# Load the full annotated molecule table from the working directory.
raw_csv_path = 'chembl_gsk3_jnk3_qed_sa.csv'
df_raw = pd.read_csv(raw_csv_path)

# Filtering by the number of rings

In [7]:
# Keep a single row per SMILES string (first occurrence wins, which is the
# drop_duplicates default) and renumber the index from 0.
df_raw = df_raw.drop_duplicates(subset=['smiles'], ignore_index=True)
# Number of rows that remain after de-duplication.
print(df_raw.shape[0])

1480293


In [8]:
# Keep only rows whose ring count is strictly below 10, then renumber rows
# from 0 so positional and label indexing agree downstream.
has_few_rings = df_raw['num_rings'] < 10
df_filtered = df_raw.loc[has_few_rings].reset_index(drop=True)

In [9]:
# Report how many rows survived the ring-count filter.
print(f"Total of molecules: {len(df_filtered)}")
# Bare expression: rendered as the cell's rich output.
df_filtered

Total of molecules: 1480102


Unnamed: 0,smiles,length,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2,39,0.01,0.00,2.692034,0.684442,27.0,4.0
1,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,29,0.06,0.00,3.047999,0.842972,19.0,3.0
2,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,60,0.24,0.19,3.263126,0.439582,34.0,5.0
3,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,46,0.09,0.12,2.117217,0.552710,27.0,4.0
4,Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...,53,0.00,0.01,2.530062,0.432354,31.0,4.0
...,...,...,...,...,...,...,...,...
1480097,CNCCOc1ccc2cc3ccc(OCCNC)cc3nc2c1,32,0.01,0.01,2.260887,0.492394,24.0,3.0
1480098,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,45,0.12,0.00,2.810497,0.626775,25.0,4.0
1480099,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,47,0.04,0.00,2.505654,0.653669,26.0,3.0
1480100,COOC1(OOCCCCCC(=O)O)CCCCCCCCCCC1,32,0.00,0.01,3.349466,0.250995,25.0,1.0


# Test data: known active molecules provided in
- https://raw.githubusercontent.com/mathcom/multiobj-rationale/master/data/dual_gsk3_jnk3/actives.txt

In [20]:
# Read the active-molecule table, then build the test set by keeping one row
# per unique SMILES string with a fresh 0-based index.
df_actives = pd.read_csv('actives.txt')
df_test = df_actives.drop_duplicates(ignore_index=True, subset=['smiles'])
df_test

Unnamed: 0,smiles,jnk3,gsk3
0,c1cc2ccc3ncc(-c4ccc(-c5ccn[nH]5)cc4)cc3c2cn1,0.91,0.52
1,c1ccc2c(-c3ccncc3)c[nH]c2c1,0.64,0.81
2,c1ccc2cc(-c3n[nH]cc3-c3ccncc3)ccc2c1,0.74,0.72
3,c1cc(-c2nccs2)c2nc(Nc3ccc(-n4cnc(N5CCOCC5)n4)c...,0.98,0.52
4,c1ccc(-c2ccc3c(-c4ccnc(Nc5ccc6c(c5)OCCO6)n4)cn...,0.51,0.98
...,...,...,...
310,O=[N+]([O-])c1ccc(Nc2nccc(-c3cnn4ncccc34)n2)cc1,0.51,0.99
311,O=[N+]([O-])C=Cc1ccc2c(c1)OCO2,0.63,0.62
312,O=S(=O)(c1cccc2cnccc12)N1CCCNCC1,0.72,0.68
313,O=S(=O)(NCCNCC=Cc1ccc(Br)cc1)c1cccc2cnccc12,0.69,0.90


# Splitting into Training and Test

In [14]:
# Set of test-set SMILES strings, used for constant-time membership checks.
test_molecules = set(df_test['smiles'])

In [15]:
# Boolean mask over df_filtered: True where the SMILES string is NOT one of
# the test molecules (exact string comparison, no canonicalisation is done here).
is_train = ~df_filtered['smiles'].isin(test_molecules)
# Show how many rows fall on each side of the split.
print(is_train.value_counts())

True     1479828
False        274
Name: smiles, dtype: int64


In [17]:
# Select the rows flagged as training data, renumber them from 0, and take an
# independent copy so later edits cannot touch df_filtered.
train_rows = df_filtered.loc[is_train]
df_train = train_rows.reset_index(drop=True).copy()
df_train

Unnamed: 0,smiles,length,gsk3,jnk3,sa,qed,num_atoms,num_rings
0,c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2,39,0.01,0.00,2.692034,0.684442,27.0,4.0
1,CC1COC(c2cccn2Cc2ccccc2Cl)=N1,29,0.06,0.00,3.047999,0.842972,19.0,3.0
2,Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...,60,0.24,0.19,3.263126,0.439582,34.0,5.0
3,Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O,46,0.09,0.12,2.117217,0.552710,27.0,4.0
4,Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...,53,0.00,0.01,2.530062,0.432354,31.0,4.0
...,...,...,...,...,...,...,...,...
1479823,CNCCOc1ccc2cc3ccc(OCCNC)cc3nc2c1,32,0.01,0.01,2.260887,0.492394,24.0,3.0
1479824,Cc1nnc2n1-c1c(F)cc(-c3cncc(C(F)(F)F)c3)cc1CC2,45,0.12,0.00,2.810497,0.626775,25.0,4.0
1479825,Cc1c(Cc2cccnc2)c(=O)oc2cc(OC(=O)N(C)C)c(Cl)cc12,47,0.04,0.00,2.505654,0.653669,26.0,3.0
1479826,COOC1(OOCCCCCC(=O)O)CCCCCCCCCCC1,32,0.00,0.01,3.349466,0.250995,25.0,1.0


# Vocabulary check

In [18]:
# Character-level vocabulary of the training SMILES: every distinct character
# that appears in any training string.
vocab_tr = {ch for smi in df_train["smiles"] for ch in smi}
print(len(vocab_tr))
print(sorted(vocab_tr))

32
['#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', 'B', 'C', 'F', 'H', 'N', 'O', 'P', 'S', '[', ']', 'c', 'l', 'n', 'o', 'p', 'r', 's']


In [21]:
# Character-level vocabulary of the test SMILES: every distinct character
# that appears in any test string.
vocab_te = {ch for smi in df_test["smiles"] for ch in smi}
print(len(vocab_te))
print(sorted(vocab_te))

28
['#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '=', 'B', 'C', 'F', 'H', 'N', 'O', 'P', 'S', '[', ']', 'c', 'l', 'n', 'o', 'r', 's']


In [22]:
# True when every character seen in the test SMILES also occurs in training.
vocab_te <= vocab_tr

True

# Save

In [23]:
# Persist the training split twice: the full annotated table, and a plain
# text file holding one SMILES string per line.
df_train.to_csv("chembl_train_full.csv", index=False)
# header=False is the documented way to omit the header row (the parameter
# is typed bool | list[str]); header=None only worked because it is falsy.
df_train["smiles"].to_csv("chembl_train.txt", index=False, header=False)

In [24]:
# Persist the test split twice: the full annotated table, and a plain text
# file holding one SMILES string per line.
df_test.to_csv("chembl_test_full.csv", index=False)
# header=False is the documented way to omit the header row (the parameter
# is typed bool | list[str]); header=None only worked because it is falsy.
df_test["smiles"].to_csv("chembl_test.txt", index=False, header=False)