In [1]:
import os
import pandas as pd
import tqdm
from rdkit import Chem

# 1. Raw data

In [2]:
input_dir = 'raw'
filenames = os.listdir(input_dir)

In [3]:
frames = []
for filename in filenames:
    df = pd.read_csv(os.path.join(input_dir, filename), sep='\t')
    print(f'[{filename}] : {df.shape}')
    frames.append(df)

[KIAA.txt] : (6725, 9)
[KDBC.txt] : (22, 9)
[KAAA.txt] : (1574, 9)
[KFBD.txt] : (379, 9)
[KABA.txt] : (69, 9)
[KIBD.txt] : (181, 9)
[KDBB.txt] : (536, 9)
[KFAC.txt] : (292, 9)
[KFCA.txt] : (246, 9)
[KBCA.txt] : (63, 9)
[KJAD.txt] : (1941, 9)
[KJBC.txt] : (26, 9)
[KBCB.txt] : (172, 9)
[KHAC.txt] : (299, 9)
[KDCC.txt] : (58, 9)
[KHCA.txt] : (575, 9)
[KFAA.txt] : (5255, 9)
[KCAC.txt] : (301, 9)
[KECD.txt] : (13, 9)
[KCAA.txt] : (2917, 9)
[KICA.txt] : (753, 9)
[KECA.txt] : (123, 9)
[KBAB.txt] : (2125, 9)
[KBAA.txt] : (1106, 9)
[KCBB.txt] : (202, 9)
[KJAC.txt] : (405, 9)
[KFCD.txt] : (6, 9)
[KDCA.txt] : (162, 9)
[KHAA.txt] : (6536, 9)
[KHBA.txt] : (451, 9)
[KBCD.txt] : (5, 9)
[KKAC.txt] : (3013, 9)
[KGCC.txt] : (39, 9)
[KHBD.txt] : (280, 9)
[KDAB.txt] : (9409, 9)
[KABB.txt] : (98, 9)
[KKCB.txt] : (78459, 9)
[KFBB.txt] : (1734, 9)
[KCAB.txt] : (3862, 9)
[KHCD.txt] : (11, 9)
[KKAB.txt] : (417216, 9)
[KKBB.txt] : (34005, 9)
[KBAD.txt] : (145, 9)
[KCCC.txt] : (34, 9)
[KECB.txt] : (708, 9)
[KCCA

In [4]:
df_merged = pd.concat(frames, ignore_index=True)

In [5]:
print(df_merged.shape)
print(df_merged.head())

(1029500, 9)
                                              smiles    zinc_id  \
0  COc1cc(C(=O)N[C@H](NC(=S)Nc2ccc(C)cc2)C(Cl)(Cl...    1218802   
1  CCCCN1C(=O)/C(=c2\sc3n(c2=O)[C@@H](c2cccs2)C(C...  100872142   
2  Cc1ccccc1-c1cccc(CNc2cc(C(=O)NC3CC3)cc(S(=O)(=...  102395685   
3  Cc1cc(C)c(S(=O)(=O)N(CC(=O)N[C@@H]2CCCC[C@H]2C...  230062739   
4  COc1cc2c(c(OC)c1OC)-c1ccc(N[C@@H](C)C(=O)Nc3cc...  604406326   

                      inchikey      mwt   logp  reactive  purchasable  \
0  IITNLGRVWMWOSO-GOSISDBHSA-N  506.839  4.434         0           50   
1  HVUVUUUKHXJAFU-FEYWKWKCSA-N  535.691  4.013         0           50   
2  NJBNXWGVFKHKFY-QHCPKHFHSA-N  546.693  4.052         0           50   
3  IBEPCVOWDIHUJL-CJFMBICVSA-N  538.714  4.300         0           50   
4  ITOLYQNCUZRIDN-SBUREZEXSA-N  585.661  4.189         0           50   

  tranche_name  features  
0         KIAA       NaN  
1         KIAA       NaN  
2         KIAA       NaN  
3         KIAA       NaN  
4         

# 2. Selecting molecules with 9 atoms of importance to pharmaceutical research
- Ref: Rocha, Gerd B., et al. "Rm1: A reparameterization of am1 for h, c, n, o, p, s, f, cl, br, and i." Journal of computational chemistry 27.10 (2006): 1101-1111.

In [6]:
atomicset = {'C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'H', 'P'}

In [7]:
records = []
for i in tqdm.trange(df_merged.shape[0]):
    lnt = 0
    atm = False
    idx = df_merged.loc[i, 'zinc_id']
    mwt = df_merged.loc[i, 'mwt']
    lgp = df_merged.loc[i, 'logp']
    try:
        mol = Chem.MolFromSmiles(df_merged.loc[i, 'smiles'])
        ## length
        smi = Chem.MolToSmiles(mol, isomericSmiles=False)
        lnt = len(smi)
        ## atoms
        atm = {atom.GetSymbol() for atom in mol.GetAtoms()}.issubset(atomicset)
    except:
        pass
    ## check
    if 0 < lnt <= 150 and atm:
        records.append((idx, smi, mwt, lgp, lnt))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1029500/1029500 [03:39<00:00, 4686.21it/s]


In [8]:
df_res = pd.DataFrame.from_records(records)
df_res = df_res.rename(columns={0:'zinc_id', 1:'smiles', 2:'mwt', 3:'logp', 4:'length'})

In [9]:
df_res.to_csv('zinc15_raw_to_canonical.csv', sep=',', index=False)