In [1]:
import os
import pandas as pd
import tqdm
from rdkit import Chem

# 1. Raw data

In [2]:
# Path of the raw input file; it is read below as a headerless table whose
# first column holds the SMILES strings.
filename = 'all.txt'

In [3]:
# Load the headerless input file, then label its first column 'smiles'.
df = pd.read_csv(filename, header=None)
df = df.rename(columns={0: 'smiles'})

In [4]:
# Sanity check of the loaded data: dimensions first, then the first rows.
print(df.shape, df.head(), sep='\n')

(1488640, 1)
                                              smiles
0            c1cc(OCCCN2CCCCC2)ccc1CN1CCC2(CC1)OCCO2
1                      CC1COC(c2cccn2Cc2ccccc2Cl)=N1
2  Cc1ncn(-c2ccc(C#N)nc2-c2nc3cc(-c4cnc(N)nc4)ccc...
3     Cc1c(-c2ccc(-c3cccnc3)cc2)nc2ccc(F)cc2c1C(=O)O
4  Cn1c(=O)c2c(SCC(=O)N3CCOCC3)nc(-c3ccccc3F)nc2n...


# 2. Selecting molecules composed only of the 9 elements of importance to pharmaceutical research
- Ref: Rocha, Gerd B., et al. "RM1: A reparameterization of AM1 for H, C, N, O, P, S, F, Cl, Br, and I." Journal of Computational Chemistry 27.10 (2006): 1101-1111.
- Iodine is not used in this study because it is not contained in the known active molecules (see actives.txt)

In [5]:
# Element symbols a molecule may contain, listed by increasing atomic number.
atomicset = {
    'H',
    'C', 'N', 'O', 'F',
    'P', 'S', 'Cl',
    'Br',
}

In [6]:
# Filter the raw SMILES. A molecule is kept only if
#   * RDKit can parse it,
#   * its non-isomeric SMILES written by RDKit is 1-150 characters long, and
#   * every atom symbol it contains belongs to `atomicset`.
# Each kept molecule is stored as a (smiles, length) tuple.
records = []
# Iterate over the column itself instead of doing a scalar `df.loc[i, ...]`
# lookup per row: cheaper, and independent of the frame's index labels.
for raw_smiles in tqdm.tqdm(df['smiles'], total=df.shape[0]):
    try:
        mol = Chem.MolFromSmiles(raw_smiles)
        # MolFromSmiles reports an unparsable string by returning None.
        if mol is None:
            continue
        ## length
        smi = Chem.MolToSmiles(mol, isomericSmiles=False)
        lnt = len(smi)
        ## atoms
        atm = {atom.GetSymbol() for atom in mol.GetAtoms()}.issubset(atomicset)
    except Exception:
        # Best-effort filtering: skip entries RDKit cannot process (e.g. a
        # non-string cell). Unlike a bare `except:`, this lets
        # KeyboardInterrupt and SystemExit propagate, so the loop can still
        # be interrupted.
        continue
    ## check
    if 0 < lnt <= 150 and atm:
        records.append((smi, lnt))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1488640/1488640 [03:39<00:00, 6771.24it/s]


In [7]:
# Build the result table from the (smiles, length) tuples. Passing the column
# names directly (instead of renaming the positional columns 0 and 1
# afterwards) keeps both columns present even when `records` is empty, so the
# exported CSV always carries the 'smiles,length' header.
df_res = pd.DataFrame.from_records(records, columns=['smiles', 'length'])

In [8]:
# Write the filtered molecules as a comma-separated file (comma is the default
# separator of to_csv), leaving out the row index.
df_res.to_csv('all_processed.csv', index=False)