In [1]:
import copy
import csv
import os

import numpy as np
import pandas as pd

from IPython.display import display

import selfies as sf
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import MolFromSmiles as smi2mol
from rdkit.Chem import MolToSmiles as mol2smi

In [31]:
def filter_mol(mol, max_heavy_atoms=50, min_heavy_atoms=10, element_list=[6,7,8,9,16,17,35]):
    """Decide whether a molecule passes the size and element filters.

    Args:
        mol: Object exposing ``GetNumHeavyAtoms()`` and ``GetAtoms()`` (whose
            items expose ``GetAtomicNum()``), or ``None``.
        max_heavy_atoms: Exclusive upper bound on the heavy-atom count.
        min_heavy_atoms: Exclusive lower bound on the heavy-atom count.
        element_list: Atomic numbers that are allowed. The default list is
            shared between calls but is only read here, never modified.

    Returns:
        bool: ``True`` when ``mol`` is not ``None``, its heavy-atom count lies
        strictly between the two bounds, and every atom's atomic number is in
        ``element_list``; ``False`` in every other case (including ``None``).
    """
    # A missing molecule is rejected explicitly so callers always get a bool.
    if mol is None:
        return False
    # Both bounds are strict: counts equal to either limit are rejected.
    if not min_heavy_atoms < mol.GetNumHeavyAtoms() < max_heavy_atoms:
        return False
    return all(atom.GetAtomicNum() in element_list for atom in mol.GetAtoms())
        
def canonicalize_smiles_from_file(fname):
    """Read a SMILES file and return the filtered SMILES written by ``mol2smi``.

    The first whitespace-delimited token of each line is taken as the SMILES
    string, so space- or tab-separated extra columns and the trailing newline
    are ignored. Blank lines are skipped. Each token is parsed with
    ``smi2mol``; molecules rejected by ``filter_mol`` are dropped.

    Args:
        fname: Path of the text file to read, one molecule per line.

    Returns:
        list: Output of ``mol2smi`` for every molecule accepted by
        ``filter_mol``, in file order.
    """
    smiles_list = []
    with open(fname, 'r') as f:
        for i, line in enumerate(f):
            # Progress report, counting every line read (including skipped ones).
            if i % 100000 == 0:
                print("{} lines processed.".format(i))
            # split() with no argument splits on any whitespace run and strips
            # the newline, unlike split(" ") which only breaks on single spaces.
            fields = line.split()
            if not fields:
                continue
            mol = smi2mol(fields[0])
            if filter_mol(mol):
                smiles_list.append(mol2smi(mol))
    print("{} SMILES retrieved".format(len(smiles_list)))
    return smiles_list
    
def write_smiles_to_file(smiles_list, fname):
    """Save SMILES strings to ``fname``, one per line.

    Args:
        smiles_list: Iterable of strings; each is written followed by a newline.
        fname: Destination path; the file is opened in ``'w'`` mode, so any
            existing content is replaced.
    """
    with open(fname, 'w') as out_file:
        out_file.writelines(entry + "\n" for entry in smiles_list)

In [20]:
# Collect the first whitespace-delimited token of every line in the raw file.
with open('ChEMBL_filtered', 'r') as f:
    smiles = [line.split()[0] for line in f]

In [21]:
# Parse and filter every entry of the ChEMBL file, keeping the SMILES that
# canonicalize_smiles_from_file returns; it prints its progress while running.
canon_smiles = canonicalize_smiles_from_file('ChEMBL_filtered')

0 lines processed.
100000 lines processed.
200000 lines processed.
300000 lines processed.
400000 lines processed.
500000 lines processed.
600000 lines processed.
700000 lines processed.
800000 lines processed.
900000 lines processed.
1000000 lines processed.
1100000 lines processed.
1179477 SMILES retrieved


In [17]:
# Single-column table holding the filtered SMILES strings.
chembl_filtered = pd.DataFrame(canon_smiles, columns=['smiles'])

In [19]:
# Translate each SMILES in canon_smiles with sf.encoder, preserving order.
chembl_selfies = []
for i, c_smiles in enumerate(canon_smiles):
    chembl_selfies.append(sf.encoder(c_smiles))
    # Progress marker every 10000 molecules.
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000


In [None]:
# Add the SELFIES strings as the first column, labelled with the selfies
# version. insert() modifies the frame in place and, with
# allow_duplicates=False, raises if a column with this label already exists.
selfies_column = 'selfies_' + sf.__version__
chembl_filtered.insert(0, selfies_column, chembl_selfies, allow_duplicates=False)

In [39]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout.
os.makedirs('filtered_data', exist_ok=True)
# The row index is written too (to_csv's default), as a leading unnamed column.
chembl_filtered.to_csv('filtered_data/chembl_filtered_' + sf.__version__ + '_.csv')

In [40]:
# Show the resulting table as the cell output.
chembl_filtered

Unnamed: 0,selfies_2.1.0,smiles
0,[C][C][=C][C][=C][C][=Branch1][Ring2][=C][Ring...,Cc1ccc2c(c1)sc1c(=O)[nH]c3ccc(C(=O)NCCCN(C)C)c...
1,[O][=C][Branch1][N][N][C][C][N][C][C][O][C][C]...,O=C(NCCN1CCOCC1)c1cc(C(F)(F)F)cc(C(F)(F)F)c1
2,[C][C][=Branch1][C][=O][C][N][C][=Branch1][C][...,CC(=O)CN1C(=O)C2CC(O)CN2C(=O)c2ccccc21
3,[C][O][C][=C][C][=C][C][Branch2][Ring2][#Branc...,COc1cccc(NC(=O)c2oc3ccccc3c2NC(=O)c2ccc3c(c2)O...
4,[C][O][C][=Branch1][C][=O][C][N][=N][N][Branch...,COC(=O)c1nnn(CC(=O)C(CC2CCNC2=O)NC(=O)C(C)NC(=...
...,...,...
1179472,[N][C][=Branch1][C][=O][C][C][C][=C][C][=C][C]...,NC(=O)CCc1ccc2c(c1)c(CCc1ccccc1)cn2-c1ccccc1
1179473,[C][C][=C][C][=C][Branch2][Ring1][S][S][=Branc...,Cc1ccc(S(=O)(=O)NC(=O)Nc2ccc(S(N)(=O)=O)nc2)cc1
1179474,[C][C][=Branch1][C][=O][N][C][C][Branch1][C][O...,CC(=O)N1CC(O)CC1C(=O)NC(CCCN=C(N)N)C(=O)c1nc2c...
1179475,[C][C][O][C][=C][C][=C][N][=C][Branch2][Ring2]...,CCOc1ccc2nc(N(CCCN(C)C)C(=O)c3ccc(S(=O)(=O)N4C...
