In [1]:
from __future__ import print_function, absolute_import
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import _pickle as cPickle
import numpy as np
import re

In [2]:
# Load Schneider's 50k reaction dataset (dataSetB) from CSV.
dataSetB = pd.read_csv('../data/from_schneider/dataSetB.csv')

# The 'reactantSet_NameRxn' column is read as text; turn each cell back into
# the Python object its text represents (presumably a collection of reactant
# indices -- TODO confirm against the CSV).
# SECURITY NOTE(review): eval() executes whatever expression the file contains.
# Only run this on a trusted copy of the dataset. If every cell is a plain
# literal, ast.literal_eval would be a safer drop-in -- verify the stored
# format first (a repr such as set([...]) is NOT accepted by literal_eval).
parsed_reactant_sets = []
for raw_cell in dataSetB['reactantSet_NameRxn']:
    parsed_reactant_sets.append(eval(raw_cell))
dataSetB['reactantSet_NameRxn'] = parsed_reactant_sets

In [3]:
# Number of reactions per reaction class in the raw dataset
# (the counts are displayed as this cell's output).
rxn_class_column = dataSetB.loc[:, 'rxn_Class']
rxn_class_column.value_counts()

1     15140
2     11889
6      8232
3      5654
7      4610
9      1842
4       909
8       821
5       672
10      231
Name: rxn_Class, dtype: int64

In [4]:
# Create new df from old (minor processing)
#
# For every reaction row in dataSetB, emit one record per fully atom-mapped
# product molecule. Each record holds:
#   class       - value at row position 1 (the reaction class)
#   id          - value at row position 2
#   rxn_smiles  - 'reactants>>product', restricted to the reactants that share
#                 at least one atom-map number with this product
#   prod_smiles - the same product with all atom-map numbers removed

# Matches the ':<number>]' tail of a mapped bracket atom, e.g. '[CH3:12]' -> '12'.
# Raw string: ':' and ']' need no string-level escaping, and the character
# class is digits only.
ATOM_MAP_RE = re.compile(r':([0-9]+)\]')

classes = []
ids = []
rxn_smiles = []
prod_smiles = []
for row in dataSetB.itertuples():
    # itertuples() yields (index, first column, second column, ...);
    # the columns are read by position here.
    row_index, rxn_class, rxn_id, mapped_rxn, reactant_idxs = row[:5]

    if row_index % 5000 == 0:
        print('On index {:d}'.format(int(row_index)))

    all_reactants, all_products = mapped_rxn.split('>>')
    products = [Chem.MolFromSmiles(smi) for smi in all_products.split('.')]

    # Only reactants whose position is listed in reactant_idxs are considered.
    # The selection is the same for every product of this row, so do it once.
    selected_reactant_smis = [
        smi for (i, smi) in enumerate(all_reactants.split('.'))
        if i in reactant_idxs
    ]

    # Multiple products = enumerate
    for prod in products:

        # Make sure all have atom mapping
        if not all(a.HasProp('molAtomMapNumber') for a in prod.GetAtoms()):
            continue

        prod_smi = Chem.MolToSmiles(prod, True)

        # Atom-map numbers (as strings) that appear in this product
        prod_maps = set(ATOM_MAP_RE.findall(prod_smi))

        # Re-parse reactants for each product so we can clear maps:
        # clearing mutates the molecule, so fresh copies are needed each time.
        reactants = [Chem.MolFromSmiles(smi) for smi in selected_reactant_smis]

        # Get rid of reactants when they don't contribute to this prod
        reactants_smi_list = []
        for mol in reactants:
            used = False
            for a in mol.GetAtoms():
                if a.HasProp('molAtomMapNumber'):
                    if a.GetProp('molAtomMapNumber') in prod_maps:
                        used = True
                    else:
                        # Map number belongs to a different product: drop it
                        a.ClearProp('molAtomMapNumber')
            if used:
                reactants_smi_list.append(Chem.MolToSmiles(mol, True))

        reactants_smi = '.'.join(reactants_smi_list)

        # Was this just a spectator? Some examples are HCl>>HCl
        if reactants_smi == prod_smi:
            continue

        # Append to ongoing list
        classes.append(rxn_class)
        ids.append(rxn_id)
        rxn_smiles.append('{}>>{}'.format(reactants_smi, prod_smi))

        # Save non-mapped prod too (explicit loop: this is done for its
        # side effect on the atoms, not to build a list)
        for a in prod.GetAtoms():
            a.ClearProp('molAtomMapNumber')
        prod_smiles.append(Chem.MolToSmiles(prod, True))

# Pin the column order explicitly so that positional access to rows
# (e.g. via itertuples) does not depend on how the running pandas/Python
# version orders the keys of a dict.
data = pd.DataFrame({'class': classes,
                     'id': ids,
                     'rxn_smiles': rxn_smiles,
                     'prod_smiles': prod_smiles},
                    columns=['class', 'id', 'rxn_smiles', 'prod_smiles'])

On index 0
On index 5000
On index 10000
On index 15000
On index 20000
On index 25000
On index 30000
On index 35000
On index 40000
On index 45000


In [5]:
# Records per reaction class after processing (displayed as the cell output).
# 'class' is a Python keyword, so the column is selected by label.
data.loc[:, 'class'].value_counts()

1     15247
2     11906
6      8237
3      5666
7      4614
9      1834
4       909
8       811
5       672
10      230
Name: class, dtype: int64

In [6]:
# Count how often each unmapped product SMILES occurs and print the 25 most
# frequent ones (the top entries are probably frags/salts).
from collections import Counter
prod_smi_counter = Counter(data['prod_smiles'].tolist())
most_frequent_products = prod_smi_counter.most_common(25)
print(most_frequent_products)

[('[Br-]', 48), ('[I-]', 26), ('[Cl-]', 19), ('Cl', 17), ('CC(C)(C)OC(=O)N1CCC(CO)CC1', 6), ('Cc1cccc(C2CC2)c1Oc1nnc(Cl)cc1O', 4), ('Cc1ccc(-c2ccccc2C#N)cc1', 4), ('COc1cc2nccc(Oc3ccc(N)cc3)c2cc1OC', 4), ('COC(=O)c1cccc(N)c1N', 4), ('CN(C)CCn1cc(B2OC(C)(C)C(C)(C)O2)cn1', 3), ('OC(c1ccccc1)c1ccccc1', 3), ('COc1ccc(-c2ccccc2)cc1', 3), ('O=C(NC(CC1C(=O)Nc2ccccc21)C(=O)O)c1ccc(Cl)cc1', 3), ('c1ccc(Pc2ccccc2)cc1', 3), ('Cc1noc(NS(=O)(=O)c2cc(Cl)ccc2Cl)c1Br', 3), ('Cc1ccc(N)c(N)n1', 3), ('O=C(NCCN1CCOCC1)c1ccc(Cl)cc1', 3), ('CC(C)(C)OC(=O)N1CCC(CCO)CC1', 3), ('CC(C)(C)OC(=O)N1CCN(Cc2ccc(Br)cc2)CC1', 3), ('O=C(O)CCCCCNC(=O)C(F)(F)F', 3), ('CC(C)(C)OC(=O)c1ccc(CBr)cc1', 3), ('CC(C)(C)OC(=O)N[C@@H](CC(=O)N1CCn2c(nnc2C(F)(F)F)C1)Cc1cc(F)c(F)cc1F', 3), ('c1ccc(C(c2ccccc2)(c2ccccc2)n2ccnc2)cc1', 3), ('Fc1cc(Br)ccc1OCc1ccccc1', 3), ('Nc1cc(Br)ccc1O', 3)]


In [7]:
# Flag the records worth keeping and show the per-class counts of those kept.
#
# prod_smiles_pop: how many records share this record's unmapped product SMILES.
# keep:            True when the product is neither very common nor very short
#                  (very common / very short products are probably frags/salts).
#
# Columns are addressed by name rather than by itertuples() position, because
# the position of 'prod_smiles' within a row tuple depends on how the frame's
# columns happen to be ordered.
MAX_PROD_SMILES_COUNT = 10   # keep only products occurring fewer times than this
MIN_PROD_SMILES_LEN = 5      # keep only products whose SMILES has at least this many characters

data['prod_smiles_pop'] = [prod_smi_counter[smi] for smi in data['prod_smiles']]
data['keep'] = (
    (data['prod_smiles_pop'] < MAX_PROD_SMILES_COUNT)
    & (data['prod_smiles'].str.len() >= MIN_PROD_SMILES_LEN)
)
# Single .loc call (row mask + column label) instead of chained indexing
data.loc[data['keep'], 'class'].value_counts()

1     15150
2     11893
6      8237
3      5661
7      4613
9      1834
4       909
8       811
5       672
10      230
Name: class, dtype: int64

In [8]:
# Write only the records flagged 'keep' to disk (the index is written too).
kept_records = data[data['keep']]
kept_records.to_csv('../data/data_processed.csv')