In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
import re

In [2]:
# Example atom-mapped reaction SMILES ("reactants>>products") used by the
# cells below to try out splitting and atom-map removal.
a = 'O[C:1](=[O:2])[c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1.[NH2:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12>>[C:1](=[O:2])([c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1)[NH:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12'

In [3]:
# Split the reaction SMILES into its reactant side and product side, then
# break each side into the individual '.'-separated molecule SMILES.
all_reactants, all_products = a.split('>>')

reactants, products = (
    side.split('.') for side in (all_reactants, all_products)
)
print(reactants)
print(products)

['O[C:1](=[O:2])[c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1', '[NH2:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12']
['[C:1](=[O:2])([c:3]1[cH:4][c:5]([N+:6](=[O:7])[O-:8])[c:9]([S:10][c:11]2[c:12]([Cl:13])[cH:14][n:15][cH:16][c:17]2[Cl:18])[s:19]1)[NH:20][c:21]1[cH:22][cH:23][cH:24][c:25]2[cH:26][n:27][cH:28][cH:29][c:30]12']


In [4]:
# Remove all Atom Map Number of the mols in a list
def remove_all_AtomMapNumber(list):
    """Strip atom-map numbers from a collection of SMILES strings.

    Each SMILES is parsed with RDKit, the ``molAtomMapNumber`` property is
    cleared on every atom that carries it, and the molecule is written back
    with ``Chem.MolToSmiles(mol, True)``.

    Parameters
    ----------
    list : iterable of str
        SMILES strings, possibly atom-mapped. (The parameter name shadows the
        builtin ``list``; it is kept unchanged so existing callers keep
        working.)

    Returns
    -------
    list of str
        One unmapped SMILES per input, in input order.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES (``Chem.MolFromSmiles``
        returns ``None`` in that case).
    """
    unmapped_mol = []
    for smi in list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending input instead of an opaque
            # "'NoneType' object has no attribute 'GetAtoms'".
            raise ValueError('Could not parse SMILES: {!r}'.format(smi))
        for atom in mol.GetAtoms():
            if atom.HasProp('molAtomMapNumber'):
                atom.ClearProp('molAtomMapNumber')
        unmapped_mol.append(Chem.MolToSmiles(mol, True))
    return unmapped_mol

In [5]:
# Strip the atom-map numbers from both sides of the reaction
# (reactants first, then products) and show the results.
reactants_unmapped, products_unmapped = map(
    remove_all_AtomMapNumber, (reactants, products)
)
print(reactants_unmapped)
print(products_unmapped)

['O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1', 'Nc1cccc2cnccc12']
['O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1']


In [6]:
# Merge reactants_unmapped and products_unmapped back into one reaction SMILES.
# Every product is kept (joined with '.'), not only the first one, so a
# multi-product reaction is not silently truncated. For a single product the
# result is identical to taking products_unmapped[0].
reactants_rxn = '.'.join(reactants_unmapped)
products_rxn = '.'.join(products_unmapped)

rxn = reactants_rxn + ">>" + products_rxn
print(rxn)

O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1.Nc1cccc2cnccc12>>O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1


In [8]:
# Adding AtomMapNumber by RXNMapper: map two unmapped example reactions.
from rxnmapper import RXNMapper

rxn_mapper = RXNMapper()
rxns = [
    'CC(C)S.CN(C)C=O.Fc1cccnc1F.O=C([O-])[O-].[K+].[K+]>>CC(C)Sc1ncccc1F',
    'C1COCCO1.CC(C)(C)OC(=O)CONC(=O)NCc1cccc2ccccc12.Cl>>O=C(O)CONC(=O)NCc1cccc2ccccc12',
]
results = rxn_mapper.get_attention_guided_atom_maps(rxns)

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [7]:
# Re-map the unmapped reaction built above with RXNMapper.
from rxnmapper import RXNMapper

rxn_mapper = RXNMapper()
# get_attention_guided_atom_maps expects a list of reaction SMILES; a bare
# string would be iterated character by character, so wrap the single
# reaction in a list. `results` is then a one-element list.
results = rxn_mapper.get_attention_guided_atom_maps([rxn])

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Sanity check: print the reactant SMILES with their atom-map numbers removed.
print(remove_all_AtomMapNumber(reactants))

['O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1', 'Nc1cccc2cnccc12']


In [10]:
# Second example reaction: the reactant side mixes unmapped species with one
# atom-mapped molecule; the product side is a single mapped molecule.
b = 'CCO.[OH-].[OH-].[Pd+2].c1ccc(C[N:1]2[CH2:2][CH2:3][CH:4]([N:5]([CH2:6][CH3:7])[c:8]3[n:9][c:10]([F:11])[cH:12][cH:13][c:14]3[NH:15][CH:16]([CH3:17])[CH3:18])[CH2:19][CH2:20]2)cc1>>[NH:1]1[CH2:2][CH2:3][CH:4]([N:5]([CH2:6][CH3:7])[c:8]2[n:9][c:10]([F:11])[cH:12][cH:13][c:14]2[NH:15][CH:16]([CH3:17])[CH3:18])[CH2:19][CH2:20]1'
# NOTE: this rebinds all_reactants / all_products, which an earlier cell set
# from reaction `a`.
all_reactants, all_products = b.split('>>')

In [11]:
# Parse each '.'-separated SMILES on the reactant side into an RDKit Mol.
# NOTE(review): the result is stored in `products` although it is built from
# `all_reactants` -- presumably exploratory; confirm which side was intended.
products = list(map(Chem.MolFromSmiles, all_reactants.split('.')))
products

[<rdkit.Chem.rdchem.Mol at 0x7fc3f5931da0>,
 <rdkit.Chem.rdchem.Mol at 0x7fc3f5931d50>,
 <rdkit.Chem.rdchem.Mol at 0x7fc3f5931d00>,
 <rdkit.Chem.rdchem.Mol at 0x7fc3f5931df0>,
 <rdkit.Chem.rdchem.Mol at 0x7fc3f5931e40>]

In [12]:
# Write every parsed molecule back to SMILES (isomeric information kept)
# and print one per line.
for prod in products:
    prod_smi = Chem.MolToSmiles(prod, isomericSmiles=True)
    print(prod_smi)

CCO
[OH-]
[OH-]
[Pd+2]
c1ccc(C[N:1]2[CH2:2][CH2:3][CH:4]([N:5]([CH2:6][CH3:7])[c:8]3[n:9][c:10]([F:11])[cH:12][cH:13][c:14]3[NH:15][CH:16]([CH3:17])[CH3:18])[CH2:19][CH2:20]2)cc1


In [15]:
# Create new df from old (minor processing)
#
# For every row of `dataSetB`, each fully atom-mapped product becomes its own
# single-product reaction; reactants that contribute no mapped atom to that
# product are dropped.
#
# NOTE(review): `dataSetB` is not created in any cell visible here; it must
# already exist in the kernel. Its columns are read by position:
#   row[0] -> DataFrame index (used only for progress output)
#   row[1] -> stored as 'class'
#   row[2] -> stored as 'id'
#   row[3] -> atom-mapped reaction SMILES, split on '>>'
#   row[4] -> container of reactant positions to keep (tested with `i in row[4]`)

# Captures the map number in "...:<digits>]". Written as a raw string; the
# previous pattern '\:([[0-9]+)\]' relied on an invalid string escape and
# accidentally allowed '[' inside the digit class.
MAP_NUMBER_PATTERN = re.compile(r':([0-9]+)\]')

classes = []
ids = []
rxn_smiles = []
prod_smiles = []
for row in dataSetB.itertuples():
    # Progress report every 5000 rows.
    if row[0] % 5000 == 0:
        print('On index {:d}'.format(int(row[0])))

    all_reactants, all_products = row[3].split('>>')
    # Parse every '.'-separated product fragment into an RDKit Mol.
    products = [Chem.MolFromSmiles(smi) for smi in all_products.split('.')]

    # Multiple products = enumerate
    for prod in products:

        # MolFromSmiles returns None for an unparsable fragment; such a
        # fragment cannot serve as a mapped product, so skip it.
        if prod is None:
            continue

        # Make sure all have atom mapping; otherwise move on to the next product.
        if not all(atom.HasProp('molAtomMapNumber') for atom in prod.GetAtoms()):
            continue

        # The second positional argument of MolToSmiles is isomericSmiles
        # (keep stereo/isotope information); it does not kekulize the output.
        prod_smi = Chem.MolToSmiles(prod, True)

        # Re-parse reactants for each product so we can clear maps.
        # Only the reactants whose position is listed in row[4] are considered.
        reactants = [Chem.MolFromSmiles(smi) for (i, smi) in enumerate(
            all_reactants.split('.')) if i in row[4]]

        # Get rid of reactants when they don't contribute to this prod.
        # prod_maps is the set of map numbers (as strings) present in the product.
        prod_maps = set(MAP_NUMBER_PATTERN.findall(prod_smi))
        reactants_smi_list = []
        for mol in reactants:
            # A reactant counts as used when at least one of its mapped atoms
            # appears in the product. The loop variable is `atom`, not `a`, so
            # the notebook-level reaction string `a` is not overwritten.
            used = False
            for atom in mol.GetAtoms():
                if atom.HasProp('molAtomMapNumber'):
                    if atom.GetProp('molAtomMapNumber') in prod_maps:
                        used = True
                    else:
                        # Map number absent from this product: clear it.
                        atom.ClearProp('molAtomMapNumber')
            if used:
                reactants_smi_list.append(Chem.MolToSmiles(mol, True))

        # Join all contributing reactants with '.'.
        reactants_smi = '.'.join(reactants_smi_list)

        # Was this just a spectator? Some examples are HCl>>HCl.
        # Such non-reacting species are dropped.
        if reactants_smi == prod_smi:
            continue

        # Append to ongoing list (carry the class and id over from the row).
        classes.append(row[1])
        ids.append(row[2])
        rxn_smiles.append('{}>>{}'.format(reactants_smi, prod_smi))
        # Save non-mapped prod too. Done last because clearing the maps
        # mutates `prod` in place.
        for atom in prod.GetAtoms():
            atom.ClearProp('molAtomMapNumber')
        prod_smiles.append(Chem.MolToSmiles(prod, True))

data = pd.DataFrame({'class': classes,
                     'id': ids,
                     'rxn_smiles': rxn_smiles,
                     'prod_smiles': prod_smiles})

On index 0




On index 5000
On index 10000
On index 15000
On index 20000
On index 25000
On index 30000
On index 35000
On index 40000
On index 45000


In [9]:
# Number of rows in `data` per value of the 'class' column.
data['class'].value_counts()

1     15247
2     11906
6      8237
3      5666
7      4614
9      1834
4       909
8       811
5       672
10      230
Name: class, dtype: int64

In [19]:
# Find most popular product smiles (probably frags/salts)
from collections import Counter

# Tally how many rows share each unmapped product SMILES.
prod_smi_counter = Counter()
prod_smi_counter.update(data['prod_smiles'])
# most_common(25) returns the 25 most frequent (smiles, count) pairs.
print(prod_smi_counter.most_common(25))

[('[Br-]', 48), ('[I-]', 26), ('[Cl-]', 19), ('Cl', 17), ('CC(C)(C)OC(=O)N1CCC(CO)CC1', 6), ('Cc1cccc(C2CC2)c1Oc1nnc(Cl)cc1O', 4), ('Cc1ccc(-c2ccccc2C#N)cc1', 4), ('COc1cc2nccc(Oc3ccc(N)cc3)c2cc1OC', 4), ('COC(=O)c1cccc(N)c1N', 4), ('CN(C)CCn1cc(B2OC(C)(C)C(C)(C)O2)cn1', 3), ('OC(c1ccccc1)c1ccccc1', 3), ('COc1ccc(-c2ccccc2)cc1', 3), ('O=C(NC(CC1C(=O)Nc2ccccc21)C(=O)O)c1ccc(Cl)cc1', 3), ('c1ccc(Pc2ccccc2)cc1', 3), ('Cc1noc(NS(=O)(=O)c2cc(Cl)ccc2Cl)c1Br', 3), ('Cc1ccc(N)c(N)n1', 3), ('O=C(NCCN1CCOCC1)c1ccc(Cl)cc1', 3), ('CC(C)(C)OC(=O)N1CCC(CCO)CC1', 3), ('CC(C)(C)OC(=O)N1CCN(Cc2ccc(Br)cc2)CC1', 3), ('O=C(O)CCCCCNC(=O)C(F)(F)F', 3), ('CC(C)(C)OC(=O)c1ccc(CBr)cc1', 3), ('CC(C)(C)OC(=O)N[C@@H](CC(=O)N1CCn2c(nnc2C(F)(F)F)C1)Cc1cc(F)c(F)cc1F', 3), ('c1ccc(C(c2ccccc2)(c2ccccc2)n2ccnc2)cc1', 3), ('Fc1cc(Br)ccc1OCc1ccccc1', 3), ('Nc1cc(Br)ccc1O', 3)]


In [23]:
# Preview the first rows of `data`.
# NOTE(review): the captured output lists 'Unnamed: 0', 'prod_smiles_pop' and
# 'keep' columns, which the cells above this one do not create -- presumably
# the frame was modified or reloaded from CSV before this ran; confirm the
# execution order.
data.head()

Unnamed: 0,class,id,rxn_smiles,prod_smiles,prod_smiles_pop,keep
0,6,US05849732,O=C(OCc1ccccc1)[NH:1][CH2:2][CH2:3][CH2:4][CH2...,COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C)c1O,1,True
1,2,US20120114765A1,O[C:1](=[O:2])[c:3]1[cH:4][c:5]([N+:6](=[O:7])...,O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(C...,1,True
2,1,US08003648B2,O=[CH:1][c:2]1[cH:3][cH:4][c:5](-[c:6]2[n:7][c...,CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N4...,1,True
3,1,US09045475B2,O=[C:1]([CH2:2][F:3])[CH2:4][F:5].[CH3:6][C:7]...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,1,True
4,2,US08188098B2,Cl[C:1](=[O:2])[O:3][CH:4]1[CH2:5][CH2:6][CH2:...,CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC2...,1,True


In [22]:
# Record how often each product SMILES occurs, then flag the rows to keep:
# the product must occur fewer than MAX_PROD_COUNT times and its SMILES must
# be at least MIN_PROD_SMILES_LEN characters long (presumably to drop
# frequent, tiny fragments such as counter-ions).
MAX_PROD_COUNT = 10
MIN_PROD_SMILES_LEN = 5

data['prod_smiles_pop'] = [prod_smi_counter[smi] for smi in data['prod_smiles']]
# Columns are addressed by name instead of by itertuples() position, so the
# filter keeps working if the column order changes (for example an extra
# leading column after a CSV round-trip).
data['keep'] = [
    pop < MAX_PROD_COUNT and len(smi) >= MIN_PROD_SMILES_LEN
    for pop, smi in zip(data['prod_smiles_pop'], data['prod_smiles'])
]

# Class distribution of the rows that survive the filter.
data.loc[data['keep'], 'class'].value_counts()

1     15150
2     11893
6      8237
3      5661
7      4613
9      1834
4       909
8       811
5       672
10      230
Name: class, dtype: int64

In [12]:
# Write only the rows flagged keep=True to disk.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as an unnamed first column (it reappears as 'Unnamed: 0' when the
# file is read back with default settings) -- confirm downstream readers
# expect that.
data.loc[data['keep']].to_csv('../data/data_processed1.csv')