In [None]:
# Step up one directory (presumably to the repository root — confirm) so that
# the relative './data/' output path written at the end resolves from there.
%cd ..

# Prepare USPTO-50k for single-step retrosynthesis

In [2]:
# Load the dataset and shuffle it in one go.
import pandas as pd

# The processed reaction table is read straight from the retrosim repository
# (the same URL can be used to download the file manually). The first CSV
# column becomes the index; sampling the full fraction with a fixed
# random_state shuffles all rows reproducibly.
df = pd.read_csv(
    'https://github.com/connorcoley/retrosim/raw/master/retrosim/data/data_processed.csv',
    index_col=0,
).sample(frac=1, random_state=42)
df.head()

Unnamed: 0,class,id,prod_smiles,rxn_smiles,prod_smiles_pop,keep
10521,1,US20100317582A1,C[C@@H](NC1CCCC(c2ccccn2)C1)c1cccc2ccccc12,O=[C:1]1[CH2:2][CH2:3][CH2:4][CH:5]([c:6]2[cH:...,1,True
25492,1,US05932582,Cc1cc(OCC(N)=O)ccc1NC(=O)OC(C)(C)C,Br[CH2:1][C:2]([NH2:3])=[O:4].[CH3:5][c:6]1[cH...,1,True
46997,7,US05266570,Nc1cnc(NC2CCN(CC34CC(c5ccccc53)c3ccccc34)CC2)nc1N,O=[N+:1]([O-])[c:2]1[c:3]([NH2:4])[n:5][c:6]([...,1,True
15580,6,US20080181866A1,O=C(O)C1CS[C@H](C2CCCNC2)N1C(=O)OCc1ccccc1,CC(C)(C)OC(=O)[N:1]1[CH2:2][CH2:3][CH2:4][CH:5...,1,True
6961,7,US07229987B2,CN(C)CCCOc1ccc(N)cc1,O=[N+:1]([O-])[c:2]1[cH:3][cH:4][c:5]([O:6][CH...,1,True


In [3]:
import time

import numpy as np


def split_data_df(data, val_frac=0.1, test_frac=0.1, shuffle=False, seed=None):
    """Label every row of ``data`` as train/valid/test, stratified by class.

    edited from https://github.com/connorcoley/retrosim/blob/master/retrosim/data/get_data.py

    For each distinct value of ``data['class']`` the rows of that class are
    (optionally shuffled and then) cut into three consecutive chunks: the
    first ``1 - val_frac - test_frac`` share becomes ``'train'``, the next
    ``val_frac`` share ``'valid'`` and the remainder ``'test'``.

    Args:
        data: DataFrame with a ``'class'`` column. It is modified in place:
            a ``'dataset'`` column is added (or overwritten for the rows that
            get assigned). Rows are addressed by index label, so the index is
            expected to be unique.
        val_frac: Fraction of each class assigned to the validation split.
        test_frac: Fraction of each class assigned to the test split.
        shuffle: If True, shuffle the rows of each class before cutting.
            If False, the current row order of ``data`` decides the split.
        seed: Seed for NumPy's global random generator; only used when
            ``shuffle`` is True. ``None`` seeds from the current time.

    Returns:
        None. Per-class row counts and the final split sizes are printed.
    """
    if shuffle:
        # Seed the global NumPy generator (time-based when no seed is given).
        np.random.seed(int(time.time()) if seed is None else seed)

    # Create the target column up front with an object dtype so that string
    # labels can be written without dtype upcasting, and so that an empty
    # frame no longer fails on the final value_counts().
    if 'dataset' not in data.columns:
        data['dataset'] = np.full(len(data), np.nan, dtype=object)

    # Go through each class
    for class_ in sorted(np.unique(data['class'])):
        # Copy the labels into a fresh NumPy array: a pandas Index is
        # immutable, so shuffling it in place raises a TypeError.
        indices = np.array(data.loc[data['class'] == class_].index)
        n_rows = len(indices)
        print('{} rows with class value {}'.format(n_rows, class_))

        if shuffle:
            np.random.shuffle(indices)

        train_end = int((1.0 - val_frac - test_frac) * n_rows)
        val_end = int((1.0 - test_frac) * n_rows)

        chunks = (
            ('train', indices[:train_end]),
            ('valid', indices[train_end:val_end]),
            ('test', indices[val_end:]),
        )
        for split_name, members in chunks:
            # Skip empty chunks (tiny classes) instead of indexing with nothing.
            if len(members):
                data.loc[members, 'dataset'] = split_name
    print(data['dataset'].value_counts())

In [4]:
# Split the data 80/10/10 within each class, just like in retrosim (the seed
# used there is not known). shuffle stays off: the original author noted that
# enabling it raised an error, and the rows were already shuffled when the
# dataset was loaded. With shuffle=False the seed argument is not used.
#from retrosim.data.get_data import split_data_df
split_data_df(df, shuffle=False, seed=42) # 80/10/10 within each class
df.head()

15151 rows with class value 1
11896 rows with class value 2
5662 rows with class value 3
909 rows with class value 4
672 rows with class value 5
8237 rows with class value 6
4614 rows with class value 7
811 rows with class value 8
1834 rows with class value 9
230 rows with class value 10
train    40008
test      5007
valid     5001
Name: dataset, dtype: int64


Unnamed: 0,class,id,prod_smiles,rxn_smiles,prod_smiles_pop,keep,dataset
10521,1,US20100317582A1,C[C@@H](NC1CCCC(c2ccccn2)C1)c1cccc2ccccc12,O=[C:1]1[CH2:2][CH2:3][CH2:4][CH:5]([c:6]2[cH:...,1,True,train
25492,1,US05932582,Cc1cc(OCC(N)=O)ccc1NC(=O)OC(C)(C)C,Br[CH2:1][C:2]([NH2:3])=[O:4].[CH3:5][c:6]1[cH...,1,True,train
46997,7,US05266570,Nc1cnc(NC2CCN(CC34CC(c5ccccc53)c3ccccc34)CC2)nc1N,O=[N+:1]([O-])[c:2]1[c:3]([NH2:4])[n:5][c:6]([...,1,True,train
15580,6,US20080181866A1,O=C(O)C1CS[C@H](C2CCCNC2)N1C(=O)OCc1ccccc1,CC(C)(C)OC(=O)[N:1]1[CH2:2][CH2:3][CH2:4][CH:5...,1,True,train
6961,7,US07229987B2,CN(C)CCCOc1ccc(N)cc1,O=[N+:1]([O-])[c:2]1[cH:3][cH:4][c:5]([O:6][CH...,1,True,train


In [5]:
import hashlib
def create_hash(pd_row):
    """Return the hexadecimal MD5 digest of a row's JSON serialisation.

    ``pd_row`` only needs a ``to_json()`` method returning a string
    (e.g. a pandas Series); that string is encoded and hashed.
    """
    row_json = pd_row.to_json()
    digest = hashlib.md5(row_json.encode())
    return digest.hexdigest()

# Give every reaction an '_id': the MD5 hash of the row's contents as they are
# at this point. The guard skips hashing when an '_id' column already exists
# (for example when this cell is run a second time).
if '_id' not in df.columns:
    df['_id'] = df.apply(create_hash, axis=1)

In [6]:
# Switch to the column names used in the rest of the notebook; both renames
# are independent, so they are applied in a single in-place call.
df.rename(
    columns={'rxn_smiles': 'reaction_smiles', 'dataset': 'split'},
    inplace=True,
)

In [7]:
# Cut every reaction SMILES at '>' into its three parts and transpose the
# result, so that each part ends up as one tuple with an entry per reaction.
reactants, spectators, products = zip(*(rxn.split('>') for rxn in df['reaction_smiles']))
df['reactants'], df['spectators'], df['products'] = reactants, spectators, products

In [8]:
# Extract one reaction template per row with rdchiral, in parallel.

from multiprocessing import Pool
from rdchiral.template_extractor import extract_from_reaction

# Each row is handed to the extractor as a plain column -> value dict.
reaction_dicts = [record.to_dict() for _, record in df.iterrows()]

# 32 worker processes; Pool.map returns results in the order of its inputs.
with Pool(processes=32) as pool:
    res = pool.map(extract_from_reaction, reaction_dicts)

In [9]:
# Sanity check: the results must line up with the rows of df, i.e. every
# result's 'reaction_id' equals the '_id' of the row at the same position.
assert list(df._id) == [r['reaction_id'] for r in res]
# Store each extracted template (reaction SMARTS) next to its reaction.
reaction_smarts = [r['reaction_smarts'] for r in res]
df['reaction_smarts'] = reaction_smarts

In [10]:
# canonicalize reactant (optionally product_can_from_reaction)
from mhnreact.retroeval import canonicalize_reactants
# One canonicalised reactant string per reaction. can_steps=2 is passed
# through to the project helper; see mhnreact.retroeval for its meaning.
df['reactants_can'] = [canonicalize_reactants(r, can_steps=2) for r in df['reactants']]

In [11]:
def filter_by_dict(df, fil):
    """Keep only the rows of ``df`` that satisfy every condition in ``fil``.

    ``fil`` maps a column name to either a single allowed value or a list of
    allowed values. Conditions are applied one after another, so a row must
    pass all of them. With an empty ``fil`` the input frame is returned as is.
    """
    selected = df
    for column, wanted in fil.items():
        # A bare value is treated like a one-element list of allowed values.
        allowed = wanted if isinstance(wanted, list) else [wanted]
        selected = selected[selected[column].isin(allowed)]
    return selected

In [12]:
import re

# Matches an atom-map number (':' followed by digits) that sits directly in
# front of a closing ']'. Written as a raw string: in a plain literal '\d' is
# an invalid escape sequence, which newer Python versions warn about.
mre = r':\d+(?=])'
# Strip the atom-map numbers so that templates differing only in their
# numbering collapse to the same string.
unmapped = [re.sub(mre,'',r) for r in df['reaction_smarts']]
df['unmapped_template'] = unmapped

# Number the distinct unmapped templates in order of first appearance, walking
# through train first, then valid, then test. Templates seen in the training
# split therefore receive the lowest labels.
unmapped2idx = {}
for split in ['train', 'valid', 'test']:
    sub = filter_by_dict(df, {'split': split})
    for u in sub['unmapped_template']:
        # len() is evaluated before insertion, so a new template gets the
        # next free integer; known templates keep their label.
        unmapped2idx.setdefault(u, len(unmapped2idx))

# Every row whose split is train/valid/test now has an entry in the mapping.
df['label'] = [unmapped2idx[u] for u in df['unmapped_template']]

In [13]:
# Show the first row as a quick check of the columns added so far.
df.iloc[0]

class                                                                1
id                                                     US20100317582A1
prod_smiles                 C[C@@H](NC1CCCC(c2ccccn2)C1)c1cccc2ccccc12
reaction_smiles      O=[C:1]1[CH2:2][CH2:3][CH2:4][CH:5]([c:6]2[cH:...
prod_smiles_pop                                                      1
keep                                                              True
split                                                            train
_id                                   20f7e94253448bd4e0c5edb1c421eea0
reactants            O=[C:1]1[CH2:2][CH2:3][CH2:4][CH:5]([c:6]2[cH:...
spectators                                                            
products             [CH:1]1([NH:15][C@H:14]([CH3:13])[c:16]2[cH:17...
reaction_smarts      [C:2]-[CH;D3;+0:1](-[C:3])-[NH;D2;+0:5]-[C:4]>...
reactants_can            C[C@@H](N)c1cccc2ccccc12.O=C1CCCC(c2ccccn2)C1
unmapped_template    [C]-[CH;D3;+0](-[C])-[NH;D2;+0]-[C]>>O=[C;H0;D...
label 

In [14]:
# all the relevant data is now in here
# Select the columns that are written to the output file in the next cell.
df_rel = df[['id','class','prod_smiles','reactants_can','split', 'reaction_smarts', 'label']]

In [15]:
# Write the selection as a gzip-compressed CSV (pandas infers the compression
# from the '.gz' suffix). The DataFrame index is written as the first column,
# and the relative path is resolved against the current working directory.
df_rel.to_csv('./data/USPTO_50k_MHN_prepro_recre.csv.gz') # note: the file size differs from the original, e.g. because the time-split is missing

In [None]:
#df.to_csv('./data/USPTO_50k_MHN_prepro_allcol.csv') # all columns