# Clean the raw USPTO dataset 

The raw dataset can be obtained from https://doi.org/10.6084/m9.figshare.5104873  


In [1]:
import multiprocessing
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rxnrep.dataset.rdkit_utils import (
    check_all_reactions_atom_mapped, check_all_reactions_bonds_mapped,
    adjust_reagents, adjust_atom_map_number, get_reaction_bond_change,
    set_all_H_to_explicit, edit_molecule, MoleculeCreationError,
    AtomMapNumberError, get_atom_property_as_dict)
from rxnrep.utils import to_path
from rdkit import RDLogger
from rdkit.Chem import KekulizeException, AtomValenceException, AtomKekulizeException
from typing import Union, Tuple

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress rdkit warnings

In [2]:
def read_file(filename):
    """
    Read the smiles reactions stored in a tab-separated file.

    The file is loaded with pandas and must provide a `ReactionSmiles` column.
    For each entry, only the text before the first whitespace is kept, which
    removes a trailing part such as `f|0.1.2|`.

    Args:
        filename: path to the tab-separated file

    Returns:
        list of smiles reactions, one per row of the file
    """
    table = pd.read_csv(filename, sep="\t")

    reactions = []
    for entry in table["ReactionSmiles"].values.tolist():
        # keep the reaction itself and drop whatever follows the first whitespace
        reactions.append(entry.split()[0])

    return reactions

In [3]:
def plot_smiles_reaction(reaction, filename):
    """
    Plot a smiles reaction to file.

    Args:
        reaction: smiles representation of the reaction to draw
        filename: path of the image file to write
    """
    # `Draw` is a submodule of `rdkit.Chem` and is not loaded by
    # `from rdkit import Chem` alone, so `Chem.Draw` can fail with an
    # AttributeError. Import it explicitly where it is needed.
    from rdkit.Chem import Draw

    rxn = AllChem.ReactionFromSmarts(reaction, useSmiles=True)
    image = Draw.ReactionToImage(rxn)
    image.save(filename)

In [4]:
def get_bond_change(
        reaction: str
) -> Tuple[Union[Tuple[str, str], None], Union[None, str]]:
    """
    Get bond changes of a reaction.

    The reaction is processed in three steps:

    1. adjust the reagents of the reaction (`adjust_reagents`)
    2. adjust the atom map numbers of the reactants and the products
       (`adjust_atom_map_number`)
    3. get the bond changes between the reactants and the products
       (`get_reaction_bond_change`, using mapped atom indices)

    Args:
        reaction: smiles representation of a reaction

    Returns:
        result: `(rxn, bond_changes)`, where `rxn` is the smiles reaction written
            from the adjusted reactants and products, and `bond_changes` is a
            string in which the items of each bond change are joined by `-` and
            the bond changes are joined by `;`. `None` if step 1 or step 2 failed.
        error: error message, `None` if succeeded
    """

    # Step 1, adjust reagents
    try:
        rxn_smi = adjust_reagents(reaction)
    except (MoleculeCreationError, AtomMapNumberError) as e:
        return None, str(e).rstrip()

    # Step 2, adjust atom mapping
    # (only `adjust_atom_map_number` is guarded: it is the call expected to raise
    # the `AtomMapNumberError` handled here)
    reactants_smi, reagents_smi, products_smi = rxn_smi.strip().split(">")
    reactants = Chem.MolFromSmiles(reactants_smi)
    products = Chem.MolFromSmiles(products_smi)
    try:
        reactants, products = adjust_atom_map_number(reactants, products)
    except AtomMapNumberError as e:
        return None, str(e).rstrip()

    # Step 3, get bond changes
    bond_changes = get_reaction_bond_change(reactants,
                                            products,
                                            use_mapped_atom_index=True)
    bond_changes = ";".join("-".join(str(i) for i in x) for x in bond_changes)

    # reagents are written back unchanged, as returned by step 1
    rxn = ">".join([
        Chem.MolToSmiles(reactants), reagents_smi,
        Chem.MolToSmiles(products)
    ])

    return (rxn, bond_changes), None

In [5]:
def canonicalize_smiles_reaction(
        reaction: str) -> Tuple[Union[str, None], Union[None, str]]:
    """
    Canonicalize a smiles reaction so that its reactants and products have the
    same composition.

    The canonicalization is carried out in three steps:

    1. remove from the reactants the molecules that have none of their atoms
       present in the products
    2. adjust the atom mapping between the reactants and the products, and add an
       atom map number to reactant atoms that do not have one (even though there
       is no corresponding atom in the products)
    3. create new products by editing the reactants: remove the bonds that are in
       the reactants but not in the products, and add the bonds that are in the
       products but not in the reactants

    The reactants and the new products are written to smiles after being passed
    through `set_all_H_to_explicit`.

    Args:
        reaction: smiles representation of a reaction

    Returns:
        reaction: canonicalized smiles reaction, `None` if canonicalization failed
        error: error message, `None` if canonicalization succeeded
    """

    # Step 1, adjust reagents
    try:
        adjusted = adjust_reagents(reaction)
    except (MoleculeCreationError, AtomMapNumberError) as err:
        return None, str(err).rstrip()

    # Step 2, adjust atom mapping
    try:
        lhs_smi, reagents_smi, rhs_smi = adjusted.strip().split(">")
        reactants = Chem.MolFromSmiles(lhs_smi)
        products = Chem.MolFromSmiles(rhs_smi)
        reactants, products = adjust_atom_map_number(reactants, products)
    except AtomMapNumberError as err:
        return None, str(err).rstrip()

    # Step 3, create new products by applying the bond changes to the reactants
    try:
        changes = get_reaction_bond_change(reactants, products)
        edited_products = edit_molecule(reactants, changes,
                                        get_atom_property_as_dict(products))
    except (KekulizeException, AtomValenceException,
            AtomKekulizeException) as err:
        return None, str(err).rstrip()

    # write the canonicalized reaction to smiles (reagents are kept as is)
    lhs_out, rhs_out = [
        Chem.MolToSmiles(set_all_H_to_explicit(mol))
        for mol in (reactants, edited_products)
    ]

    return ">".join([lhs_out, reagents_smi, rhs_out]), None

In [6]:
def runner(reactions, func, nprocs=1):
    """
    Apply a function to every reaction and separate successes from failures.

    Args:
        reactions: reactions to process
        func: function applied to each reaction; it should return a tuple of
            `(value, error)`, where `error` is `None` on success
        nprocs: number of processes. If `1`, the reactions are processed in the
            current process; otherwise a `multiprocessing.Pool` of `nprocs`
            processes is used.

    Returns:
        succeeded: list of `(index, value)` for reactions with no error
        failed: list of `(index, error)` for reactions with an error
    """
    if nprocs != 1:
        with multiprocessing.Pool(nprocs) as pool:
            outcomes = pool.map(func, reactions)
    else:
        outcomes = list(map(func, reactions))

    succeeded, failed = [], []
    for index, (value, error) in enumerate(outcomes):
        if error is None:
            succeeded.append((index, value))
        else:
            failed.append((index, error))

    return succeeded, failed

### Read SMILES reactions

In [7]:
# Raw reaction file to process. The active path is the `_n200` file; the
# commented-out path is the file without that suffix.
# filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles.rsmi"
filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles_n200.rsmi"

# `path` is used by the cells below to derive the names of the output files
path = to_path(filename)
smiles_rxns = read_file(filename)

### Canonicalize the reactions

In [8]:
# NOTE: with python 3.8, multiprocessing on macOS defaults to `spawn` for creating
# processes, which does not work in Jupyter. Force the start method to `fork`.
if __name__ == "__main__":
    multiprocessing.set_start_method("fork", force=True)
    succeeded, failed = runner(
        smiles_rxns, canonicalize_smiles_reaction, nprocs=4)

In [9]:
# Write the successfully canonicalized reactions to a tab-separated file placed
# next to the input file: index, original reaction, canonical reaction.

fname = path.parent.joinpath(f'{path.stem}_succeeded{path.suffix}')
with open(fname, 'w') as f:
    rows = ['index\toriginal_reaction\tcanonical_reaction\n']
    rows.extend(f'{i}\t{smiles_rxns[i]}\t{smi}\n' for i, smi in succeeded)
    f.writelines(rows)

# Optional: also save images of the original and the edited reactions.
# for i, smi in succeeded:
#     fname = path.parent.joinpath('image', f'{i}_original' + '.png')
#     plot_smiles_reaction(smiles_rxns[i], fname)
#     fname = path.parent.joinpath('image', f'{i}_edited' + '.png')
#     plot_smiles_reaction(smi, fname)

In [10]:
# Write the reactions that could not be canonicalized to a tab-separated file
# placed next to the input file: index, original reaction, error message.

fname = path.parent.joinpath(f'{path.stem}_failed{path.suffix}')
with open(fname, 'w') as f:
    rows = ['index\toriginal_reaction\terror\n']
    rows.extend(f'{i}\t{smiles_rxns[i]}\t{error}\n' for i, error in failed)
    f.writelines(rows)

In [11]:
# Generate the dataset for training: one canonicalized reaction per row, together
# with a label column.

label = 'to be filled'  # placeholder, the same value is written for every row
fname = path.parent.joinpath(f'{path.stem}_processed.tsv')
with open(fname, 'w') as f:
    f.write('reaction\tlabel\n')
    f.writelines(f'{smi}\t{label}\n' for _, smi in succeeded)

### Get changed bonds

In [12]:
# succeeded, failed = runner(smiles_rxns, get_bond_change, nprocs=4)

# # write succeeded to file

# fname = path.parent.joinpath(path.stem + '_succeeded_changes' + path.suffix)
# with open(fname, 'w') as f:
#     f.write('index\toriginal_reaction\tcanonical_reaction\tbond_change\n')
#     for i, (rxn, bond_change) in succeeded:
#         f.write(f'{i}\t{smiles_rxns[i]}\t{rxn}\t{bond_change}\n')

### Check reaction atom mapping

In [13]:
# mapped = check_all_reactions_atom_mapped(smiles_rxns,
#                                          nprocs=6,
#                                          print_result=True)

In [14]:
# mapped = check_all_reactions_bonds_mapped(smiles_rxns,
#                                          nprocs=6,
#                                          print_result=True)