# Clean the raw USPTO dataset 

The raw dataset can be obtained from https://doi.org/10.6084/m9.figshare.5104873  

We remove reactions for which:

- charge is not conserved between reactants and products
- composition is not conserved between reactants and products
- atom mapping between reactants and products is problematic (each atom in the reactants and products should be mapped once and only once)


In [1]:
import pandas as pd
import multiprocessing
from rdkit.Chem import Mol
from collections import defaultdict
from typing import List, Tuple, Dict
from rxnrep.core.molecule import Molecule, MoleculeCreationError
from rxnrep.core.reaction import Reaction, smiles_to_reaction, ReactionSanityCheckError
from rxnrep.dataset.rdkit_utils import get_reaction_atom_mapping
from rxnrep.utils import to_path
from rdkit import RDLogger

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress RDKit warnings: only log messages at CRITICAL level

In [2]:
def process_one_reaction(rxn, id=None):
    """
    Build a Reaction from a reaction SMILES string and validate it.

    The reaction is created, its composition and charge are checked, and the
    atom mapping between reactants and products is extracted and attached.

    Args:
        rxn: reaction SMILES string.
        id: optional identifier forwarded to `smiles_to_reaction`.

    Returns:
        A two-tuple `(reaction, error)`:
        - on success, the created reaction and the string 'None';
        - if a MoleculeCreationError or ReactionSanityCheckError is raised,
          `None` and the exception message.
        `error` is always a string so that its dtype is the same whether or
        not the processing failed.
    """
    try:
        reaction = smiles_to_reaction(rxn, id)

        # sanity checks on the reaction as a whole
        reaction.check_composition()
        reaction.check_charge()

        # atom mapping is computed from the underlying rdkit molecules
        mapping = get_reaction_atom_mapping(
            [mol.rdkit_mol for mol in reaction.reactants],
            [mol.rdkit_mol for mol in reaction.products],
        )

        # setting the mapping also triggers the atom mapping check
        reaction.set_atom_mapping(mapping)

    except (MoleculeCreationError, ReactionSanityCheckError) as exc:
        return None, str(exc)

    return reaction, 'None'

In [3]:
def filter_out_bad_reactions(filename, nprocs=1):
    """
    Read raw reaction SMILES from a file and split them into good and bad ones.

    The file is read as a tab-separated table and the reactions are taken from
    its `ReactionSmiles` column. Each reaction is passed to
    `process_one_reaction`, with its row position used as the reaction id.

    Args:
        filename: path to the tab-separated file of raw reactions.
        nprocs: number of processes. With 1 the reactions are processed
            serially; otherwise a multiprocessing pool of this size is used.

    Returns:
        succeeded reactions: list of three-tuple (index, smiles, reaction)
        failed reactions: list of three-tuple (index, smiles, failed_reason)
    """

    print('Start to read smiles string', flush=True)

    table = pd.read_csv(filename, sep="\t")
    raw_entries = table["ReactionSmiles"].values.tolist()
    # keep only the first whitespace-separated token, which drops the trailing
    # part in f||, like f|0.1.2|
    smiles_rxns = [entry.split()[0] for entry in raw_entries]

    print('Start to convert to reactions', flush=True)

    if nprocs == 1:
        results = []
        for idx, smi in enumerate(smiles_rxns):
            results.append(process_one_reaction(smi, idx))
    else:
        job_args = zip(smiles_rxns, range(len(smiles_rxns)))
        with multiprocessing.Pool(processes=nprocs) as pool:
            results = pool.starmap(process_one_reaction, job_args)

    print('Start to group succeeded and failed reactions', flush=True)

    succeeded, failed = [], []
    for idx, (smi, (rxn, error)) in enumerate(zip(smiles_rxns, results)):
        if rxn is None:
            failed.append((idx, smi, error))
        else:
            succeeded.append((idx, smi, rxn))

    return succeeded, failed

## Get the succeeded and failed reactions

In [4]:
# Raw reaction file to clean.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook elsewhere.
# NOTE(review): the `_n200` suffix suggests a small subset of the full dataset
# used for quick runs -- confirm.
filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles_n200.rsmi"
path = to_path(filename)
# Process the file with 4 worker processes.
# NOTE(review): the workers call a function defined in this notebook; with the
# `spawn` start method this may not work from a notebook kernel -- confirm on
# the target platform, or pass nprocs=1.
succeeded_rxns, failed_rxns = filter_out_bad_reactions(path, 4)

# Same steps for the file without the `_n200` suffix, using 8 worker processes
# (currently disabled):
# filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles.rsmi"
# path = to_path(filename)
# succeeded_rxns, failed_rxns = filter_out_bad_reactions(path, 8)

Start to read smiles string
Start to convert to reactions
Start to group succeeded and failed reactions


### Write succeeded reactions to file

In [5]:
# Output file sits next to the input file, with `_succeeded` appended to the stem.
fname = path.parent / (path.stem + '_succeeded' + path.suffix)
with open(fname, 'w') as f:
    # header line, then one `index<TAB>smiles` line per succeeded reaction
    f.write('index\tsmiles\n')
    f.writelines(f'{i}\t{smi}\n' for i, smi, _ in succeeded_rxns)

### Write failed reactions to file

In [6]:
# Output file sits next to the input file, with `_failed` appended to the stem.
fname = path.parent.joinpath(path.stem + '_failed' + path.suffix)

# Keywords searched for in the error message to classify a failure; the first
# keyword found wins. Failures matching none of them are counted as 'other',
# so that the counts always sum to the number of failed reactions.
failed_keywords = ['composition', 'charge']
failed_reason = defaultdict(int)

with open(fname, 'w') as f:
    f.write('index\tsmiles\tfailed_reason\n')
    for i, smi, error in failed_rxns:
        # keep each record on a single tab-separated line, even if the error
        # message itself contains tabs or line breaks
        error_one_line = (
            error.replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
        )
        f.write(f'{i}\t{smi}\t{error_one_line}\n')

        for kw in failed_keywords:
            if kw in error:
                failed_reason[kw] += 1
                break
        else:
            # no keyword matched: do not silently drop it from the statistics
            failed_reason['other'] += 1

print("Failed reason statistics:", failed_reason)

Failed reason statistics: defaultdict(<class 'int'>, {'composition': 194})
