# Clean the raw USPTO dataset 

The raw dataset can be obtained from https://doi.org/10.6084/m9.figshare.5104873  


In [1]:
import multiprocessing
import numpy as np
import pandas as pd
from typing import Union, Tuple, List

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import AllChem

from rxnrep.dataset.uspto_utils import (
    check_all_reactions_atom_mapped,
    check_all_reactions_bonds_mapped,
    adjust_reagents,
    adjust_atom_map_number,
    get_reaction_bond_change,
    MoleculeCreationError,
    AtomMapNumberError,
    canonicalize_smiles_reaction,
    canonicalize_smiles_reaction_by_adding_nonexist_atoms_and_bonds,
)
from rxnrep.data.grapher import (
    get_atom_distance_to_reaction_center,
    get_bond_distance_to_reaction_center,
)
from rxnrep.core.molecule import MoleculeError
from rxnrep.core.reaction import smiles_to_reaction, ReactionError
from rxnrep.utils import to_path

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress rdkit warnings

Using backend: pytorch


In [2]:
def plot_smiles_reaction(reaction, filename):
    """
    Plot a smiles reaction to file.

    Args:
        reaction: reaction SMILES string; parsed with
            `AllChem.ReactionFromSmarts(..., useSmiles=True)`.
        filename: destination of the image; handed to the `save` method of the
            image object returned by `Draw.ReactionToImage`.
    """
    # `Draw` is a subpackage of `rdkit.Chem`; `from rdkit import Chem` alone is not
    # guaranteed to load it, so `Chem.Draw` may raise AttributeError. Import it
    # explicitly here instead.
    from rdkit.Chem import Draw

    rxn = AllChem.ReactionFromSmarts(reaction, useSmiles=True)
    image = Draw.ReactionToImage(rxn)
    image.save(filename)

In [3]:
def split_train_val_test(data: List, val_ratio=0.1, test_ratio=0.1, random_seed=35):
    """
    Split dataset into training, validation, and test sets.

    Args:
        data: items to split; accessed by integer position.
        val_ratio: fraction of `data` assigned to the validation set.
        test_ratio: fraction of `data` assigned to the test set.
        random_seed: seed of the shuffle. With an integer seed the split is
            reproducible and the global numpy random state is left untouched.
            If `None`, the global numpy random state is used for the shuffle.

    Returns:
        Tuple `(train_set, val_set, test_set)` of lists. The validation and test
        sizes are rounded down; the training set receives all remaining items.

    Raises:
        ValueError: if `val_ratio + test_ratio` is not smaller than 1.
    """
    if val_ratio + test_ratio >= 1.0:
        raise ValueError("validation + test >= 1")

    size = len(data)
    num_val = int(size * val_ratio)
    num_test = int(size * test_ratio)
    num_train = size - num_val - num_test

    if random_seed is None:
        idx = np.random.permutation(size)
    else:
        # A private RandomState produces the same permutation as seeding the global
        # generator with `np.random.seed`, but does not reseed it behind the
        # caller's back.
        idx = np.random.RandomState(random_seed).permutation(size)

    train_idx = idx[:num_train]
    val_idx = idx[num_train : num_train + num_val]
    test_idx = idx[num_train + num_val :]

    train_set = [data[i] for i in train_idx]
    val_set = [data[i] for i in val_idx]
    test_set = [data[i] for i in test_idx]

    return train_set, val_set, test_set

In [4]:
def check_convert_to_mp_core_reaction(smi_rxn):
    """
    Try to build a core reaction from a smiles reaction and check it is usable.

    Returns:
        A 2-tuple `(result, error)`. On success `result` is a dict with the input
        smiles under "smi_rxn" and the reaction object under "core_rxn", and
        `error` is None. On failure `result` is None and `error` is a message.
    """
    prefix = "Fail converting to core reaction: "

    try:
        core = smiles_to_reaction(
            smi_rxn, smi_rxn, ignore_reagents=True, sanity_check=False
        )
        # core.check_charge() is intentionally skipped: charge is not used in the
        # features, so a charge mismatch does not matter here.
        core.check_composition()
        core.check_atom_map_number()
    except (MoleculeError, ReactionError) as err:
        return None, prefix + str(err)

    # make sure the labels can be generated
    label_getters = (
        get_atom_distance_to_reaction_center,
        get_bond_distance_to_reaction_center,
    )
    try:
        for getter in label_getters:
            getter(core)
    except AssertionError as err:
        return None, prefix + str(err)

    return {"smi_rxn": smi_rxn, "core_rxn": core}, None

In [5]:
def runner(reactions, reaction_ids, func, nprocs=1):
    """
    Apply `func` to every reaction and sort the outcomes into successes and failures.

    `func` must return a `(value, error)` pair for each reaction; an `error` of
    None marks success. With `nprocs == 1` everything runs in the current process,
    otherwise a `multiprocessing.Pool(nprocs)` is used.

    Returns:
        `(succeeded, succeeded_ids, failed, failed_ids)`, where `succeeded` holds
        the values of successful calls, `failed` holds the errors of failed calls,
        and the `*_ids` lists hold the matching entries of `reaction_ids`.
    """
    if nprocs != 1:
        with multiprocessing.Pool(nprocs) as pool:
            outcomes = pool.map(func, reactions)
    else:
        outcomes = [func(item) for item in reactions]

    succeeded, succeeded_ids = [], []
    failed, failed_ids = [], []
    for position, (value, error) in enumerate(outcomes):
        rid = reaction_ids[position]

        # choose the pair of lists this outcome belongs to
        if error is None:
            bucket, bucket_ids, entry = succeeded, succeeded_ids, value
        else:
            bucket, bucket_ids, entry = failed, failed_ids, error

        bucket.append(entry)
        bucket_ids.append(rid)

    return succeeded, succeeded_ids, failed, failed_ids

### Read SMILES reactions

In [6]:
def read_file(filename):
    """
    Read the smiles reactions from a tab-separated file.

    The file needs a header row containing a "ReactionSmiles" column. Every entry
    is cut at its first whitespace, which removes the trailing part in f||
    (like f|0.1.2|).

    Returns:
        List of reaction smiles strings, in file order.
    """
    table = pd.read_csv(filename, sep="\t")
    raw_entries = table["ReactionSmiles"].values.tolist()

    # keep only the first whitespace-delimited token of each entry
    return [entry.split(maxsplit=1)[0] for entry in raw_entries]

In [7]:
# Location of the raw reaction file; these are absolute paths on the author's
# machine and must be adapted before running elsewhere.
# Alternative inputs kept for reference (the `_n200` name suggests a small subset
# of the full file -- TODO confirm):
# filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles.rsmi"
# filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles_n200.rsmi"
filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles_n200.rsmi"

# `path` is used by later cells to derive output file names next to the input;
# `smiles_rxns` holds the original reaction smiles, indexed by position.
path = to_path(filename)
smiles_rxns = read_file(filename)

### Canonicalize the reactions (first check)

In [8]:
# NOTE: since Python 3.8, multiprocessing defaults to the `spawn` start method on
# macOS. This does not work in Jupyter, so the start method is set to `fork`.
# `force=True` lets the start method be set again if it was already set (e.g. when
# the cell is re-run).

if __name__ == "__main__":
    multiprocessing.set_start_method("fork", force=True)

    # The id of a reaction is its position in `smiles_rxns`, so later cells can
    # look up the original smiles by id.
    reaction_ids = list(range(len(smiles_rxns)))
    succeeded1, succeeded1_ids, failed1, failed1_ids = runner(
        smiles_rxns,
        reaction_ids,
        canonicalize_smiles_reaction_by_adding_nonexist_atoms_and_bonds,
        nprocs=4,
    )

# Summary of the first check.
print("number succeeded:", len(succeeded1))
print("number failed:", len(failed1))

number succeeded: 181
number failed: 20


### Check whether the reactions can be converted to core reactions (second check)

In [9]:
if __name__ == "__main__":
    # Same start-method setting as for the first check; `force=True` allows
    # setting it again.
    multiprocessing.set_start_method("fork", force=True)

    # Only the reactions that passed the first check are tested; their ids are
    # carried along so results still refer to positions in `smiles_rxns`.
    succeeded2, succeeded2_ids, failed2, failed2_ids = runner(
        succeeded1, succeeded1_ids, check_convert_to_mp_core_reaction, nprocs=4,
    )

# Summary of the second check.
print("number succeeded:", len(succeeded2))
print("number failed:", len(failed2))

number succeeded: 181
number failed: 0


In [10]:
# Failures of both checks, first check first, with their reaction ids.
failed = [*failed1, *failed2]
failed_ids = [*failed1_ids, *failed2_ids]

# Attach the reaction id to every reaction that passed both checks.
succeeded = [
    {"smi_rxn": rxn["smi_rxn"], "core_rxn": rxn["core_rxn"], "idx": i}
    for rxn, i in zip(succeeded2, succeeded2_ids)
]

### write succeeded (with original smi input)

In [11]:
# Output goes next to the input file: <stem>_succeeded<suffix>.
fname = path.parent.joinpath(f"{path.stem}_succeeded{path.suffix}")
with open(fname, "w") as f:
    f.write("index\toriginal_reaction\tcanonical_reaction\n")
    # one row per reaction: id, original smiles (looked up by id), canonical smiles
    f.writelines(
        f"{entry['idx']}\t{smiles_rxns[entry['idx']]}\t{entry['smi_rxn']}\n"
        for entry in succeeded
    )

# Optional (disabled): save an image of the original and edited reaction for
# each entry of `succeeded`, with i = entry["idx"] and smi = entry["smi_rxn"]:
#     fname = path.parent.joinpath('image', f'{i}_original' + '.png')
#     plot_smiles_reaction(smiles_rxns[i], fname)
#     fname = path.parent.joinpath('image', f'{i}_edited' + '.png')
#     plot_smiles_reaction(smi, fname)

### write failed (with original smi input)

In [12]:
# Output goes next to the input file: <stem>_failed<suffix>.
fname = path.parent.joinpath(f"{path.stem}_failed{path.suffix}")
with open(fname, "w") as f:
    f.write("index\toriginal_reaction\terror\n")
    # one row per failed reaction: id, original smiles (looked up by id), error
    f.writelines(
        f"{rid}\t{smiles_rxns[rid]}\t{message}\n"
        for rid, message in zip(failed_ids, failed)
    )

### Write dataset for training

In [13]:
def write_dataset_for_training(data, fname):
    """
    Write reactions to a tab-separated file with columns reaction, label, raw id.

    Args:
        data: iterable of dicts, each with the reaction smiles under "smi_rxn" and
            the reaction id under "idx".
        fname: path of the file to write (overwritten if it exists).
    """
    label = None  # no label is available; it is written as the text "None"

    with open(fname, "w") as out:
        out.write("reaction\tlabel\traw id\n")
        for record in data:
            raw_id, smiles = record["idx"], record["smi_rxn"]
            out.write(f"{smiles}\t{label}\t{raw_id}\n")

In [14]:
def get_species(dataset):
    """
    Collect the species that occur in a train/test/val dataset.

    Each item of `dataset` is a dict whose "core_rxn" entry has a `species`
    attribute. Returns the sorted list of distinct species over all items.
    """
    return sorted({sp for item in dataset for sp in item["core_rxn"].species})


def remove_rare_rxn(dataset, species: List[str]):
    """
    Keep only the reactions of `dataset` whose species all appear in `species`.

    Typically `species` comes from the training set and `dataset` is the test or
    validation set. Reactions containing species unseen in the training set are
    few, so they are simply dropped.

    Returns:
        A new list with the retained items of `dataset`, in their original order.
    """
    allowed = set(species)
    return [item for item in dataset if set(item["core_rxn"].species) <= allowed]

In [15]:
# Generate the dataset for training.
# The split uses the defaults of `split_train_val_test` (10% validation, 10% test,
# fixed random seed), so it is the same on every run.
train_set, val_set, test_set = split_train_val_test(succeeded)

# The species must be collected from the training set before filtering: validation
# and test reactions containing species absent from the training set are dropped.
# The training set itself is not filtered.
train_set_species = get_species(train_set)
val_set = remove_rare_rxn(val_set, train_set_species)
test_set = remove_rare_rxn(test_set, train_set_species)

# The three files are written next to the input file, named after its stem.
tr_fname = path.parent.joinpath(path.stem + "_processed_train.tsv")
write_dataset_for_training(train_set, tr_fname)

val_fname = path.parent.joinpath(path.stem + "_processed_val.tsv")
write_dataset_for_training(val_set, val_fname)

te_fname = path.parent.joinpath(path.stem + "_processed_test.tsv")
write_dataset_for_training(test_set, te_fname)