# Clean the raw USPTO dataset 

The raw dataset can be obtained from https://doi.org/10.6084/m9.figshare.5104873  


In [1]:
import multiprocessing
import numpy as np
import pandas as pd
from typing import Union, Tuple, List

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import AllChem

from rxnrep.dataset.uspto_utils import (
    check_all_reactions_atom_mapped,
    check_all_reactions_bonds_mapped,
    adjust_reagents,
    adjust_atom_map_number,
    get_reaction_bond_change,
    MoleculeCreationError,
    AtomMapNumberError,
    canonicalize_smiles_reaction,
    canonicalize_smiles_reaction_by_adding_nonexist_atoms_and_bonds,
)
from rxnrep.core.molecule import MoleculeError
from rxnrep.core.reaction import smiles_to_reaction, ReactionError
from rxnrep.utils import to_path

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # supress rdkit warnings

In [2]:
def read_file(filename):
    """
    Load the reaction SMILES stored in a tab-separated file.

    The file is parsed with pandas and must provide a `ReactionSmiles` column.
    Only the first whitespace-delimited token of every entry is kept, which drops
    any trailing part (e.g. the `f|0.1.2|` part) that follows the reaction.

    Args:
        filename: path to the tab-separated file.

    Returns:
        List of reaction SMILES strings, in the order they appear in the file.
    """
    table = pd.read_csv(filename, sep="\t")
    raw_entries = table["ReactionSmiles"].values.tolist()

    # the reaction is the first token; whatever follows the whitespace is discarded
    return [entry.split(maxsplit=1)[0] for entry in raw_entries]

In [3]:
def plot_smiles_reaction(reaction, filename):
    """
    Draw a reaction given as a SMILES string and save the image to file.

    Args:
        reaction: reaction SMILES string.
        filename: destination of the image; it is passed as is to the `save` method
            of the image returned by RDKit.
    """
    # `Draw` is a submodule of `rdkit.Chem`; it is only reachable as `Chem.Draw` if
    # something else happened to import it already. Import it explicitly so that
    # the function does not depend on that.
    from rdkit.Chem import Draw

    rxn = AllChem.ReactionFromSmarts(reaction, useSmiles=True)
    image = Draw.ReactionToImage(rxn)
    image.save(filename)

In [4]:
def runner(reactions, func, nprocs=1):
    """
    Apply `func` to every reaction, optionally using a pool of processes.

    Args:
        reactions: sequence of inputs passed one by one to `func`.
        func: callable returning a `(value, error)` pair, where `error` is None
            when the call succeeded.
        nprocs: number of processes. With 1, everything runs in the current
            process; otherwise a `multiprocessing.Pool` of that size is used.

    Returns:
        succeeded: list of `(index, value)` for the inputs whose error is None.
        failed: list of `(index, error)` for the other inputs.
        The index is the position of the input in `reactions`.
    """
    if nprocs != 1:
        with multiprocessing.Pool(nprocs) as pool:
            outcomes = pool.map(func, reactions)
    else:
        outcomes = list(map(func, reactions))

    succeeded, failed = [], []
    for index, (value, error) in enumerate(outcomes):
        if error is None:
            succeeded.append((index, value))
        else:
            failed.append((index, error))

    return succeeded, failed

### Read the SMILES reactions

In [5]:
# Input file (the `_n200` variant of the raw file). To process the complete raw file,
# use this path instead:
# filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles.rsmi"
filename = "/Users/mjwen/Documents/Dataset/uspto/raw/2001_Sep2016_USPTOapplications_smiles_n200.rsmi"

smiles_rxns = read_file(filename)

# `path` is used by the cells below to derive the names of the output files
path = to_path(filename)

### Canonicalize the reactions

In [6]:
# NOTE: starting with Python 3.8, multiprocessing on macOS creates processes with
# `spawn` by default, which does not work in Jupyter; `fork` is forced instead.
if __name__ == "__main__":
    multiprocessing.set_start_method("fork", force=True)
    succeeded, failed = runner(
        reactions=smiles_rxns,
        # alternative canonicalization routine: canonicalize_smiles_reaction
        func=canonicalize_smiles_reaction_by_adding_nonexist_atoms_and_bonds,
        nprocs=4,
    )

# summary of the canonicalization
print("number succeeded:", len(succeeded))
print("number failed", len(failed))

number succeeded: 180
number failed 20


In [7]:
# Save the reactions that were canonicalized to a tab-separated file placed next to
# the input file: `<input stem>_succeeded<input suffix>`.
fname = path.parent.joinpath(path.stem + "_succeeded" + path.suffix)
with open(fname, "w") as f:
    f.write("index\toriginal_reaction\tcanonical_reaction\n")
    # one row per reaction: index in the input, original smiles, canonical smiles
    f.writelines(f"{i}\t{smiles_rxns[i]}\t{smi}\n" for i, smi in succeeded)

# To also save images of the original and the canonicalized reactions, uncomment:
# for i, smi in succeeded:
#     plot_smiles_reaction(smiles_rxns[i], path.parent.joinpath("image", f"{i}_original.png"))
#     plot_smiles_reaction(smi, path.parent.joinpath("image", f"{i}_edited.png"))

In [8]:
# Save the reactions that could not be canonicalized, together with the error, to a
# tab-separated file placed next to the input file: `<input stem>_failed<input suffix>`.
fname = path.parent.joinpath(path.stem + "_failed" + path.suffix)
with open(fname, "w") as f:
    f.write("index\toriginal_reaction\terror\n")
    # one row per reaction: index in the input, original smiles, error message
    f.writelines(f"{i}\t{smiles_rxns[i]}\t{error}\n" for i, error in failed)

### Write the dataset for training

In [9]:
def split_train_val_test(
    data: List, val_ratio=0.1, test_ratio=0.1, random_seed=35
) -> Tuple[List, List, List]:
    """
    Randomly split a dataset into training, validation, and test sets.

    The validation and test sets get `int(len(data) * ratio)` items each; all the
    remaining items go to the training set.

    Args:
        data: the items to split.
        val_ratio: fraction of the data to put in the validation set.
        test_ratio: fraction of the data to put in the test set.
        random_seed: seed of the shuffling. With a seed, the split is reproducible
            and numpy's global random state is left untouched. If None, the data
            is shuffled with numpy's global random state.

    Returns:
        The training, validation, and test sets, as lists.
    """
    assert val_ratio + test_ratio < 1.0, "validation + test >= 1"
    size = len(data)
    num_val = int(size * val_ratio)
    num_test = int(size * test_ratio)
    num_train = size - num_val - num_test

    # A dedicated RandomState gives the same permutation as seeding the global
    # generator with the same seed, without reseeding it for the rest of the program.
    if random_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_seed)

    idx = rng.permutation(size)
    train_idx = idx[:num_train]
    val_idx = idx[num_train : num_train + num_val]
    test_idx = idx[num_train + num_val :]

    train_set = [data[i] for i in train_idx]
    val_set = [data[i] for i in val_idx]
    test_set = [data[i] for i in test_idx]

    return train_set, val_set, test_set


def write_dataset_for_training(data, fname):
    """
    Write reactions to a tab-separated file with a `reaction` and a `label` column.

    Args:
        data: iterable of `(index, reaction smiles)` pairs; the index is not written.
        fname: path of the file to write.

    The label of every reaction is None, so the label column holds the text `None`.
    """
    label = None  # the same placeholder label is written for every reaction
    with open(fname, "w") as f:
        f.write("reaction\tlabel\n")
        f.writelines(f"{smi}\t{label}\n" for _, smi in data)

In [10]:
# Split the canonicalized reactions (default ratios and seed) and write each subset
# to its own file next to the input file.
train_set, val_set, test_set = split_train_val_test(succeeded)

# output files; the names are reused below to check the written datasets
tr_fname = path.parent.joinpath(path.stem + "_processed_train.tsv")
val_fname = path.parent.joinpath(path.stem + "_processed_val.tsv")
te_fname = path.parent.joinpath(path.stem + "_processed_test.tsv")

write_dataset_for_training(train_set, tr_fname)
write_dataset_for_training(val_set, val_fname)
write_dataset_for_training(test_set, te_fname)

### Check that the SMILES can be converted to core reactions

In [11]:
def convert_one(smi):
    """
    Try to convert a reaction SMILES string with `smiles_to_reaction`.

    The SMILES string is passed as both the first and the second positional
    argument; reagents are ignored and the sanity check is enabled.

    Returns:
        `(reaction, None)` on success, or `(None, error message)` if the conversion
        raises a MoleculeError or a ReactionError. This is the `(value, error)`
        shape that `runner` unpacks.
    """
    try:
        reaction = smiles_to_reaction(
            smi, smi, ignore_reagents=True, sanity_check=True
        )
    except (MoleculeError, ReactionError) as err:
        return None, str(err)

    return reaction, None


def check_reactions(filename):
    """
    Convert every reaction of a dataset file and print the ones that fail.

    The reactions are read from the `reaction` column of a tab-separated file and
    converted with `convert_one` using 4 processes. For each failure, the index of
    the reaction and the error message are printed.

    Args:
        filename: path to the tab-separated dataset file.
    """
    table = pd.read_csv(filename, sep="\t")
    _, failed = runner(table["reaction"].tolist(), convert_one, nprocs=4)

    for index, message in failed:
        # reactions whose charges do not balance are not reported (it happens)
        if message is None or "The sum of reactant charges" in message:
            continue
        print(index, message)


# Check each of the written datasets in turn, printing a header before its failures.
if __name__ == "__main__":
    for header, dataset_file in (
        ("\nTraining set:", tr_fname),
        ("\nVal set:", val_fname),
        ("\nTest set:", te_fname),
    ):
        print(header)
        check_reactions(dataset_file)


Training set:

Val set:

Test set:
