# Clean the USPTO dataset processed by Jin

The dataset is used in the following two papers:
- Predicting Organic Reaction Outcomes with Weisfeiler-Lehman Network
- A graph-convolutional neural network model for the prediction of chemical reactivity

The dataset files can be obtained from:
- https://github.com/wengong-jin/nips17-rexgen/tree/master/USPTO
- https://github.com/connorcoley/rexgen_direct/tree/master/rexgen_direct


In [1]:
import multiprocessing
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rxnrep.dataset.rdkit_utils import (
    check_all_reactions_atom_mapped, check_all_reactions_bonds_mapped,
    adjust_reagents, adjust_atom_map_number, get_reaction_bond_change,
    set_all_H_to_explicit, edit_molecule, MoleculeCreationError,
    AtomMapNumberError, get_atom_property_as_dict)
from rxnrep.utils import to_path
from rdkit import RDLogger
from rdkit.Chem import KekulizeException, AtomValenceException, AtomKekulizeException
from typing import Union, Tuple, List

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress rdkit warnings

In [2]:
def read_file(filename):
    """
    Read smiles reactions from a whitespace-delimited text file.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields: the reaction smiles followed by a label. The label is checked for
    presence but is not returned.

    Args:
        filename: path to the file; it is converted with `to_path` before opening.

    Returns:
        List of reaction smiles strings, in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    smiles_rxns = []
    with open(to_path(filename), 'r') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()

            # tolerate blank lines (e.g. a trailing empty line at the end of file)
            if not fields:
                continue

            if len(fields) != 2:
                raise ValueError(
                    f"{filename}, line {line_no}: expected 2 fields "
                    f"(reaction, label), got {len(fields)}")

            smiles_rxns.append(fields[0])

    return smiles_rxns

In [3]:
def plot_smiles_reaction(reaction, filename):
    """
    Plot a smiles reaction to file.

    Args:
        reaction: smiles representation of a reaction
        filename: destination handed to the `save` method of the generated image
    """
    # `Draw` is a submodule of `rdkit.Chem`; import it explicitly instead of
    # relying on it having been loaded as a side effect of some other import.
    from rdkit.Chem import Draw

    rxn = AllChem.ReactionFromSmarts(reaction, useSmiles=True)
    image = Draw.ReactionToImage(rxn)
    image.save(filename)

In [4]:
def get_bond_change(
        reaction: str) -> Tuple[Union[Tuple[str, str], None], Union[None, str]]:
    """
    Get bond changes of a reaction.

    Args:
        reaction: smiles representation of a reaction

    Returns:
        result: `(rxn, bond_changes)` on success, `None` on failure. `rxn` is the
            reaction smiles rebuilt from the adjusted reactants, the reagents and
            the adjusted products. `bond_changes` is a string in which the items
            are joined by ";" and the entries of each item are joined by "-".
        error: error message, `None` on success
    """

    # Step 1, adjust reagents
    try:
        rxn_smi = adjust_reagents(reaction)
    except (MoleculeCreationError, AtomMapNumberError) as e:
        return None, str(e).rstrip()

    # Step 2, adjust atom mapping
    try:
        reactants_smi, reagents_smi, products_smi = rxn_smi.strip().split(">")
        reactants = Chem.MolFromSmiles(reactants_smi)
        products = Chem.MolFromSmiles(products_smi)
        reactants, products = adjust_atom_map_number(reactants, products)
    except AtomMapNumberError as e:
        return None, str(e).rstrip()

    # Step 3, get bond changes
    # NOTE(review): `canonicalize_smiles_reaction` in this file unpacks the result
    # of `get_reaction_bond_change` into four values, whereas here the result is
    # iterated directly as a collection of bond changes. Confirm the return shape
    # when `use_mapped_atom_index=True` is passed.
    bond_changes = get_reaction_bond_change(reactants,
                                            products,
                                            use_mapped_atom_index=True)
    # serialize, e.g. items (a, b, c) and (d, e, f) become "a-b-c;d-e-f"
    bond_changes = ";".join(
        ['-'.join([str(i) for i in x]) for x in bond_changes])

    # reagents are carried over as the smiles string obtained in step 2
    rxn = '>'.join([
        Chem.MolToSmiles(reactants), reagents_smi,
        Chem.MolToSmiles(products)
    ])

    return (rxn, bond_changes), None

In [5]:
def canonicalize_smiles_reaction(
        reaction: str) -> Tuple[Union[str, None], Union[None, str]]:
    """
    Canonicalize a smiles reaction so that its reactants and products have the
    same composition.

    The canonicalization proceeds in three steps:

    1. remove from the reactants the molecules none of whose atoms are present in
       the products
    2. adjust the atom mapping between reactants and products, and give a mapping
       number to reactant atoms that lack one (even though such atoms have no
       counterpart in the products)
    3. build new products by editing the reactants: bonds that exist only in the
       reactants are removed and bonds that exist only in the products are added

    Args:
        reaction: smiles representation of a reaction

    Returns:
        reaction: canonicalized smiles reaction, `None` if canonicalization failed
        error: error message, `None` if canonicalization succeeded
    """

    # Step 1: adjust reagents
    try:
        adjusted = adjust_reagents(reaction)
    except (MoleculeCreationError, AtomMapNumberError) as err:
        return None, str(err).rstrip()

    # Step 2: split into the three reaction parts and adjust the atom mapping
    reactants_part, reagents_part, products_part = adjusted.strip().split(">")
    try:
        reactant_mol, product_mol = adjust_atom_map_number(
            Chem.MolFromSmiles(reactants_part),
            Chem.MolFromSmiles(products_part))
    except AtomMapNumberError as err:
        return None, str(err).rstrip()

    # Step 3: create the new products from the reactants
    try:
        changes, lost, added, _ = get_reaction_bond_change(
            reactant_mol, product_mol)

        # reactions that only change bond types (nothing lost, nothing added)
        # are skipped
        if not lost and not added:
            return None, "reactions with only bond type changes"

        product_atom_props = get_atom_property_as_dict(product_mol)
        edited_products = edit_molecule(reactant_mol, changes,
                                        product_atom_props)
    # AtomValenceException used to be listed here as well; it is currently not
    # caught and therefore propagates to the caller.
    except (KekulizeException, AtomKekulizeException) as err:
        return None, str(err).rstrip()

    # assemble the canonicalized reaction smiles with all H made explicit
    lhs = Chem.MolToSmiles(set_all_H_to_explicit(reactant_mol))
    rhs = Chem.MolToSmiles(set_all_H_to_explicit(edited_products))

    return ">".join((lhs, reagents_part, rhs)), None

In [6]:
def runner(reactions, func, nprocs=1):
    """
    Apply `func` to every reaction and separate successes from failures.

    Args:
        reactions: sequence of inputs handed to `func` one by one
        func: callable returning a `(value, error)` pair, where `error` is `None`
            on success
        nprocs: number of processes; `1` runs in the current process, any other
            value maps `func` over `reactions` with a `multiprocessing.Pool`

    Returns:
        succeeded: list of `(index, value)` for the inputs whose error is `None`
        failed: list of `(index, error)` for the remaining inputs
    """
    if nprocs != 1:
        with multiprocessing.Pool(nprocs) as pool:
            outcomes = pool.map(func, reactions)
    else:
        outcomes = [func(item) for item in reactions]

    # indices refer to positions in `reactions`
    succeeded = [(idx, value)
                 for idx, (value, error) in enumerate(outcomes)
                 if error is None]
    failed = [(idx, error)
              for idx, (value, error) in enumerate(outcomes)
              if error is not None]

    return succeeded, failed

### read smiles reaction

In [7]:
# Input file with the reactions to process. Two alternative files are listed;
# exactly one assignment should be active.
# NOTE(review): these are absolute, machine-specific paths.
# filename = "/Users/mjwen/Documents/Dataset/uspto/Jin/test.txt"
filename = "/Users/mjwen/Documents/Dataset/uspto/Jin/test_n200.txt"
# `path` is reused by later cells to place the output files next to the input file
path = to_path(filename)
smiles_rxns = read_file(filename)

### Canonicalize the reactions

In [8]:
# NOTE: Python 3.8 multiprocessing defaults to `spawn` for creating processes on
# macOS. This does not work in Jupyter, so set the start method to `fork`.
if __name__ == '__main__':
    multiprocessing.set_start_method('fork', force=True)
    # `succeeded` and `failed` are consumed by the cells below
    succeeded, failed = runner(smiles_rxns,
                               canonicalize_smiles_reaction,
                               nprocs=4)

In [9]:
# write succeeded to file
# The output is a tab-separated file placed next to the input file, named
# `<input stem>_succeeded<input suffix>`.

fname = path.parent.joinpath(path.stem + '_succeeded' + path.suffix)
with open(fname, 'w') as f:
    f.write('index\toriginal_reaction\tcanonical_reaction\n')
    # `i` is the position of the reaction in `smiles_rxns`
    for i, smi in succeeded:
        f.write(f'{i}\t{smiles_rxns[i]}\t{smi}\n')

#         # save image
#         fname = fname = path.parent.joinpath('image', f'{i}_original' + '.png')
#         plot_smiles_reaction(smiles_rxns[i], fname)
#         fname = fname = path.parent.joinpath('image', f'{i}_edited' + '.png')
#         plot_smiles_reaction(smi, fname)

In [10]:
# write failed to file
# The output is a tab-separated file placed next to the input file, named
# `<input stem>_failed<input suffix>`.

fname = path.parent.joinpath(path.stem + '_failed' + path.suffix)
with open(fname, 'w') as f:
    f.write('index\toriginal_reaction\terror\n')
    # `i` is the position of the reaction in `smiles_rxns`
    for i, error in failed:
        f.write(f'{i}\t{smiles_rxns[i]}\t{error}\n')

### Write dataset for training

In [11]:
def split_train_val_test(data: List,
                         val_ratio=0.1,
                         test_ratio=0.1,
                         random_seed=35):
    """
    Split dataset into training, validation, and test sets.

    The validation and test sizes are `int(len(data) * ratio)` (rounded down);
    the training set receives all remaining items.

    Args:
        data: items to split
        val_ratio: fraction of the data used for validation
        test_ratio: fraction of the data used for testing
        random_seed: seed for the shuffling. If `None`, the global numpy random
            state is used and the split is not reproducible on its own.

    Returns:
        train_set, val_set, test_set: three lists holding the shuffled items
    """
    assert val_ratio + test_ratio < 1.0, "validation + test >= 1"
    size = len(data)
    num_val = int(size * val_ratio)
    num_test = int(size * test_ratio)
    num_train = size - num_val - num_test

    # Use a local generator when a seed is given, so that the global numpy random
    # state is not reseeded as a side effect. `RandomState(seed)` yields the same
    # permutation as `np.random.seed(seed)` followed by `np.random.permutation`.
    if random_seed is not None:
        rng = np.random.RandomState(random_seed)
    else:
        rng = np.random

    idx = rng.permutation(size)
    train_idx = idx[:num_train]
    val_idx = idx[num_train:num_train + num_val]
    test_idx = idx[num_train + num_val:]

    train_set = [data[i] for i in train_idx]
    val_set = [data[i] for i in val_idx]
    test_set = [data[i] for i in test_idx]

    return train_set, val_set, test_set


def write_dataset_for_training(data, fname):
    """
    Write reactions to a tab-separated file with a `reaction` and a `label` column.

    Args:
        data: iterable of `(index, smiles)` pairs; only the smiles is written
        fname: path of the file to write (overwritten if it exists)
    """
    # every row currently gets the same placeholder label
    placeholder = 'to be filled'

    with open(fname, 'w') as out:
        out.write('reaction\tlabel\n')
        out.writelines(f'{smi}\t{placeholder}\n' for _, smi in data)

In [12]:
# generate dataset for training
train_set, val_set, test_set = split_train_val_test(succeeded)

# Write each split to its own file next to the input file. Every file must
# receive its own split (not the training set).
fname = path.parent.joinpath(path.stem + '_processed_train.tsv')
write_dataset_for_training(train_set, fname)

fname = path.parent.joinpath(path.stem + '_processed_val.tsv')
write_dataset_for_training(val_set, fname)

fname = path.parent.joinpath(path.stem + '_processed_test.tsv')
write_dataset_for_training(test_set, fname)

### Get changed bonds

In [13]:
# succeeded, failed = runner(smiles_rxns, get_bond_change, nprocs=4)

# # write succeeded to file

# fname = path.parent.joinpath(path.stem + '_succeeded_changes' + path.suffix)
# with open(fname, 'w') as f:
#     f.write('index\toriginal_reaction\tcanonical_reaction\tbond_change\n')
#     for i, (rxn, bond_change) in succeeded:
#         f.write(f'{i}\t{smiles_rxns[i]}\t{rxn}\t{bond_change}\n')

### check reactions atom mapping

In [14]:
# Run the atom-mapping check over all loaded reactions.
# NOTE(review): `nprocs` is presumably the number of worker processes and
# `print_result=True` presumably prints the summary shown in the cell output;
# confirm against `rxnrep.dataset.rdkit_utils`.
mapped = check_all_reactions_atom_mapped(smiles_rxns,
                                         nprocs=6,
                                         print_result=True)

Reactants and products in all reactions are mapped.
Done!


In [15]:
# Run the bond-mapping check over all loaded reactions; this rebinds `mapped`
# from the previous cell.
# NOTE(review): `nprocs` is presumably the number of worker processes and
# `print_result=True` presumably prints the summary shown in the cell output;
# confirm against `rxnrep.dataset.rdkit_utils`.
mapped = check_all_reactions_bonds_mapped(smiles_rxns,
                                          nprocs=6,
                                          print_result=True)

Total number of reactions: 200; reactions having bonds both atoms not mapped: 0; having bonds one atom not mapped  0; molecules cannot be converted: 0.
Done!
