# Clean the NREL BDE dataset 

https://chemrxiv.org/articles/Prediction_of_Homolytic_Bond_Dissociation_Enthalpies_for_Organic_Molecules_at_near_Chemical_Accuracy_with_Sub-Second_Computational_Cost/10052048/2

- Add atom map numbers to the one-bond-break reactions.

In [1]:
import json
import multiprocessing
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rxnrep.core.molecule import MoleculeError
from rxnrep.core.reaction import ReactionError, smiles_to_reaction
from rxnrep.data.grapher import (
    get_atom_distance_to_reaction_center,
    get_bond_distance_to_reaction_center,
)
from rxnrep.core.utils import generate_atom_map_number_one_bond_break_reaction
from rxnrep.utils import to_path

RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress rdkit warnings

Using backend: pytorch


In [2]:
def split_train_val_test(data: List, val_ratio=0.1, test_ratio=0.1, random_seed=35):
    """
    Randomly split a dataset into training, validation, and test sets.

    Args:
        data: items to split; only ``len(data)`` and integer indexing are used.
        val_ratio: fraction of the data placed in the validation set.
        test_ratio: fraction of the data placed in the test set.
        random_seed: seed for the shuffle. If ``None``, the global NumPy random
            state is used (and advanced), so the split is not reproducible.

    Returns:
        ``(train_set, val_set, test_set)`` as three lists. The validation and
        test sizes are ``int(len(data) * ratio)`` (rounded down); the training
        set receives all remaining items.

    Raises:
        AssertionError: if ``val_ratio + test_ratio`` is not below 1.
    """
    assert val_ratio + test_ratio < 1.0, "validation + test >= 1"
    size = len(data)
    num_val = int(size * val_ratio)
    num_test = int(size * test_ratio)
    num_train = size - num_val - num_test

    if random_seed is not None:
        # A private generator gives the same permutation that seeding the
        # global state would, without reseeding NumPy's global RNG behind the
        # caller's back.
        idx = np.random.RandomState(random_seed).permutation(size)
    else:
        idx = np.random.permutation(size)

    train_idx = idx[:num_train]
    val_idx = idx[num_train : num_train + num_val]
    test_idx = idx[num_train + num_val :]

    train_set = [data[i] for i in train_idx]
    val_set = [data[i] for i in val_idx]
    test_set = [data[i] for i in test_idx]

    return train_set, val_set, test_set

In [3]:
def check_convert_to_mp_core_reaction(smi_rxn):
    """
    Try to build a core reaction from a SMILES reaction and validate it.

    Returns a ``(value, error)`` pair. On success, ``value`` is a dict with the
    input SMILES under ``"smi_rxn"`` and the reaction object under
    ``"core_rxn"``, and ``error`` is ``None``. On failure, ``value`` is ``None``
    and ``error`` is a message string describing the problem.
    """
    prefix = "Fail converting to core reaction: "

    # Stage 1: build the reaction and run its consistency checks.
    try:
        reaction = smiles_to_reaction(
            smi_rxn,
            id=smi_rxn,
            ignore_reagents=True,
            remove_H=False,
            sanity_check=False,
        )
        reaction.check_charge()
        reaction.check_composition()
        reaction.check_atom_map_number()
    except (MoleculeError, ReactionError) as err:
        return None, prefix + str(err)

    # Stage 2: make sure the distance-to-reaction-center labels can be computed.
    try:
        for labeler in (
            get_atom_distance_to_reaction_center,
            get_bond_distance_to_reaction_center,
        ):
            labeler(reaction)
    except AssertionError as err:
        return None, prefix + str(err)

    return {"smi_rxn": smi_rxn, "core_rxn": reaction}, None

In [4]:
def runner(reactions, reaction_ids, func, nprocs=1):
    """
    Apply ``func`` to every reaction and sort the outcomes into successes and failures.

    ``func`` must return a ``(value, error)`` pair; an ``error`` of ``None``
    marks a success. With ``nprocs == 1`` the calls run in this process,
    otherwise they are distributed over a ``multiprocessing.Pool``.

    Returns:
        ``(succeeded, succeeded_ids, failed, failed_ids)``: the values of the
        successful calls, their ids, the error objects of the failed calls, and
        their ids. Ids are taken from ``reaction_ids`` by position.
    """
    if nprocs != 1:
        with multiprocessing.Pool(nprocs) as pool:
            outcomes = pool.map(func, reactions)
    else:
        outcomes = list(map(func, reactions))

    ok_values, ok_ids = [], []
    errors, error_ids = [], []
    for position, (value, error) in enumerate(outcomes):
        rid = reaction_ids[position]
        if error is None:
            ok_values.append(value)
            ok_ids.append(rid)
        else:
            errors.append(error)
            error_ids.append(rid)

    return ok_values, ok_ids, errors, error_ids

### Read SMILES reactions

In [5]:
def read_file(filename):
    """
    Read one-bond-break reactions and their ``bde`` values from a CSV file.

    The file must have a header row providing the columns ``molecule``,
    ``fragment1``, ``fragment2`` and ``bde``. Rows describing the same reaction
    (same molecule and the same pair of fragments, in either order) are
    de-duplicated; the first occurrence is kept and row order is preserved.

    Args:
        filename: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        ``(reactions, bdes)``: a list of reaction SMILES of the form
        ``"molecule>>fragment1.fragment2"`` and the list of the corresponding
        ``bde`` column values. Both are empty if the file has no data rows.
    """
    df = pd.read_csv(filename, header=0, index_col=None)

    # Mark the first occurrence of every reaction. A set gives O(1) membership
    # tests, and filtering with a boolean mask keeps the columns intact even
    # when no rows survive.
    seen = set()
    keep = []
    for molecule, frag1, frag2 in zip(
        df["molecule"], df["fragment1"], df["fragment2"]
    ):
        # sort the fragments so that A>>B.C and A>>C.B count as the same reaction
        key = (molecule, tuple(sorted((frag1, frag2))))
        keep.append(key not in seen)
        seen.add(key)
    df = df.loc[pd.Series(keep, index=df.index, dtype=bool)]

    reactions = (
        df["molecule"] + ">>" + df["fragment1"] + "." + df["fragment2"]
    ).to_list()
    bdes = df["bde"].to_list()

    return reactions, bdes

In [6]:
# Input CSV; it must provide the columns that read_file reads.
# NOTE(review): hard-coded absolute path -- adjust for your machine.
filename = "/Users/mjwen/Documents/Dataset/NREL_BDE/rdf_data_190531_n200.csv"
# Alternative input (presumably the full dataset -- TODO confirm):
# filename = "/Users/mjwen/Documents/Dataset/NREL_BDE/rdf_data_190531.csv"

# `path` is reused by later cells to derive the names of the output files.
path = to_path(filename)
smiles_rxns, bdes = read_file(filename)
# Positional ids (0..N-1) of the de-duplicated reactions; later cells use them
# to index back into `smiles_rxns` and `bdes`.
reaction_ids = list(range(len(smiles_rxns)))

# Display the first reaction as a quick sanity check (notebook cell output).
smiles_rxns[0]

'NCCCC(=O)O>>[CH2]CCC(=O)O.[NH2]'

### generate atom map number

In [7]:
def gen_atom_mapping(smiles_rxn):
    """
    Add atom map numbers to a one-bond-break SMILES reaction.

    Returns a ``(value, error)`` pair: the result of
    ``generate_atom_map_number_one_bond_break_reaction`` and ``None`` on
    success, or ``None`` and the exception message if the call raised.
    """
    try:
        mapped = generate_atom_map_number_one_bond_break_reaction(
            smiles_rxn, add_H=True
        )
    except Exception as exc:
        # Any failure is reported to the caller as text instead of propagating.
        return None, str(exc)
    return mapped, None


if __name__ == "__main__":
    # Select the "fork" start method for the worker pool (force=True overrides
    # any start method that was set earlier).
    multiprocessing.set_start_method("fork", force=True)

    # Stage 0: add atom map numbers to every reaction, using 4 processes.
    succeeded0, succeeded0_ids, failed0, failed0_ids = runner(
        reactions=smiles_rxns,
        reaction_ids=reaction_ids,
        func=gen_atom_mapping,
        nprocs=4,
    )

print(f"number succeeded: {len(succeeded0)}")
print(f"number failed: {len(failed0)}")

number succeeded: 128
number failed: 0


### check reactions by converting to core Reaction

In [8]:
if __name__ == "__main__":
    # Select the "fork" start method for the worker pool (force=True overrides
    # any start method that was set earlier).
    multiprocessing.set_start_method("fork", force=True)

    # Stage 1: validate the atom-mapped reactions from stage 0, keeping their
    # original ids, using 4 processes.
    succeeded1, succeeded1_ids, failed1, failed1_ids = runner(
        reactions=succeeded0,
        reaction_ids=succeeded0_ids,
        func=check_convert_to_mp_core_reaction,
        nprocs=4,
    )

print(f"number succeeded: {len(succeeded1)}")
print(f"number failed: {len(failed1)}")

number succeeded: 127
number failed: 1


In [9]:
# Merge the failures of both processing stages (errors and ids stay aligned).
failed = [*failed0, *failed1]
failed_ids = [*failed0_ids, *failed1_ids]

# Attach the original reaction index to every reaction that passed both stages.
succeeded = [
    {"smi_rxn": rxn["smi_rxn"], "core_rxn": rxn["core_rxn"], "idx": i}
    for rxn, i in zip(succeeded1, succeeded1_ids)
]

### write succeeded (with original smi input)

In [10]:
# Tab-separated report of the reactions that passed, written next to the input
# file: original index, input SMILES, and processed SMILES.
fname = path.parent / f"{path.stem}_succeeded{path.suffix}"
with open(fname, "w") as f:
    print("index", "original_reaction", "canonical_reaction", sep="\t", file=f)
    for entry in succeeded:
        raw_idx = entry["idx"]
        print(raw_idx, smiles_rxns[raw_idx], entry["smi_rxn"], sep="\t", file=f)

### write failed (with original smi input)

In [11]:
# Tab-separated report of the reactions that failed, written next to the input
# file: original index, input SMILES, and the error message.
fname = path.parent / f"{path.stem}_failed{path.suffix}"
with open(fname, "w") as f:
    print("index", "original_reaction", "error", sep="\t", file=f)
    for raw_idx, message in zip(failed_ids, failed):
        print(raw_idx, smiles_rxns[raw_idx], message, sep="\t", file=f)

### write dataset for training 

In [12]:
def write_dataset_for_training(
    data: List, reaction_energy: List, fname
):
    """
    Write a dataset split to a tab-separated file.

    Args:
        data: dicts providing ``"idx"`` (index into ``reaction_energy``) and
            ``"smi_rxn"`` (reaction SMILES).
        reaction_energy: energies indexed by the ``"idx"`` of each item.
        fname: output target passed to ``DataFrame.to_csv``.

    The output has the columns ``reaction``, ``reaction energy`` and ``raw id``.
    """
    raw_ids = [entry["idx"] for entry in data]
    table = pd.DataFrame(
        {
            "reaction": [entry["smi_rxn"] for entry in data],
            "reaction energy": [reaction_energy[i] for i in raw_ids],
            "raw id": raw_ids,
        }
    )
    table.to_csv(fname, sep="\t", index=False)

In [13]:
# generate dataset for training

# Split the validated reactions with the default ratios and seed, then write
# each split next to the input file.
train_set, val_set, test_set = split_train_val_test(succeeded)

tr_fname = path.parent / f"{path.stem}_processed_train.tsv"
write_dataset_for_training(data=train_set, reaction_energy=bdes, fname=tr_fname)

val_fname = path.parent / f"{path.stem}_processed_val.tsv"
write_dataset_for_training(data=val_set, reaction_energy=bdes, fname=val_fname)

te_fname = path.parent / f"{path.stem}_processed_test.tsv"
write_dataset_for_training(data=test_set, reaction_energy=bdes, fname=te_fname)