In [1]:
# NOTE(review): looks like a leftover kernel smoke test; nothing else in the visible
# notebook depends on this output. Confirm and consider removing the cell.
print("LKJ")

LKJ


In [2]:
import re
import pandas as pd
from rdkit import Chem
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Input directories, one per dataset variant. The process_* functions in this
# notebook read src-{train,val,test}.txt and tgt-{train,val,test}.txt from each.
# NOTE(review): absolute, user-specific paths; the notebook only runs where this tree exists.
USPTO_MIXED_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/MIT_mixed"
USPTO_SEP_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/MIT_separated"
USPTO_50_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_50"

In [4]:
# Output locations: each processed DataFrame is written to one of these files
# with DataFrame.to_pickle in the cells further down.
USPTO_MIXED_PICKLE_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_mixed.pickle"
USPTO_SEP_PICKLE_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_sep.pickle"
USPTO_50_PICKLE_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_50.pickle"

In [5]:
# **********************
# *** Util Functions ***
# **********************

In [6]:
def remove_whitespace(path):
    """Read a text file and return its non-blank lines with all whitespace removed.

    Args:
        path: ``pathlib.Path`` (or any object exposing ``read_text()``) of a
            text file with one record per line.

    Returns:
        list[str]: One string per line that still has content once whitespace
        is stripped, in file order.
    """
    mol_strs = []
    for line in path.read_text().split("\n"):
        # str.split() with no argument splits on any whitespace run, so tabs and
        # stray carriage returns are removed as well as plain spaces.
        compact = "".join(line.split())
        # Filter AFTER stripping: a whitespace-only line would otherwise turn
        # into "" and be passed downstream as if it were a real record.
        if compact:
            mol_strs.append(compact)
    return mol_strs

In [7]:
def create_mols(mol_strs):
    """Parse SMILES strings into molecules with ``Chem.MolFromSmiles`` on a thread pool.

    Args:
        mol_strs: Iterable of SMILES strings.

    Returns:
        list: One entry per input string, in input order, holding whatever
        ``Chem.MolFromSmiles`` returned. ``None`` entries are treated as parse
        failures and the offending strings are printed to stdout.
    """
    # Materialise once so the inputs can be paired with the results afterwards.
    mol_strs = list(mol_strs)

    # The context manager shuts the pool down (joining its worker threads) when
    # parsing finishes; previously the executor was created and never shut down.
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order.
        mols = list(executor.map(Chem.MolFromSmiles, mol_strs))

    err_strs = [mol_str for mol_str, mol in zip(mol_strs, mols) if mol is None]

    if err_strs:
        print("Could not construct mols for the following strings:")
        for err_str in err_strs:
            print(err_str)

    return mols

In [8]:
def create_mols_sep(mol_strs):
    """Parse ``reactants>reagents`` strings into two parallel tuples of molecules.

    Each input string is split on ``">"`` and both halves are parsed with
    ``Chem.MolFromSmiles`` on a thread pool.

    Args:
        mol_strs: Iterable of strings, each containing exactly one ``">"``.

    Returns:
        tuple: ``(react_mols, reag_mols)``, two tuples aligned with the input
        order. Either side may contain ``None`` where parsing failed; the
        offending input strings are printed to stdout. Empty input yields
        ``((), ())``.

    Raises:
        ValueError: If a string does not split into exactly two parts on ``">"``.
    """
    def process_mol_str(mol_str):
        # Parse one "reactants>reagents" record into a (reactant, reagent) pair.
        splits = mol_str.split(">")
        if len(splits) != 2:
            raise ValueError(f"Error with mol str: {mol_str}")

        reactants, reagents = splits
        return Chem.MolFromSmiles(reactants), Chem.MolFromSmiles(reagents)

    # Materialise once so the inputs can be paired with the results afterwards.
    mol_strs = list(mol_strs)

    # The context manager shuts the pool down (joining its worker threads);
    # previously the executor was created and never shut down.
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order and re-raises the first
        # worker exception when its result is reached.
        mols = list(executor.map(process_mol_str, mol_strs))

    err_strs = [
        mol_str
        for mol_str, (react, reag) in zip(mol_strs, mols)
        if react is None or reag is None
    ]

    if err_strs:
        print("Could not construct mols for the following strings:")
        for err_str in err_strs:
            print(err_str)

    # zip(*[]) produces nothing, so unpacking it into two names would raise;
    # return two empty tuples for empty input instead.
    if not mols:
        return (), ()

    react_mols, reag_mols = tuple(zip(*mols))
    return react_mols, reag_mols

In [9]:
def load_mols(path, sep=False, uspto_50=False):
    """Load one dataset text file and parse each line into molecules.

    Args:
        path: File to read; handed to ``remove_whitespace``.
        sep: When true, lines are parsed with ``create_mols_sep``.
        uspto_50: When true, the ``<RX_1>`` .. ``<RX_10>`` tokens are removed
            from every line and the token found at the start of the line is
            recorded as that line's reaction type (``None`` if there is none).

    Returns:
        * ``sep`` true: ``(react_mols, reag_mols)``. Reaction types are not
          returned on this path, even when ``uspto_50`` is also true.
        * ``uspto_50`` true and ``sep`` false: ``(mols, reaction_types)``.
        * otherwise: ``mols``.
    """
    smiles = remove_whitespace(path)
    reaction_types = None

    if uspto_50:
        token_re = re.compile("(<RX_6>|<RX_2>|<RX_1>|<RX_3>|<RX_7>|<RX_9>|<RX_5>|<RX_10>|<RX_4>|<RX_8>)")
        # match() only looks at the start of the string, whereas sub() below
        # removes the token wherever it occurs in the line.
        hits = [token_re.match(entry) for entry in smiles]
        reaction_types = [None if hit is None else hit[0] for hit in hits]
        smiles = [token_re.sub("", entry) for entry in smiles]

    if sep:
        reactant_mols, reagent_mols = create_mols_sep(smiles)
        return reactant_mols, reagent_mols

    parsed = create_mols(smiles)
    return (parsed, reaction_types) if uspto_50 else parsed

In [10]:
def build_df(reacts, prods, set_name, reags=None, reaction_types=None):
    """Assemble one split's columns into a DataFrame tagged with its set name.

    Args:
        reacts: Values for the ``reactants_mol`` column.
        prods: Values for the ``products_mol`` column.
        set_name: Value stored in every row of the ``set`` column.
        reags: Optional values for a ``reagents_mol`` column.
        reaction_types: Optional values for a ``reaction_type`` column.

    Returns:
        pandas.DataFrame: Columns in the order reactants, products, reagents
        (if given), reaction type (if given), set.
    """
    columns = {"reactants_mol": reacts, "products_mol": prods}

    # Optional columns are added only when supplied, keeping this order.
    optional_columns = (("reagents_mol", reags), ("reaction_type", reaction_types))
    for column_name, values in optional_columns:
        if values is not None:
            columns[column_name] = values

    frame = pd.DataFrame(data=columns)
    frame["set"] = set_name
    return frame

In [11]:
# ************************
# *** Process Datasets ***
# ************************

In [12]:
def process_uspto_mixed_dataset(path):
    """Load the three splits of the mixed dataset into a single DataFrame.

    Args:
        path: Directory (``pathlib.Path``) containing the ``src-*.txt`` and
            ``tgt-*.txt`` files for the train, val and test splits.

    Returns:
        pandas.DataFrame: Train, valid and test rows concatenated in that order
        with a fresh integer index. ``src`` files fill the reactants column and
        ``tgt`` files the products column.
    """
    # (source file, target file, label written to the "set" column)
    split_files = (
        ("src-train.txt", "tgt-train.txt", "train"),
        ("src-val.txt", "tgt-val.txt", "valid"),
        ("src-test.txt", "tgt-test.txt", "test"),
    )

    frames = []
    for src_name, tgt_name, set_label in split_files:
        reactants = load_mols(path / src_name)
        products = load_mols(path / tgt_name)
        frames.append(build_df(reactants, products, set_label))

    return pd.concat(frames, ignore_index=True)

In [13]:
# Process USPTO Mixed dataset
uspto_mixed_path = Path(USPTO_MIXED_PATH)
uspto_mixed_df = process_uspto_mixed_dataset(uspto_mixed_path)
print(f"Read {str(len(uspto_mixed_df.index))} rows from USPTO Mixed dataset.")

FileNotFoundError: [Errno 2] No such file or directory: '/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/MIT_mixed/src-train.txt'

In [None]:
# Preview the first rows of the mixed dataset as the cell's displayed output.
uspto_mixed_df.head()

In [14]:
# Persist the mixed dataset with DataFrame.to_pickle at USPTO_MIXED_PICKLE_PATH.
# NOTE(review): needs uspto_mixed_df from the processing cell; the saved run shows
# a NameError here because that cell had failed.
uspto_mixed_df.to_pickle(Path(USPTO_MIXED_PICKLE_PATH))

NameError: name 'uspto_mixed_df' is not defined

In [None]:
def process_uspto_sep_dataset(path):
    """Load the three splits of the separated dataset into a single DataFrame.

    The ``src`` files are loaded with ``sep=True``, so each yields reactants
    and reagents; the ``tgt`` files yield the products.

    Args:
        path: Directory (``pathlib.Path``) containing the ``src-*.txt`` and
            ``tgt-*.txt`` files for the train, val and test splits.

    Returns:
        pandas.DataFrame: Train, valid and test rows concatenated in that order
        with a fresh integer index, including a ``reagents_mol`` column.
    """
    # (source file, target file, label written to the "set" column)
    split_files = (
        ("src-train.txt", "tgt-train.txt", "train"),
        ("src-val.txt", "tgt-val.txt", "valid"),
        ("src-test.txt", "tgt-test.txt", "test"),
    )

    frames = []
    for src_name, tgt_name, set_label in split_files:
        reactants, reagents = load_mols(path / src_name, sep=True)
        products = load_mols(path / tgt_name)
        frames.append(build_df(reactants, products, set_label, reags=reagents))

    return pd.concat(frames, ignore_index=True)

In [None]:
# Process USPTO Separated dataset
uspto_sep_path = Path(USPTO_SEP_PATH)
uspto_sep_df = process_uspto_sep_dataset(uspto_sep_path)
print(f"Read {str(len(uspto_sep_df.index))} rows from USPTO Separated dataset.")

In [None]:
# Preview the first rows of the separated dataset as the cell's displayed output.
uspto_sep_df.head()

In [None]:
# Persist the separated dataset with DataFrame.to_pickle at USPTO_SEP_PICKLE_PATH.
uspto_sep_df.to_pickle(Path(USPTO_SEP_PICKLE_PATH))

In [None]:
def process_uspto_50_dataset(path):
    """Load the three splits of the USPTO 50K dataset into a single DataFrame.

    For this dataset the ``src`` file of each split is loaded as the products
    and supplies the reaction types, while the ``tgt`` file is loaded as the
    reactants (its reaction-type list is discarded).

    Args:
        path: Directory (``pathlib.Path``) containing the ``src-*.txt`` and
            ``tgt-*.txt`` files for the train, val and test splits.

    Returns:
        pandas.DataFrame: Train, valid and test rows concatenated in that order
        with a fresh integer index, including a ``reaction_type`` column.

    Raises:
        AssertionError: If the reaction types, reactants and products of a
            split do not all have the same length.
    """
    # (source file, target file, label written to the "set" column)
    split_files = (
        ("src-train.txt", "tgt-train.txt", "train"),
        ("src-val.txt", "tgt-val.txt", "valid"),
        ("src-test.txt", "tgt-test.txt", "test"),
    )

    frames = []
    for src_name, tgt_name, set_label in split_files:
        products, rxn_types = load_mols(path / src_name, uspto_50=True)
        reactants, _ = load_mols(path / tgt_name, uspto_50=True)
        # Guard against misaligned files before the columns are zipped together.
        assert len(rxn_types) == len(reactants) == len(products)
        frames.append(build_df(reactants, products, set_label, reaction_types=rxn_types))

    return pd.concat(frames, ignore_index=True)

In [None]:
# Process USPTO 50K dataset
uspto_50_path = Path(USPTO_50_PATH)
uspto_50_df = process_uspto_50_dataset(uspto_50_path)
print(f"Read {str(len(uspto_50_df.index))} rows from USPTO 50K dataset.")

In [None]:
# Preview the first rows of the USPTO 50K dataset as the cell's displayed output.
uspto_50_df.head()

In [None]:
# Persist the USPTO 50K dataset with DataFrame.to_pickle at USPTO_50_PICKLE_PATH.
uspto_50_df.to_pickle(Path(USPTO_50_PICKLE_PATH))

In [4]:
# Generate a text file for USPTO 50K test data for predict.py script

In [15]:
# Input pickle and output text file for the test-set export below.
# NOTE(review): USPTO_50_SAVED_PICKLE_PATH holds the same string as
# USPTO_50_PICKLE_PATH defined earlier in this notebook; consider reusing that constant.
USPTO_50_SAVED_PICKLE_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_50.pickle"
USPTO_50_TEST_TEXT_PATH = "/projects/cc/knlr326/1_NMR_project/1_NMR_data_AZ/1_old_models/Chemformer/data/seq-to-seq_datasets/uspto_50_test.txt"

In [16]:
# Reload the previously saved USPTO 50K frame instead of rebuilding it from text.
# NOTE(review): unpickling executes arbitrary code from the file; only load
# pickles from a trusted source.
uspto_50_df = pd.read_pickle(USPTO_50_SAVED_PICKLE_PATH)

In [17]:
# Preview the first rows of the reloaded frame as the cell's displayed output.
uspto_50_df.head()

Unnamed: 0,reactants_mol,products_mol,reaction_type,set
0,<rdkit.Chem.rdchem.Mol object at 0x7f2a0c3bb970>,<rdkit.Chem.rdchem.Mol object at 0x7f2a0ab304b0>,<RX_1>,train
1,<rdkit.Chem.rdchem.Mol object at 0x7f2a0fd174f0>,<rdkit.Chem.rdchem.Mol object at 0x7f2a0ab30530>,<RX_6>,train
2,<rdkit.Chem.rdchem.Mol object at 0x7f2a0c3c6030>,<rdkit.Chem.rdchem.Mol object at 0x7f2a0ab305b0>,<RX_9>,train
3,<rdkit.Chem.rdchem.Mol object at 0x7f2a0c3c6070>,<rdkit.Chem.rdchem.Mol object at 0x7f2a0ab30630>,<RX_6>,train
4,<rdkit.Chem.rdchem.Mol object at 0x7f2a0c3c60b0>,<rdkit.Chem.rdchem.Mol object at 0x7f2a0ab306b0>,<RX_1>,train


In [18]:
# Pull the reactant molecules and their set labels out as two plain lists;
# both come from the same frame, so they are aligned index-for-index.
reacts_mol = uspto_50_df["reactants_mol"].tolist()
sets = uspto_50_df["set"].tolist()

In [19]:
# Keep only the reactant molecules whose row is labelled as the test set.
reacts_mol_test = [mol for mol, set_label in zip(reacts_mol, sets) if set_label == "test"]

In [20]:
# Convert each test-set reactant molecule back to a SMILES string.
reacts_test = list(map(Chem.MolToSmiles, reacts_mol_test))
print(f"Length of test dataset: {len(reacts_test)}")

Length of test dataset: 5004


In [21]:
# One SMILES string per line; join() adds no newline after the last entry.
output_str = "\n".join(reacts_test)

In [22]:
# Write the test-set SMILES to USPTO_50_TEST_TEXT_PATH. Path.write_text returns
# the number of characters written, which is the value this cell displays.
p = Path(USPTO_50_TEST_TEXT_PATH)
p.write_text(output_str)

259311