In [None]:
"""
Example usage of chemrxn-cleaner on ORD data aggregated from a local dataset directory.

Source: https://github.com/open-reaction-database/ord-data/tree/main/data

"""
from __future__ import annotations

from pathlib import Path
from typing import List, Tuple

from rdkit import RDLogger

# Disable all RDKit logs
RDLogger.DisableLog("rdApp.*")

from chemrxn_cleaner import (
    CleaningStats,
    ReactionRecord,
    ElementFilterRule,
    clean_reactions,
    clean_reactions_with_report,
    export_reaction_records_to_json,
    load_reactions,
    max_smiles_length,
    similarity_filter,
    element_filter,
    has_product
)


def load_and_clean_ord_file(path: Path) -> Tuple[List[ReactionRecord], CleaningStats]:
    """Load an ORD dataset file and run it through the example filter chain.

    Args:
        path: Location of the ORD (Open Reaction Database) dataset in the
            local environment. It is converted with ``str()`` before being
            passed to ``load_reactions``.

    Returns:
        The result of ``clean_reactions_with_report`` for the loaded
        reactions; per the annotation, the retained records together with
        the cleaning statistics.
    """
    # Read the ORD (Open Reaction Database) data from the local environment.
    ord_reactions = load_reactions(source=str(path), fmt="ord")

    # Query structure handed to the similarity filter below.
    query_smiles = "Cn2cc(c1ccccc1)nn2"

    # Build the filters one by one, in the order they are applied.
    length_cap = max_smiles_length(500)
    element_check = element_filter(
        allowList=ElementFilterRule(["C", "H", "O", "Cl", "F", "N"], [], []),
        forbidList=ElementFilterRule([], [], []),
    )
    similar_to_query = similarity_filter(
        query_smiles=query_smiles, role="any", threshold=0.2
    )

    filter_chain = [length_cap, has_product, element_check, similar_to_query]
    return clean_reactions_with_report(
        rxn_smiles_list=ord_reactions,
        filters=filter_chain,
    )



# Wrap the sample location in a Path so the call matches the `path: Path`
# annotation of load_and_clean_ord_file (which stringifies it internally).
rxn_processed, stats = load_and_clean_ord_file(
    Path("../tests/resources/ord_dataset-sample.pb.gz")
)

print(f"Reaction cleaning stats: {stats}")

print(f"Total ORD reactions processed after the filter: {len(rxn_processed)}")
# Write the retained records to export.json in the current working directory.
export_reaction_records_to_json(records=rxn_processed, path="./export.json")


In [None]:
"""
Example usage of chemrxn-cleaner on USPTO data aggregated from a local dataset directory.

Source: https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873

"""
from __future__ import annotations

from chemrxn_cleaner import (
    ReactionRecord,
    clean_reactions,
    load_reactions,
)

from rdkit import RDLogger

# Disable all RDKit logs
RDLogger.DisableLog("rdApp.*")


# Sample USPTO reaction file, given relative to the current working directory.
USPTO_SOURCE = "../tests/resources/uspto_dataset-sample-small.rsmi"

# Load the raw reactions with the "uspto" format reader.
# NOTE(review): keep_meta=True presumably retains per-reaction metadata — confirm
# against the load_reactions documentation.
rxn_list = load_reactions(
    source=USPTO_SOURCE,
    fmt="uspto",
    keep_meta=True,
)
print(f"Loaded {len(rxn_list)} raw reactions from {USPTO_SOURCE}")

# Clean the loaded reactions; no filters argument is passed here, so whatever
# default behavior clean_reactions has is used.
rxn_filtered = clean_reactions(rxn_smiles_list=rxn_list)

In [None]:
"""
Example usage of chemrxn-cleaner on JSON data aggregated from a local dataset directory.

Source data: https://github.com/msaebi1993/yield-rxn/tree/master/data/dy/raw

"""
from typing import Any, Dict

from chemrxn_cleaner import load_reactions
from chemrxn_cleaner.parser import parse_reaction_smiles

# Sample JSON dataset, given relative to the current working directory.
SOURCE_PATH = "../tests/resources/json_dataset-sample.json"



def to_reaction_record(entry: Dict[str, Any]) -> ReactionRecord:
    """Map one raw JSON entry onto a ReactionRecord.

    Entries that carry a ``reaction_smiles`` key are handed to
    ``ReactionRecord.from_dict`` as-is. Any other entry is read as a
    component-wise description: an optional ``reactants`` list of
    ``{"smiles": ...}`` dicts plus ``base`` and ``product`` dicts with a
    ``smiles`` key. A ``reactants>reagents>products`` reaction SMILES is
    assembled from those parts and parsed with ``strict=False``; the
    component lists are then written back onto the record.

    Args:
        entry: Decoded JSON object describing a single reaction.

    Returns:
        The record, with ``source`` (falling back to ``"json"``),
        ``reaction_id`` (from ``reaction_id``, then ``id``, then ``""``) and
        ``extra_metadata`` (merged with ``entry["meta"]``) filled in.

    Raises:
        KeyError: If an entry without ``reaction_smiles`` lacks ``base`` or
            ``product``, or one of them lacks ``smiles``.
    """
    if "reaction_smiles" in entry:
        rec = ReactionRecord.from_dict(entry)
    else:
        # `or []` also covers an explicit JSON null, which .get() would
        # otherwise return as None and make the iteration fail.
        reactants = [str(s["smiles"]) for s in entry.get("reactants") or []]
        reagents = [entry["base"]["smiles"]]
        products = [entry["product"]["smiles"]]
        rxn_smiles = ">".join(
            [".".join(reactants), ".".join(reagents), ".".join(products)]
        )
        rec = parse_reaction_smiles(rxn_smiles, strict=False)
        rec.reactants = reactants
        rec.reagents = reagents
        rec.products = products
        # Store a copy: sharing one list object between `reagents` and
        # `bases` would make a later in-place edit of one silently change
        # the other.
        rec.bases = list(reagents)
    rec.source = entry.get("source", "json") or "json"
    rec.reaction_id = entry.get("reaction_id", entry.get("id", ""))
    # "meta" may be missing or explicitly null; dict.update(None) raises
    # TypeError, so fall back to an empty mapping in both cases.
    rec.extra_metadata.update(entry.get("meta") or {})
    return rec


# Load the JSON sample, passing to_reaction_record as the mapper for raw entries.
rxn_records = load_reactions(source=SOURCE_PATH, fmt="json", mapper=to_reaction_record)
print(f"Parsed {len(rxn_records)} ReactionRecord objects")
# Inspect the first record. NOTE(review): assuming rxn_records is a list, this
# indexing raises IndexError when nothing was parsed.
print(rxn_records[0].to_dict())
rxn_records[0].show()

In [None]:
"""
Example usage of chemrxn-cleaner on CSV data aggregated from a local dataset directory.

Source data: 

    * https://ibm.ent.box.com/v/ReactionSeq2SeqDataset (Ref: https://github.com/pschwllr/MolecularTransformer?tab=readme-ov-file)



"""
from typing import Optional

from chemrxn_cleaner import load_reactions

# Sample delimited dataset, given relative to the current working directory.
USPTO_STEREO_PATH = "../tests/resources/csv_dataset-sample.csv"


def _to_float(value: Any) -> Optional[float]:
    try:
        text = str(value).replace("%", "").strip()
        if not text:
            return None
        return float(text)
    except Exception:
        return None



def map_row(record: ReactionRecord, row: Dict[str, Any]) -> ReactionRecord:
    """Copy provenance and yield information from a raw row onto its record.

    Args:
        record: Record to annotate; it is modified in place.
        row: Raw row values keyed by column name. The ``PatentNumber`` and
            ``CalculatedYield`` columns are consulted.

    Returns:
        The same ``record`` object, with ``source`` set to
        ``"uspto_stereo"``, ``source_ref`` set to the patent number (or
        ``None`` when it is missing or empty) and ``yield_value`` set only
        when the row carries a non-empty calculated yield.
    """
    record.source = "uspto_stereo"

    patent_number = row.get("PatentNumber")
    record.source_ref = patent_number if patent_number else None

    raw_yield = row.get("CalculatedYield")
    if raw_yield:
        # Leave any existing yield_value untouched when the column is empty.
        record.yield_value = _to_float(raw_yield)

    return record


# Load the sample with the "csv" reader. The per-role column lists are left
# empty and the whole reaction is taken from the "OriginalReaction" column;
# fields are tab-separated even though the file has a .csv extension.
# NOTE(review): skip_lines=2 presumably skips two leading lines before the
# header — confirm against the load_reactions documentation.
# map_row is passed as the mapper to annotate each record from its row.
uspto_stereo = load_reactions(
    source=USPTO_STEREO_PATH,
    fmt="csv",
    reactant_columns=[],
    reagent_columns=[],
    product_columns=[],
    reaction_smiles_column="OriginalReaction",
    delimiter="\t",
    skip_lines=2,
    mapper=map_row,
)
print(f"Parsed {len(uspto_stereo)} ReactionRecord objects from uspto stereo")
# Inspect the first record. NOTE(review): assuming uspto_stereo is a list, this
# indexing raises IndexError when nothing was parsed.
print(uspto_stereo[0].to_dict())
uspto_stereo[0].show()