In [None]:
"""
Example usage of chemrxn-cleaner on ORD data aggregated from a local dataset directory.
"""
from rdkit import RDLogger

# Disable all RDKit logs
RDLogger.DisableLog('rdApp.*')

from __future__ import annotations

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Dict, List, Tuple

from chemrxn_cleaner import clean_reactions
from chemrxn_cleaner import max_smiles_length
from chemrxn_cleaner import load_reactions
from chemrxn_cleaner import ReactionRecord
from chemrxn_cleaner import similarity_filter
from chemrxn_cleaner import export_reaction_records_to_json


DATA_ROOT = Path("/home/pyl/datasets/ord-data/data")


def get_ord_dataset_files(root: Path) -> List[Path]:
    """Collect every ORD dataset file found anywhere below ``root``.

    A path qualifies when it is a regular file and its combined suffixes,
    compared case-insensitively, end with ``.pb`` or ``.pb.gz``.

    Args:
        root: Directory that is searched recursively.

    Returns:
        The matching paths in ascending sorted order.
    """
    ord_extensions = (".pb", ".pb.gz")
    matches = [
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file()
        # Join all suffixes so that multi-part endings such as ".pb.gz" are seen as one.
        and "".join(candidate.suffixes).lower().endswith(ord_extensions)
    ]
    matches.sort()
    return matches


# Discover every ORD dataset file below DATA_ROOT and report how many exist.
ord_files_found = get_ord_dataset_files(DATA_ROOT)
print(f"Found {len(ord_files_found)} ORD dataset files under {DATA_ROOT}")
# Only the first discovered file is processed below.
# NOTE(review): presumably a shortcut to keep the example fast — drop the slice
# to process every discovered file.
dataset_files = ord_files_found[:1]
def load_and_clean_ord_file(
    path: Path,
    query_smiles: str = "Cn2cc(c1ccccc1)nn2",
    threshold: float = 0.2,
    max_length: int = 500,
) -> List[ReactionRecord]:
    """Load one ORD dataset file and run its reactions through the cleaning filters.

    The filter settings used to be hard-coded in the body; they are now keyword
    parameters whose defaults reproduce the previous behaviour, so existing
    single-argument callers are unaffected.

    Args:
        path: ORD dataset file to read; handed to ``load_reactions`` as a
            string together with ``fmt="ord"``.
        query_smiles: Query SMILES forwarded to ``similarity_filter``.
        threshold: Threshold forwarded to ``similarity_filter``.
        max_length: Limit forwarded to ``max_smiles_length``.

    Returns:
        The result of ``clean_reactions`` applied to the loaded reactions with
        the two filters above.
    """
    # Read ORD (Open Reaction Database) data from the local environment.
    rxn_ord = load_reactions(
        source=str(path),
        fmt="ord",
    )
    return clean_reactions(
        rxn_smiles_list=rxn_ord,
        filters=[
            max_smiles_length(max_length),
            similarity_filter(query_smiles=query_smiles, role='any', threshold=threshold),
        ],
    )


# Cleaned reactions from every file that loaded successfully, kept in the same
# order as `dataset_files` so repeated runs produce the same export.
rxn_processed: List[ReactionRecord] = []

if dataset_files:
    # Fall back to 4 when the CPU count cannot be determined.
    cpu_workers = os.cpu_count() or 4
    # Never start more threads than there are files to load.
    max_workers = max(1, min(len(dataset_files), cpu_workers * 2))
    print(f"Loading data with up to {max_workers} threads...")
    # Load each ORD file concurrently to keep pace with the large dataset.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [
            (path, executor.submit(load_and_clean_ord_file, path))
            for path in dataset_files
        ]
        # Collect results in submission order rather than completion order:
        # all tasks still run concurrently, but the order of `rxn_processed`
        # (and therefore of the exported JSON) no longer depends on thread timing.
        for path, future in submitted:
            try:
                reactions = future.result()
            except Exception as exc:
                # Best effort: report the failing file and carry on with the rest.
                print(f"Failed to load {path}: {exc}")
                continue
            rxn_processed.extend(reactions)
else:
    print(f"No ORD dataset files found in {DATA_ROOT}")

print(f"Total ORD reactions processed after the filter: {len(rxn_processed)}")

# The export is written even when no reactions were collected.
export_reaction_records_to_json(records=rxn_processed, path='./export.json')





Found 546 ORD dataset files under /home/pyl/datasets/ord-data/data
Loading data with up to 1 threads...
Total ORD reactions processed after the filter: 318


In [None]:
"""
Example usage of chemrxn-cleaner on USPTO data loaded from a local reaction SMILES (.rsmi) file.

"""
from rdkit import RDLogger

# Disable all RDKit logs
RDLogger.DisableLog('rdApp.*')

from __future__ import annotations
from chemrxn_cleaner import load_reactions
from chemrxn_cleaner import export_reaction_records_to_json

# Local USPTO reaction-SMILES file and the number of leading records to export.
uspto_rsmi_path = "/home/pyl/datasets/uspto/1976_Sep2016_USPTOgrants_smiles.rsmi"
n_export_records = 10

rxn_list = load_reactions(
    source=uspto_rsmi_path,
    fmt="uspto",
    keep_meta=True,
)

# Export only the first few records as a sample.
# NOTE(review): this writes to the same ./export.json as the ORD example cell,
# so running both cells leaves only this export on disk.
export_reaction_records_to_json(
    records=rxn_list[:n_export_records],
    path='./export.json',
)

