In [1]:
import os
import re
import pandas as pd

In [None]:
def smi_tokenizer(smi: str) -> str:
    """
    Split a SMILES string (molecule or reaction) into space-separated tokens.

    Bracketed atoms (``[...]``), ``Br``/``Cl`` and two-digit ring labels
    (``%NN``) are each kept as a single token; every other recognized symbol
    becomes its own token. Characters that match none of the alternatives
    (whitespace, for instance) are skipped rather than reported.

    Args:
        smi: The SMILES text to tokenize.

    Returns:
        The recognized tokens joined by single spaces ('' if none matched).
    """
    token_regex = re.compile(
        r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|"
        r"\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|!|\$|"
        r"%[0-9]{2}|[0-9])"
    )
    # The whole pattern is one capture group, so group(1) is the full token.
    return ' '.join(hit.group(1) for hit in token_regex.finditer(smi))

def load_and_tokenize(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file and tokenize its SMILES columns.

    Args:
        file_path: Path of the CSV file; it must provide 'source' and
            'target' columns.

    Returns:
        The loaded DataFrame with 'source' and 'target' replaced by the
        output of ``smi_tokenizer``; all other columns are left as read.
    """
    frame = pd.read_csv(file_path)
    # Same treatment for the input side and the output side of each pair.
    for column in ('source', 'target'):
        frame[column] = frame[column].apply(smi_tokenizer)
    return frame

def export_to_opennmt(df: pd.DataFrame, export_path: str):
    """
    Write the tokenized SMILES of each split to plain-text files.

    Rows are selected by the value of the 'split' column ('train', 'test',
    'validation'). For every split the 'source' column goes to
    ``src-<name>.txt`` and the 'target' column to ``tgt-<name>.txt``, where
    <name> is 'train', 'test' or 'val'. Each file holds one example per line,
    in DataFrame row order, so line i of a src file pairs with line i of the
    matching tgt file. A split with no rows yields empty files.

    Lines are written verbatim instead of through ``to_csv``: the CSV writer
    wraps a value in double quotes when it contains a comma or a quote
    (possible inside a bracket atom such as ``[C,N]``) and writes an empty
    string as ``""``, which would change the text of the affected lines.

    Args:
        df: DataFrame with 'source', 'target' and 'split' columns.
        export_path: Directory to write into; created if it does not exist.
    """
    os.makedirs(export_path, exist_ok=True)

    # Value in the 'split' column -> suffix used in the output file names.
    split_suffixes = {'train': 'train', 'test': 'test', 'validation': 'val'}
    # DataFrame column -> prefix used in the output file names.
    column_prefixes = {'source': 'src', 'target': 'tgt'}

    for split_name, suffix in split_suffixes.items():
        subset = df[df['split'] == split_name]
        for column, prefix in column_prefixes.items():
            values = subset[column]
            out_file = os.path.join(export_path, f"{prefix}-{suffix}.txt")
            with open(out_file, 'w', encoding='utf-8') as handle:
                # Missing values become blank lines (as to_csv did) so the
                # src/tgt files keep the same number of lines.
                handle.writelines(
                    ("" if pd.isna(value) else str(value)) + "\n"
                    for value in values
                )

In [None]:
# Run configuration. `seed` selects which augmented run to convert (42 by
# default; change as needed) and is embedded in both directory names below.
seed = 42
# Directory where the augmented output CSV files for this seed are stored.
base_dir = f"data/augmented/seed-{seed}"
# Root directory under which the exported files for this seed are written.
save_dir = f"data/opennmt/seed-{seed}"

In [11]:
# Dataset variants to convert. Both mappings below are derived from this one
# list so that every input file is guaranteed to have an export directory
# under the same key.
dataset_keys = [
    "c1",
    "nc1",
    "r1",
    "a2",
    "a5",
    "a10",
    "a20",
    "a50",
    "npstereo",
    "ncnpstereo",
    "rp",
    "ncrp",
    "m65",
]

# Input CSV per variant: <base_dir>/<key>-<seed>.csv
datasets = {
    key: os.path.join(base_dir, f"{key}-{seed}.csv") for key in dataset_keys
}

# Export directory per variant: <save_dir>/<key>
export_dirs = {key: os.path.join(save_dir, key) for key in dataset_keys}

In [12]:
# Convert every configured dataset: read and tokenize its CSV, then write the
# per-split text files into the dataset's own export directory.
for key in datasets:
    file_path = datasets[key]
    print(f"Processing dataset '{key}' from {file_path}...")
    df = load_and_tokenize(file_path)
    out_dir = export_dirs[key]
    export_to_opennmt(df, out_dir)
    print(f"Exported '{key}' files to {out_dir}")

Processing dataset 'c1' from data/augmented/seed-42/c1-42.csv...
Exported 'c1' files to data/opennmt/seed-42/c1
Processing dataset 'nc1' from data/augmented/seed-42/nc1-42.csv...
Exported 'nc1' files to data/opennmt/seed-42/nc1
Processing dataset 'r1' from data/augmented/seed-42/r1-42.csv...
Exported 'r1' files to data/opennmt/seed-42/r1
Processing dataset 'a2' from data/augmented/seed-42/a2-42.csv...
Exported 'a2' files to data/opennmt/seed-42/a2
Processing dataset 'a5' from data/augmented/seed-42/a5-42.csv...
Exported 'a5' files to data/opennmt/seed-42/a5
Processing dataset 'a10' from data/augmented/seed-42/a10-42.csv...
Exported 'a10' files to data/opennmt/seed-42/a10
Processing dataset 'a20' from data/augmented/seed-42/a20-42.csv...
Exported 'a20' files to data/opennmt/seed-42/a20
Processing dataset 'a50' from data/augmented/seed-42/a50-42.csv...
Exported 'a50' files to data/opennmt/seed-42/a50
Processing dataset 'npstereo' from data/augmented/seed-42/npstereo-42.csv...
Exported 'n