In [2]:
from synrfp import synrfp
from typing import Optional, List

def fps(
    rxn: str,
    tokenizer: Optional[str] = "wl",
    radius: Optional[int] = 2,
    sketch: Optional[str] = "parity",
    mode: Optional[str] = "delta",
    bits: Optional[int] = 2048,
    seed: Optional[int] = 42,
    node_attrs: Optional[List[str]] = ['element', 'hcount', 'aromatic', "in_ring", "isomer", "hybridization"],
    edge_attrs: Optional[List[str]] = ['order', "ez_isomer", "conjugated", "in_ring"],
):
    """
    Compute a SynRFP fingerprint for a reaction SMILES.

    This helper is a thin wrapper around your `synrfp(...)` API.  Any argument
    left as `None` will fall back to the library/default behaviour.

    -----------------------------------------------------------------------
    Common (recommended) choices you can pass for the configurable arguments
    -----------------------------------------------------------------------
    tokenizer (str)
        - "wl"     : Weisfeiler–Lehman subtree labels (graph-native, default)
        - "nauty"  : Nauty/Traces canonical ego-subgraphs (canonical graphs)
        - "morgan" : Morgan / circular (ECFP-like) tokeniser (RDKit)
        - "path"   : Path-based tokenizer
        - other tokenizer class-names supported by your synrfp installation

    radius (int)
        - Typical values: 0, 1, 2, 3, 4
        - Recommended: 2 (good trade-off), 3 (more expressive, may overfit)
        - Must be a non-negative integer.

    sketch (str)
        - "parity"  : ParityFold / XOR-fold binary sketch (fast, good for ML)
        - "minhash" : MinHash sketch (Jaccard similarity friendly)
        - "cws"     : CWSketch (weighted minhash / weighted Jaccard)
        - "srp"     : SRPSketch / signed random projection (for cosine-like)
        - other sketch names supported by your synrfp installation

    mode (str)
        - "delta" : signed difference (P - R). Direction-aware; default for many tasks.
        - "union" : union/presence mode (R + P counts treated as positive weights).

    bits (int)
        - Common sizes: 64, 128, 256, 512, 1024, 2048, 4096
        - Choose based on collision tolerance / model capacity (1024 is a common sweet spot).

    seed (int)
        - Determinism seed for hashing/sketching. Typical default: 42.

    node_attrs (list[str] | None)
        - Typical node attributes: ["element", "hcount", "aromatic", "in_ring",
                                   "isomer", "hybridization", "formal_charge"]
        - If None, the tokenizer's defaults are used.

    edge_attrs (list[str] | None)
        - Typical edge attributes: ["order", "ez_isomer", "conjugated", "in_ring", "stereo"]
        - If None, the tokenizer's defaults are used.

    -----------------------------------------------------------------------
    Behaviour
    -----------------------------------------------------------------------
    - Any argument explicitly set to `None` will use library defaults.
    - The function returns the fingerprint object exactly as produced by
      `synrfp(...)` (commonly a list of 0/1 bits for parity sketches).
    - Use `mode="delta"` for direction-sensitive tasks (mechanistic classification);
      use `mode="union"` when you want presence-only features.
    - Increasing `radius` or `bits` increases expressiveness and memory / collision
      resistance respectively — but may reduce generalization if set too high.

    Example
    -------
    >>> # use defaults for everything except rxn string
    >>> fp = fps("CCO.O>>CC(=O)O")
    >>> # specify a few options
    >>> fp = fps("CCO>>CCO", tokenizer="morgan", radius=2, sketch="parity", bits=2048)

    -----------------------------------------------------------------------
    Notes
    -----------------------------------------------------------------------
    - For a programmatic list of available tokenizers/sketchers (on your system)
      you can probe the synrfp package or consult your environment. Typical
      installations include at least "wl" and "parity".
    - If you want metadata (delta counts, raw tokens) returned alongside the
      fingerprint, call the lower-level `SynRFP`/`SynRFPResult` APIs directly.
    """
    return synrfp(
        rxn,
        tokenizer=tokenizer,
        radius=radius if radius is not None else 2,
        sketch=sketch,
        mode=mode,
        bits=bits if bits is not None else 1024,
        seed=seed if seed is not None else 42,
        node_attrs=node_attrs,
        edge_attrs=edge_attrs,
    )

In [None]:
'''
Có mấy này cần chạy nè
radius: 1,2,3
Bits: 512, 1024, 2028
2 cái trên pairwise với nhau đi cho đỡ chạy nhiều
tokenizer: wl, path, morgan
Sketch: parity, minhash, cw
mode: delta, union
3*3*3*2
'''
radius = [1,2,3]
bits = [512,1024, 2048]
rb = [[radius[i], bits[i]] for i in range(len(radius))]
tokenizer = ['wl', 'path', 'morgan']
sketch = ['parity', 'minhash', 'cw']
mode = ['delta','union']


In [11]:
import pandas as pd 
name_csv = 'Syntemp_cluster'
df = pd.read_csv(f'data/raw/{name_csv}.csv.gz',compression='gzip')
df.head()

Unnamed: 0,R_ID,RSMI,New_R0,Split_R0,New_R1,Split_R1,New_R2,Split_R2
0,R_0,COC(=O)C(CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(OC)c...,0,train,0,train,0,train
1,R_1,Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...,1,train,13,train,6,test
2,R_4,CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...,2,train,45,test,58,train
3,R_5,Cn1cnc(-c2cc(C#N)ccn2)c1Br.OB(O)c1ccc(-n2cccn2...,3,train,90,train,679,train
4,R_6,CC1(C)OB(c2ccc(OCc3ccc4ccccc4n3)cc2)OC1(C)C.N#...,3,train,91,test,679,train


In [12]:
df_1 = df.loc[:,['RSMI','New_R0','Split_R0']]
print(df.shape)
print(df_1.shape)
print(df_1.head())
df_1.to_csv(f'data/raw/{name_csv}_0.csv.gz', compression='gzip')

(43441, 8)
(43441, 3)
                                                RSMI  New_R0 Split_R0
0  COC(=O)C(CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(OC)c...       0    train
1  Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...       1    train
2  CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...       2    train
3  Cn1cnc(-c2cc(C#N)ccn2)c1Br.OB(O)c1ccc(-n2cccn2...       3    train
4  CC1(C)OB(c2ccc(OCc3ccc4ccccc4n3)cc2)OC1(C)C.N#...       3    train


In [13]:
df_2 = df.loc[:,['RSMI','New_R1','Split_R1']]
print(df.shape)
print(df_2.shape)
print(df_2.head())
df_2.to_csv(f'data/raw/{name_csv}_1.csv.gz', compression='gzip')

(43441, 8)
(43441, 3)
                                                RSMI  New_R1 Split_R1
0  COC(=O)C(CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(OC)c...       0    train
1  Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...      13    train
2  CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...      45     test
3  Cn1cnc(-c2cc(C#N)ccn2)c1Br.OB(O)c1ccc(-n2cccn2...      90    train
4  CC1(C)OB(c2ccc(OCc3ccc4ccccc4n3)cc2)OC1(C)C.N#...      91     test


In [14]:
df_3 = df.loc[:,['RSMI','New_R2','Split_R2']]
print(df.shape)
print(df_3.shape)
print(df_3.head())
df_3.to_csv(f'data/raw/{name_csv}_2.csv.gz', compression='gzip')

(43441, 8)
(43441, 3)
                                                RSMI  New_R2 Split_R2
0  COC(=O)C(CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(OC)c...       0    train
1  Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...       6     test
2  CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...      58    train
3  Cn1cnc(-c2cc(C#N)ccn2)c1Br.OB(O)c1ccc(-n2cccn2...     679    train
4  CC1(C)OB(c2ccc(OCc3ccc4ccccc4n3)cc2)OC1(C)C.N#...     679    train


In [39]:
import pandas as pd 

df = pd.read_csv('data/raw/USPTO_TPL_unbalanced.csv.gz', compression='gzip')
df.head()

Unnamed: 0,rxn,rxn_class,split
0,CC(C)O.COc1ccc2nccc([C@@H](O)CC[C@@H]3CCN(C4CC...,366,test
1,CCCCSc1cc(C(=O)O)cc(S(N)(=O)=O)c1Cc1ccccc1.CCO...,47,test
2,CCCCCCCCCCCCCCCCCCOc1ccc(C(=O)C(=O)N(CC(=O)OCC...,255,test
3,CC(C)(C)OC(=O)N1CCN(C(=O)C(F)(F)F)CC1.ClCCl.O=...,575,test
4,CC1(C)CCCCC(C)(C)C1=O.CCOCC.O.O=S(=O)(O)O.[Al+...,91,test


In [40]:
df_new = pd.DataFrame({'rxn':df['rxn'],
                       'y': df['rxn_class'],
                       'split': df['split']})
print(df_new.shape)
df_new.head(2)

(445115, 3)


Unnamed: 0,rxn,y,split
0,CC(C)O.COc1ccc2nccc([C@@H](O)CC[C@@H]3CCN(C4CC...,366,test
1,CCCCSc1cc(C(=O)O)cc(S(N)(=O)=O)c1Cc1ccccc1.CCO...,47,test


In [41]:
df_new.to_csv('data/raw_new/USPTO_TPL_unbalanced.csv.gz',compression='gzip')

In [4]:
from pathlib import Path

folder_path = Path("data/raw_new")

# Get all files (ignoring folders inside)
files = [f.name for f in folder_path.iterdir() if f.is_file()]
files

['claire_full_1.csv.gz',
 'claire_full_2.csv.gz',
 'claire_full_3.csv.gz',
 'schneider50k_balanced.csv.gz',
 'schneider50k_unbalanced.csv.gz',
 'Syntemp_cluster_0.csv.gz',
 'Syntemp_cluster_1.csv.gz',
 'Syntemp_cluster_2.csv.gz',
 'USPTO_50k_balanced.csv.gz',
 'USPTO_50k_unbalanced.csv.gz',
 'USPTO_TPL_balanced.csv.gz',
 'USPTO_TPL_unbalanced.csv.gz']

In [55]:
import os
import glob
from sklearn.model_selection import train_test_split

source_folder = 'data/raw_new'
file_extension = '*.csv.gz'
search_path = os.path.join(source_folder, file_extension)
all_files = glob.glob(search_path)

all_files = [i.replace('\\','/') for i in all_files]
for data_path in all_files:
    df = pd.read_csv(data_path)
    _,df_new = train_test_split(df,test_size=200, random_state=42,stratify=df['split'])
    new_path = data_path.replace('raw_new', 'raw_trial')
    df_new.to_csv(new_path,compression='gzip')


In [1]:
from synrfp import synrfp
from typing import Optional, List

def fps(
    rxn: str,
    tokenizer: Optional[str] = "wl",
    radius: Optional[int] = 2,
    sketch: Optional[str] = "parity",
    mode: Optional[str] = "delta",
    bits: Optional[int] = 2048,
    seed: Optional[int] = 42,
    node_attrs: Optional[List[str]] = ['element', 'hcount', 'aromatic', "in_ring", "isomer", "hybridization"],
    edge_attrs: Optional[List[str]] = ['order', "ez_isomer", "conjugated", "in_ring"],
):
    """
    Compute a SynRFP fingerprint for a reaction SMILES.

    This helper is a thin wrapper around your `synrfp(...)` API.  Any argument
    left as `None` will fall back to the library/default behaviour.

    -----------------------------------------------------------------------
    Common (recommended) choices you can pass for the configurable arguments
    -----------------------------------------------------------------------
    tokenizer (str)
        - "wl"     : Weisfeiler–Lehman subtree labels (graph-native, default)
        - "nauty"  : Nauty/Traces canonical ego-subgraphs (canonical graphs)
        - "morgan" : Morgan / circular (ECFP-like) tokeniser (RDKit)
        - "path"   : Path-based tokenizer
        - other tokenizer class-names supported by your synrfp installation

    radius (int)
        - Typical values: 0, 1, 2, 3, 4
        - Recommended: 2 (good trade-off), 3 (more expressive, may overfit)
        - Must be a non-negative integer.

    sketch (str)
        - "parity"  : ParityFold / XOR-fold binary sketch (fast, good for ML)
        - "minhash" : MinHash sketch (Jaccard similarity friendly)
        - "cws"     : CWSketch (weighted minhash / weighted Jaccard)
        - "srp"     : SRPSketch / signed random projection (for cosine-like)
        - other sketch names supported by your synrfp installation

    mode (str)
        - "delta" : signed difference (P - R). Direction-aware; default for many tasks.
        - "union" : union/presence mode (R + P counts treated as positive weights).

    bits (int)
        - Common sizes: 64, 128, 256, 512, 1024, 2048, 4096
        - Choose based on collision tolerance / model capacity (1024 is a common sweet spot).

    seed (int)
        - Determinism seed for hashing/sketching. Typical default: 42.

    node_attrs (list[str] | None)
        - Typical node attributes: ["element", "hcount", "aromatic", "in_ring",
                                   "isomer", "hybridization", "formal_charge"]
        - If None, the tokenizer's defaults are used.

    edge_attrs (list[str] | None)
        - Typical edge attributes: ["order", "ez_isomer", "conjugated", "in_ring", "stereo"]
        - If None, the tokenizer's defaults are used.

    -----------------------------------------------------------------------
    Behaviour
    -----------------------------------------------------------------------
    - Any argument explicitly set to `None` will use library defaults.
    - The function returns the fingerprint object exactly as produced by
      `synrfp(...)` (commonly a list of 0/1 bits for parity sketches).
    - Use `mode="delta"` for direction-sensitive tasks (mechanistic classification);
      use `mode="union"` when you want presence-only features.
    - Increasing `radius` or `bits` increases expressiveness and memory / collision
      resistance respectively — but may reduce generalization if set too high.

    Example
    -------
    >>> # use defaults for everything except rxn string
    >>> fp = fps("CCO.O>>CC(=O)O")
    >>> # specify a few options
    >>> fp = fps("CCO>>CCO", tokenizer="morgan", radius=2, sketch="parity", bits=2048)

    -----------------------------------------------------------------------
    Notes
    -----------------------------------------------------------------------
    - For a programmatic list of available tokenizers/sketchers (on your system)
      you can probe the synrfp package or consult your environment. Typical
      installations include at least "wl" and "parity".
    - If you want metadata (delta counts, raw tokens) returned alongside the
      fingerprint, call the lower-level `SynRFP`/`SynRFPResult` APIs directly.
    """
    return synrfp(
        rxn,
        tokenizer=tokenizer,
        radius=radius if radius is not None else 2,
        sketch=sketch,
        mode=mode,
        bits=bits if bits is not None else 1024,
        seed=seed if seed is not None else 42,
        node_attrs=node_attrs,
        edge_attrs=edge_attrs,
    )

In [2]:
import pandas as pd
df = pd.read_csv('data/raw_trial/claire_full_1.csv.gz', compression = 'gzip')
print(df.shape)
df.head()

(200, 5)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,rxn,y,split
0,39605,39605,O.CCCCCCCCCCCC(O)CC(=O)O[C@H](COC(=O)CCCCCCCCC...,2,train
1,2106,2106,O.CCC=O.NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(...,0,train
2,169351,169351,[H+].CN1C2CCC1CC(=O)C2.NC(=O)C1=CN([C@@H]2O[C@...,0,test
3,35094,35094,C[S+](CCC(N)C(=O)O)CC1OC(n2cnc3c(N)ncnc32)C(O)...,1,train
4,124109,124109,CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)(O)OC[C@@H]...,1,train


In [3]:
import numpy as np
import torch

torch.tensor((np.array([synrfp(df.loc[i,'rxn']) for i in range(5)])))

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [6]:
import pandas as pd

df = pd.read_csv('data/raw_trial/USPTO_50k_balanced.csv.gz')
df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,rxn,y,split
0,18361,18361,C[C@@H]1CNCCN1.FC(F)(F)c1ccc(Br)nc1>>C[C@@H]1C...,0,train
1,2456,2456,CC1(C)OB(c2cccc(CN)c2)OC1(C)C.CCS(=O)(=O)N1CCC...,2,train


In [9]:
df_train = df[df['split']=='train']
df_train = df_train.reset_index()
df_train.loc[59,'rxn']

'CC(=O)C[P+](c1ccccc1)(c1ccccc1)c1ccccc1.CN1Cc2c(C=O)ncn2-c2cccc(Cl)c2C1=O>>CC(=O)/C=C/c1ncn2c1CN(C)C(=O)c1c(Cl)cccc1-2.O=[P+](c1ccccc1)(c1ccccc1)c1ccccc1.[H]'