Load all SMILES strings and randomly split them into 70% training, 10% validation and 20% test sets. Then build positive and negative molecule pairs for the training, validation and test sets.

In [1]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from rdkit import RDLogger
from tqdm import tqdm

In [2]:
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
smiles_file = '/home/kvvq085/datasets/chembl/chembl_34_chemreps.txt'
smiles_list = []
with open(smiles_file, 'r') as file:
    # The first line is a tab-separated header; locate the SMILES column by name
    header = file.readline().strip().split('\t')
    smiles_col_idx = header.index('canonical_smiles')
    for line in file:
        fields = line.strip().split('\t')
        # Skip blank or truncated rows instead of failing with IndexError
        if len(fields) <= smiles_col_idx:
            continue
        smiles_list.append(fields[smiles_col_idx])

In [3]:
# Raise RDKit's log threshold to CRITICAL so that lower-severity messages
# are not printed by the cells below.
rd_logger = RDLogger.logger()
rd_logger.setLevel(RDLogger.CRITICAL)

In [4]:

np.random.seed(121274)

# Shuffle every row index once; the three subsets are consecutive,
# non-overlapping slices of that single permutation.
indices = np.random.permutation(len(smiles_list))

# 70% train, 10% validation, the remainder (about 20%) test
train_size = int(0.7 * len(smiles_list))
val_size = int(0.1 * len(smiles_list))
val_end = train_size + val_size

split_slices = {
    'train': slice(0, train_size),
    'val': slice(train_size, val_end),
    'test': slice(val_end, None),
}
train_idx, val_idx, test_idx = (
    indices[split_slices[split_name]].tolist()
    for split_name in ('train', 'val', 'test')
)

In [5]:
# Materialize the SMILES strings of each split from its index list
X_train, X_val, X_test = (
    [smiles_list[position] for position in split_positions]
    for split_positions in (train_idx, val_idx, test_idx)
)

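For every reaction rule, check whether each molecule contains the substructure of reactant 1 and of reactant 2. `pattern_1` (respectively `pattern_2`) is set to True when the molecule matches the corresponding reactant pattern.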

In [None]:
def parse_smiles(smiles_list):
    """Convert SMILES strings to molecule objects, preserving order.

    Each entry is whatever ``Chem.MolFromSmiles`` returns for that string, so
    the result always has the same length as ``smiles_list``.
    """
    molecules = []
    for smiles_string in smiles_list:
        molecules.append(Chem.MolFromSmiles(smiles_string))
    return molecules

# Step 1: Read and parse approved rules
def load_approved_rules(file_path):
    """Read reaction rules (one per line) and compile their two reactant patterns.

    A line is kept only when it has the form ``<reactants>>><products>`` with
    exactly one ``>>``, the reactant side splits into exactly two parts on
    ``.``, and both parts compile with ``Chem.MolFromSmarts``. Every other
    line is skipped silently.

    Returns:
        list of dicts with keys ``'rule'`` (the stripped line), ``'query1'``
        and ``'query2'`` (the compiled reactant patterns).
    """
    approved_rules = []
    with open(file_path, 'r') as file:
        for raw_line in file:
            rule = raw_line.strip()
            try:
                # Tuple unpacking raises ValueError unless there are exactly two parts
                reactant_side, _product_side = rule.split(">>")
                first_smarts, second_smarts = reactant_side.split(".")
                queries = [Chem.MolFromSmarts(first_smarts),
                           Chem.MolFromSmarts(second_smarts)]
            except ValueError:
                # Improperly formatted rule: ignore it
                continue
            if any(query is None for query in queries):
                continue
            approved_rules.append({
                'rule': rule,
                'query1': queries[0],
                'query2': queries[1],
            })
    return approved_rules

# Step 2: Pre-parse all molecules
def preprocess_molecules(X_train, X_val, X_test):
    """Bundle the SMILES of each split with their parsed molecules.

    Returns a dict with the keys ``'<split>_smiles'`` (the input lists,
    unchanged) and ``'<split>_mols'`` (the result of ``parse_smiles``) for
    ``<split>`` in train, val and test.
    """
    splits = (('train', X_train), ('val', X_val), ('test', X_test))
    data = {}
    for split_name, smiles in splits:
        data[f'{split_name}_smiles'] = smiles
    for split_name, smiles in splits:
        data[f'{split_name}_mols'] = parse_smiles(smiles)
    return data

# Step 3: Process rules and collect data
def _collect_rule_matches(smiles_seq, mols, rule_str, query1, query2, rows):
    """Append one (smiles, match1, match2, rule) row per parsed molecule to ``rows``."""
    for smiles, mol in zip(smiles_seq, mols):
        if mol is None:
            # SMILES that could not be parsed produce no row
            continue
        match1 = mol.HasSubstructMatch(query1)
        match2 = mol.HasSubstructMatch(query2)
        rows.append((smiles, match1, match2, rule_str))


def process_rules(approved_rules, data):
    """Test every molecule of every split against both reactant patterns of each rule.

    Args:
        approved_rules: iterable of dicts with keys ``'rule'``, ``'query1'``
            and ``'query2'`` (as produced by ``load_approved_rules``).
        data: dict with ``'<split>_smiles'`` and ``'<split>_mols'`` entries for
            the train, val and test splits (as produced by
            ``preprocess_molecules``).

    Returns:
        ``(train_data, val_data, test_data)``: three lists of
        ``(smiles, match1, match2, rule_str)`` tuples, ordered rule by rule
        and, within a rule, in molecule order. Molecules whose parsed object
        is ``None`` are skipped.
    """
    rows_by_split = {'train': [], 'val': [], 'test': []}

    for rule in tqdm(approved_rules, desc="Processing Rules"):
        # The same matching step is applied to each split in turn
        for split_name, rows in rows_by_split.items():
            _collect_rule_matches(
                data[f'{split_name}_smiles'],
                data[f'{split_name}_mols'],
                rule['rule'],
                rule['query1'],
                rule['query2'],
                rows,
            )

    return rows_by_split['train'], rows_by_split['val'], rows_by_split['test']

# Step 4: Convert collected data into DataFrames
def create_dataframes(train_data, val_data, test_data):
    """Turn the three lists of match tuples into DataFrames with a shared schema.

    Each returned frame has the columns ``molecule``, ``pattern_1``,
    ``pattern_2`` and ``rule``; the frames are returned in the order
    train, validation, test.
    """
    column_names = ['molecule', 'pattern_1', 'pattern_2', 'rule']
    df_train, df_val, df_test = (
        pd.DataFrame(rows, columns=column_names)
        for rows in (train_data, val_data, test_data)
    )
    return df_train, df_val, df_test

# Main execution flow: rules -> parsed molecules -> match rows -> DataFrames

# Compile the approved reaction rules
approved_rules = load_approved_rules('hb_edited.txt')
print(f"Loaded {len(approved_rules)} approved rules.")

# Parse the SMILES of all three splits once, up front
data = preprocess_molecules(X_train, X_val, X_test)
print("Pre-parsed all molecule SMILES.")

# Match every molecule against both reactant patterns of every rule
train_data, val_data, test_data = process_rules(approved_rules, data)
print("Completed processing all rules.")

# Tabulate the match rows
df_train, df_val, df_test = create_dataframes(train_data, val_data, test_data)
print("DataFrames created successfully.")

# Preview the first rows of each frame
for heading, frame in (
    ("Training DataFrame:", df_train),
    ("\nValidation DataFrame:", df_val),
    ("\nTest DataFrame:", df_test),
):
    print(heading)
    print(frame.head())

Loaded 56 approved rules.


In [14]:
# index=False: the RangeIndex carries no information, and writing it would add
# an unnamed first column that pd.read_csv loads back as 'Unnamed: 0'.
df_train.to_csv("pattern_matching_training.csv", index=False)
df_val.to_csv("pattern_matching_val.csv", index=False)
df_test.to_csv("pattern_matching_test.csv", index=False)

In [15]:
# Row count: one row per (parsed training molecule, rule) combination
len(df_train)

94443272

In [16]:
# Row count: one row per (parsed validation molecule, rule) combination
len(df_val)

13491912

In [17]:
# Row count: one row per (parsed test molecule, rule) combination
len(df_test)

26983768

In [18]:
# Number of test rows matching reactant pattern 2 and reactant pattern 1.
# Series.sum() counts the True values without a Python-level loop over every row.
print(df_test['pattern_2'].sum(), df_test['pattern_1'].sum())

1717499 778688


Making positive and negative pairs for train, validation and test

In [6]:
# Reload the per-split match tables from the CSV files written earlier, so the
# pairing cells below can run without repeating the substructure matching.
# If a file was written with the index included, it is read back as an extra
# unnamed column; the cells below only use 'molecule', 'pattern_1',
# 'pattern_2' and 'rule'.
pattern_matching_train = pd.read_csv("pattern_matching_training.csv")
pattern_matching_val = pd.read_csv("pattern_matching_val.csv")
pattern_matching_test = pd.read_csv("pattern_matching_test.csv")

In [7]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Upper bound on the number of positive pairs kept for the training split
MAX_PAIRS = 1000000

# One group of match rows per reaction rule
grouped_rules = pattern_matching_train.groupby('rule')

# rule -> number of (pattern_1 molecule, pattern_2 molecule) combinations
rule_pair_counts = {}

print("Calculating possible pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    pattern1_molecules = group.loc[group['pattern_1'], 'molecule'].unique()
    pattern2_molecules = group.loc[group['pattern_2'], 'molecule'].unique()
    num_p = len(pattern1_molecules) * len(pattern2_molecules)
    rule_pair_counts[rule] = num_p

# Grand total over all rules
total_possible = sum(rule_pair_counts.values())

print(f"Total possible pairs across all rules: {total_possible}")

Calculating possible pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:37<00:00,  1.51it/s]

Total possible pairs across all rules: 282092928544





Choosing positive_pairs and negative_pairs based on **sampling_fraction**

Sampling Fraction is a number between 0 and 1 that determines what portion of the total possible pairs you will keep. Here's how it works:

Calculate Total Possible Pairs: First, find out how many pairs each rule can generate and add them all up.

Small Rules: If a rule generates very few pairs, its proportional share (possible pairs × sampling fraction) can round down to zero. The code ensures that at least one pair is sampled from every rule that has any possible pair, preventing such rules from being entirely excluded.

For Example:

Rule A can generate 500,000 pairs

Rule B can generate 1,500,000 pairs

Total Possible Pairs = 500,000 + 1,500,000 = 2,000,000 pairs

Determine Sampling Fraction: Want to reduce the total from 2,000,000 pairs to 1,000,000 pairs.

Sampling Fraction = Desired Total Pairs / Total Possible Pairs

Sampling Fraction = 1,000,000 / 2,000,000 = 0.5

This means you'll keep 50% of the pairs from each rule.

Rule A:

Possible Pairs: 500,000

Pairs to Sample: 500,000 * 0.5 = 250,000

Rule B:

Possible Pairs: 1,500,000

Pairs to Sample: 1,500,000 * 0.5 = 750,000

Total Sampled Pairs: 250,000 (Rule A) + 750,000 (Rule B) = 1,000,000

**Train**

In [8]:
# Build the positive-pair table for the training split. This cell reads
# `grouped_rules`, `rule_pair_counts`, `total_possible`, `MAX_PAIRS` and
# `RANDOM_SEED` from the previous cell.
positive_pairs = []

if total_possible <= MAX_PAIRS:
    # Few enough combinations: enumerate the full cross product for every rule
    for rule, group in tqdm(grouped_rules, desc="Generating All Pairs"):
        pattern1_molecules = group[group['pattern_1']]['molecule'].unique()
        pattern2_molecules = group[group['pattern_2']]['molecule'].unique()

        # Cross join of the two molecule sets
        pairs = pd.MultiIndex.from_product(
            [pattern1_molecules, pattern2_molecules],
            names=['mol1', 'mol2']
        ).to_frame(index=False)
        pairs['rule'] = rule
        positive_pairs.append(pairs)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} positive pairs.")
else:
    # Too many combinations: give every rule a share proportional to its size
    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        # Ensure at least one pair is sampled if possible
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # The "at least one" bump can push the total above MAX_PAIRS. Remove the
    # surplus from the largest allocations: rescaling every rule with int()
    # would round the bumped one-pair rules back down to zero.
    excess = sum(sampled_pairs_per_rule.values()) - MAX_PAIRS
    if excess > 0:
        for rule in sorted(sampled_pairs_per_rule, key=sampled_pairs_per_rule.get, reverse=True):
            if excess <= 0:
                break
            cut = min(excess, sampled_pairs_per_rule[rule] - 1)
            if cut > 0:
                sampled_pairs_per_rule[rule] -= cut
                excess -= cut

    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled}")

    # Second pass: sample pairs per rule
    print("Sampling pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        pattern1_molecules = group[group['pattern_1']]['molecule'].unique()
        pattern2_molecules = group[group['pattern_2']]['molecule'].unique()

        n1 = len(pattern1_molecules)
        n2 = len(pattern2_molecules)

        if n1 == 0 or n2 == 0:
            continue  # No possible pairs for this rule

        if n1 * n2 <= num_to_sample:
            # The quota covers every combination: take the full cross product
            sampled_mol1 = np.repeat(pattern1_molecules, n2)
            sampled_mol2 = np.tile(pattern2_molecules, n1)
        else:
            # Both sides are drawn independently WITH replacement, so the same
            # pair can occasionally be drawn more than once.
            sampled_indices_mol1 = np.random.choice(n1, size=num_to_sample, replace=True)
            sampled_indices_mol2 = np.random.choice(n2, size=num_to_sample, replace=True)
            sampled_mol1 = pattern1_molecules[sampled_indices_mol1]
            sampled_mol2 = pattern2_molecules[sampled_indices_mol2]

        # Use the final column names directly (no rename step afterwards)
        sampled_df = pd.DataFrame({
            'mol1': sampled_mol1,
            'mol2': sampled_mol2,
            'rule': rule
        })
        positive_pairs.append(sampled_df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} sampled positive pairs.")

# Shuffle so that pairs of the same rule are not stored contiguously
positive_pairs_df = positive_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Limit to MAX_PAIRS if necessary
if len(positive_pairs_df) > MAX_PAIRS:
    positive_pairs_df = positive_pairs_df.iloc[:MAX_PAIRS]

print(f"Final number of positive pairs: {len(positive_pairs_df)}")

# Display a preview
print(positive_pairs_df.head())


Sampling fraction: 0.000004
Total sampled pairs after adjustment: 999985
Sampling pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:24<00:00,  2.26it/s]


Generated 999985 sampled positive pairs.
Final number of positive pairs: 999985
                                                mol1  \
0         CCOC(=O)c1c(SC)c(C#N)c(-c2ccc(Br)cc2)oc1=O   
1      C#CCOc1ccc(CCNC(=O)C(OCC#C)c2ccc(Cl)cc2)cc1OC   
2  CCOC(=O)N[C@@H](c1cccnc1)[C@@H](O)C(=O)OC1CC2C...   
3                   CN(C)CCC#CC(O)(c1ccccc1)c1ccccc1   
4  N#Cc1ccc(N2C(=O)C3(CC3)N(c3ccc4c(c3)COC4=O)C2=...   

                                                mol2  \
0  CC(=O)O[C@H]1C[C@@](C)(C(=O)O)C[C@H]2C3=CC(=O)...   
1  CC(CO)/N=C(\N)c1c(O)nsc1Nc1ccc(Oc2cc(F)cc(F)c2...   
2      CC(NC(=O)CCn1nc(-c2ccc(Cl)cc2)ccc1=O)c1ccccc1   
3  NC(=O)[C@@]12C[C@@H]1[C@@H](n1cnc3c(N)nc(Cl)nc...   
4  O=C(O)c1ccc(Cl)cc1-c1ccc(/C=C2\C(=O)N(c3ccccc3...   

                                                rule  
0  [#6:6][C:5]#[#7;D1:4].[#6:1][C:2](=[OD1:3])[OH...  
1  [CH0;$(C-[#6]):1]#[CH1:2].[C;H1,H2;A;!$(C=O):3...  
2  [C$([C](O)([CX4])([CX4])([CX4])),C$([CH](O)([C...  
3  [CH0;$(C-[#6]):1]#[CH0;

In [9]:
# Persist the training positive pairs. With to_csv's default index=True the row
# index is written as the first, unnamed column.
positive_pairs_df.to_csv("pos_pairs_train_new.csv")

In [10]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Desired maximum number of negative pairs
MAX_PAIRS = 1_000_000

# `pattern_matching_train` must provide the columns
# 'molecule', 'pattern_1', 'pattern_2' and 'rule'.
grouped_rules = pattern_matching_train.groupby('rule')

# Step 1: count the candidate negative pairs of every rule.
# A molecule is a negative for a rule when it matches NEITHER reactant pattern.
total_possible = 0
rule_pair_counts = {}

print("Calculating possible negative pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]

    negative_molecules = negative_group['molecule'].unique()
    num_neg = len(negative_molecules)

    # Number of ordered pairs of distinct molecules
    num_p = num_neg * (num_neg - 1)
    rule_pair_counts[rule] = num_p
    total_possible += num_p

print(f"Total possible negative pairs across all rules: {total_possible:,}")

# Step 2: enumerate everything when it fits, otherwise sample
negative_pairs = []

if total_possible <= MAX_PAIRS:
    print("Total possible pairs are within the desired maximum. Generating all pairs...")

    for rule, group in tqdm(grouped_rules, desc="Generating All Negative Pairs"):
        negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]
        negative_molecules = negative_group['molecule'].unique()

        if len(negative_molecules) < 2:
            continue  # Not enough molecules to form pairs

        # All ordered combinations, then drop the diagonal (self-pairs)
        mol1, mol2 = np.meshgrid(negative_molecules, negative_molecules)
        mol1 = mol1.flatten()
        mol2 = mol2.flatten()

        valid_indices = mol1 != mol2
        mol1 = mol1[valid_indices]
        mol2 = mol2[valid_indices]

        pairs = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        negative_pairs.append(pairs)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if negative_pairs:
        negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)
    else:
        negative_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(negative_pairs_df):,} negative pairs.")
else:
    print("Total possible pairs exceed the desired maximum. Sampling required...")

    # Give every rule a share proportional to its number of candidate pairs
    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        # Ensure at least one pair is sampled if possible
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # The "at least one" bump can push the total above MAX_PAIRS. Remove the
    # surplus from the largest allocations: rescaling every rule with int()
    # would round the bumped one-pair rules back down to zero.
    excess = sum(sampled_pairs_per_rule.values()) - MAX_PAIRS
    if excess > 0:
        for rule in sorted(sampled_pairs_per_rule, key=sampled_pairs_per_rule.get, reverse=True):
            if excess <= 0:
                break
            cut = min(excess, sampled_pairs_per_rule[rule] - 1)
            if cut > 0:
                sampled_pairs_per_rule[rule] -= cut
                excess -= cut

    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled:,}")

    # Step 3: sample pairs per rule
    print("Sampling negative pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]
        negative_molecules = negative_group['molecule'].unique()
        n = len(negative_molecules)

        if n < 2:
            continue  # Not enough molecules to form pairs

        # Both sides are drawn independently with replacement, so a pair can
        # occasionally repeat; self-pairs are removed explicitly below.
        mol1 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)
        mol2 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)

        mask = mol1 != mol2
        mol1 = mol1[mask]
        mol2 = mol2[mask]

        # Top up whatever the self-pair filter removed until the quota is met
        missing = num_to_sample - len(mol1)
        while missing > 0:
            additional_mol1 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mol2 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mask = additional_mol1 != additional_mol2
            mol1 = np.concatenate([mol1, additional_mol1[additional_mask]])
            mol2 = np.concatenate([mol2, additional_mol2[additional_mask]])
            missing = num_to_sample - len(mol1)

        sampled_df = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        negative_pairs.append(sampled_df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if negative_pairs:
        negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)
    else:
        negative_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])

    # Shuffle so that pairs of the same rule are not stored contiguously
    negative_pairs_df = negative_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Limit to MAX_PAIRS if necessary
    if len(negative_pairs_df) > MAX_PAIRS:
        negative_pairs_df = negative_pairs_df.iloc[:MAX_PAIRS]

    print(f"Generated {len(negative_pairs_df):,} sampled negative pairs.")

# Final DataFrame: negative_pairs_df
print(f"Final number of negative pairs: {len(negative_pairs_df):,}")

# Display a preview
print(negative_pairs_df.head())


Calculating possible negative pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:47<00:00,  1.18it/s]


Total possible negative pairs across all rules: 133,014,814,709,442
Total possible pairs exceed the desired maximum. Sampling required...
Sampling fraction: 0.000000
Total sampled pairs after adjustment: 999,974
Sampling negative pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:37<00:00,  1.50it/s]


Generated 999,974 sampled negative pairs.
Final number of negative pairs: 999,974
                                                mol1  \
0     CC(=O)O.Nc1nc(N)c2cc(CNc3cccc(Cl)c3)ccc2n1.O.O   
1                  CCCCc1ccc2nc(NC(=O)c3sccc3C)sc2c1   
2  Cc1c(C(=O)NCCC(C)C)sc2ncnc(Oc3ccc(NC(=O)C4(C(=...   
3         COc1cc(-c2cn(-c3c(O)c(F)cc(F)c3F)nn2)ccc1O   
4  Cc1csc(-c2cnc(N[C@@H]3CCN(C)C[C@H]3C(=O)NC3CCC...   

                                                mol2  \
0                           CC(C)NCc1cc2c(nc1O)CCCC2   
1           O=C(Nc1nnc(-c2ccc3c(c2)OCCO3)o1)c1ccccc1   
2   Cc1cc(C)c(C(=O)CC2(C(F)(F)C(F)F)NCCN2)c(=O)[nH]1   
3      O=C(c1ccc(Cl)c(S(=O)(=O)N2CCCCCC2)c1)N1CCOCC1   
4  O=C(CCc1cccs1)N1CCN(C(=O)C2CCC2)[C@H]2CS(=O)(=...   

                                                rule  
0  [C;H1&$(C([#6])[#6]),H2&$(C[#6]):1][OH1].[#7:2...  
1  [N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=...  
2  [CH0;$(C-[#6]):1]#[NH0:2].[C;A;!$(C=O):3]-[*;#...  
3  [C$(C(=C)([CX4])([CX4

In [11]:
# Persist the training negative pairs. With to_csv's default index=True the row
# index is written as the first, unnamed column.
negative_pairs_df.to_csv("neg_pairs_train_new.csv")

**Validation**

In [12]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Upper bound on the number of positive pairs kept for the validation split
MAX_PAIRS = 100_000

# One group of match rows per reaction rule
grouped_rules = pattern_matching_val.groupby('rule')

# rule -> number of (pattern_1 molecule, pattern_2 molecule) combinations
rule_pair_counts = {}

print("Calculating possible pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    pattern1_molecules = group.loc[group['pattern_1'], 'molecule'].unique()
    pattern2_molecules = group.loc[group['pattern_2'], 'molecule'].unique()
    num_p = len(pattern1_molecules) * len(pattern2_molecules)
    rule_pair_counts[rule] = num_p

# Grand total over all rules
total_possible = sum(rule_pair_counts.values())

print(f"Total possible pairs across all rules: {total_possible}")

Calculating possible pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:02<00:00, 27.88it/s]

Total possible pairs across all rules: 5828573699





In [13]:
# Build the positive-pair table for the validation split. This cell reads
# `grouped_rules`, `rule_pair_counts`, `total_possible`, `MAX_PAIRS` and
# `RANDOM_SEED` from the previous cell.
positive_pairs = []

if total_possible <= MAX_PAIRS:
    # Few enough combinations: enumerate the full cross product for every rule
    for rule, group in tqdm(grouped_rules, desc="Generating All Pairs"):
        pattern1_molecules = group[group['pattern_1']]['molecule'].unique()
        pattern2_molecules = group[group['pattern_2']]['molecule'].unique()

        # Cross join of the two molecule sets
        pairs = pd.MultiIndex.from_product(
            [pattern1_molecules, pattern2_molecules],
            names=['mol1', 'mol2']
        ).to_frame(index=False)
        pairs['rule'] = rule
        positive_pairs.append(pairs)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} positive pairs.")
else:
    # Too many combinations: give every rule a share proportional to its size
    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        # Ensure at least one pair is sampled if possible
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # The "at least one" bump can push the total above MAX_PAIRS. Remove the
    # surplus from the largest allocations: rescaling every rule with int()
    # would round the bumped one-pair rules back down to zero.
    excess = sum(sampled_pairs_per_rule.values()) - MAX_PAIRS
    if excess > 0:
        for rule in sorted(sampled_pairs_per_rule, key=sampled_pairs_per_rule.get, reverse=True):
            if excess <= 0:
                break
            cut = min(excess, sampled_pairs_per_rule[rule] - 1)
            if cut > 0:
                sampled_pairs_per_rule[rule] -= cut
                excess -= cut

    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled}")

    # Second pass: sample pairs per rule
    print("Sampling pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        pattern1_molecules = group[group['pattern_1']]['molecule'].unique()
        pattern2_molecules = group[group['pattern_2']]['molecule'].unique()

        n1 = len(pattern1_molecules)
        n2 = len(pattern2_molecules)

        if n1 == 0 or n2 == 0:
            continue  # No possible pairs for this rule

        if n1 * n2 <= num_to_sample:
            # The quota covers every combination: take the full cross product
            sampled_mol1 = np.repeat(pattern1_molecules, n2)
            sampled_mol2 = np.tile(pattern2_molecules, n1)
        else:
            # Both sides are drawn independently WITH replacement, so the same
            # pair can occasionally be drawn more than once.
            sampled_indices_mol1 = np.random.choice(n1, size=num_to_sample, replace=True)
            sampled_indices_mol2 = np.random.choice(n2, size=num_to_sample, replace=True)
            sampled_mol1 = pattern1_molecules[sampled_indices_mol1]
            sampled_mol2 = pattern2_molecules[sampled_indices_mol2]

        # Use the final column names directly (no rename step afterwards)
        sampled_df = pd.DataFrame({
            'mol1': sampled_mol1,
            'mol2': sampled_mol2,
            'rule': rule
        })
        positive_pairs.append(sampled_df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} sampled positive pairs.")

# Shuffle so that pairs of the same rule are not stored contiguously
positive_pairs_df = positive_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Limit to MAX_PAIRS if necessary
if len(positive_pairs_df) > MAX_PAIRS:
    positive_pairs_df = positive_pairs_df.iloc[:MAX_PAIRS]

print(f"Final number of positive pairs: {len(positive_pairs_df)}")

# Display a preview
print(positive_pairs_df.head())


Sampling fraction: 0.000017
Total sampled pairs after adjustment: 99989
Sampling pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:01<00:00, 47.56it/s]


Generated 99989 sampled positive pairs.
Final number of positive pairs: 99989
                                                mol1  \
0  CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(O)c(F)c1)NC...   
1  Cc1cc(C#N)cc(C)c1Oc1nc(NC2CCN(c3c(F)cc(NS(N)(=...   
2  C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1C[C@@]2(CN1C(=O)...   
3                    NC(CC(=O)O)c1ccccc1[N+](=O)[O-]   
4    Cc1c(C)c2ccc(OC3CCN(Cc4ccc(C#N)cc4)CC3)cc2oc1=O   

                                                mol2  \
0  CNC(=O)OCCc1ccc(Cl)c(CN(C(=O)[C@H]2CNCC(=O)N2c...   
1                          NOC(c1ccccc1)c1cccc(Br)c1   
2                        Oc1nc2nonc2nc1-c1ccc(Br)cc1   
3  Nc1ccc(CCn2cnc3c(Nc4cccc(N)c4)nc(NC4CCC4)nc32)cc1   
4           O=C(O)c1ccnc(-n2nc(Cc3c(F)cccc3F)cc2O)c1   

                                                rule  
0  [Cl,OH,O-:3][C$(C(=O)([CX4,c])),C$([CH](=O)):2...  
1  [#6:1][C:2]#[#7;D1].[Cl,Br,I][#6;$([#6]~[#6]);...  
2  [C$([CH](=C)([CX4])),C$([CH2](=C)):2]=[C$(C(=C...  
3  [Cl,OH,O-:3][C$(C(=O)([CX

In [14]:
# Persist the validation positive pairs. With to_csv's default index=True the
# row index is written as the first, unnamed column.
positive_pairs_df.to_csv("pos_pairs_val_new.csv")

In [15]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Desired maximum number of negative pairs
MAX_PAIRS = 100_000

# Assume 'pattern_matching_train' is your DataFrame containing the relevant data
# It should have columns: 'molecule', 'pattern_1', 'pattern_2', 'rule'

# Group by 'rule'
grouped_rules = pattern_matching_val.groupby('rule')

# Step 1: Calculate total possible pairs and per-rule pair counts for negative pairs
total_possible = 0
rule_pair_counts = {}

print("Calculating possible negative pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    # Select molecules where both pattern_1 and pattern_2 are False
    negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]

    negative_molecules = negative_group['molecule'].unique()
    num_neg = len(negative_molecules)

    # The number of possible ordered pairs without self-pairs
    num_p = num_neg * (num_neg - 1)
    rule_pair_counts[rule] = num_p
    total_possible += num_p

print(f"Total possible negative pairs across all rules: {total_possible:,}")

# Step 2: Determine Sampling Fraction
if total_possible <= MAX_PAIRS:
    print("Total possible pairs are within the desired maximum. Generating all pairs...")
    negative_pairs = []

    for rule, group in tqdm(grouped_rules, desc="Generating All Negative Pairs"):
        # Select molecules where both pattern_1 and pattern_2 are False
        negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]
        negative_molecules = negative_group['molecule'].unique()

        if len(negative_molecules) < 2:
            continue  # Not enough molecules to form pairs

        # Create all possible ordered pairs excluding self-pairs
        mol1, mol2 = np.meshgrid(negative_molecules, negative_molecules)
        mol1 = mol1.flatten()
        mol2 = mol2.flatten()

        # Exclude self-pairs
        valid_indices = mol1 != mol2
        mol1 = mol1[valid_indices]
        mol2 = mol2[valid_indices]

        pairs = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        negative_pairs.append(pairs)

    # Concatenate all pairs into a single DataFrame
    negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)
    print(f"Generated {len(negative_pairs_df):,} negative pairs.")
else:
    print("Total possible pairs exceed the desired maximum. Sampling required...")
    negative_pairs = []

    # Calculate sampling fraction
    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    # Calculate the number of pairs to sample per rule
    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        # Ensure at least one pair is sampled if possible
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # Adjust total sampled pairs if sum exceeds MAX_PAIRS
    total_sampled = sum(sampled_pairs_per_rule.values())
    if total_sampled > MAX_PAIRS:
        scaling_factor = MAX_PAIRS / total_sampled
        for rule in sampled_pairs_per_rule:
            sampled_pairs_per_rule[rule] = int(sampled_pairs_per_rule[rule] * scaling_factor)

    # Recalculate total sampled pairs after adjustment
    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled:,}")

    # Step 3: Sample pairs per rule without mapping to unique indices
    print("Sampling negative pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        # Select molecules where both patterns are False
        negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]
        negative_molecules = negative_group['molecule'].unique()
        n = len(negative_molecules)

        if n < 2:
            continue  # Not enough molecules to form pairs

        # Sample mol1 and mol2 with replacement
        # Since n is large and num_to_sample is relatively small, probability of duplicates is low
        mol1 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)
        mol2 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)

        # Exclude self-pairs
        mask = mol1 != mol2
        mol1 = mol1[mask]
        mol2 = mol2[mask]

        # If after masking, we have fewer pairs than desired, resample the missing
        missing = num_to_sample - len(mol1)
        while missing > 0:
            additional_mol1 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mol2 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mask = additional_mol1 != additional_mol2
            mol1 = np.concatenate([mol1, additional_mol1[additional_mask]])
            mol2 = np.concatenate([mol2, additional_mol2[additional_mask]])
            missing = num_to_sample - len(mol1)

        # Create a DataFrame of sampled negative pairs
        sampled_df = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        # Append to negative_pairs
        negative_pairs.append(sampled_df)

    # Concatenate all sampled pairs into a single DataFrame
    negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)

    # Shuffle the DataFrame to ensure randomness
    negative_pairs_df = negative_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Limit to MAX_PAIRS if necessary
    if len(negative_pairs_df) > MAX_PAIRS:
        negative_pairs_df = negative_pairs_df.iloc[:MAX_PAIRS]

    print(f"Generated {len(negative_pairs_df):,} sampled negative pairs.")

# Final DataFrame: negative_pairs_df
print(f"Final number of negative pairs: {len(negative_pairs_df):,}")

# Display a preview
print(negative_pairs_df.head())


Calculating possible negative pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:03<00:00, 15.00it/s]


Total possible negative pairs across all rules: 2,712,584,459,676
Total possible pairs exceed the desired maximum. Sampling required...
Sampling fraction: 0.000000
Total sampled pairs after adjustment: 99,971
Sampling negative pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:03<00:00, 15.11it/s]


Generated 99,971 sampled negative pairs.
Final number of negative pairs: 99,971
                                                mol1  \
0                  Cc1ccc(-n2nc(C)c3onc(C)c3c2=O)cc1   
1   CCc1nc2ccc(C(=O)NCc3ccc(OC)cc3)cn2c1N(CC)CCN(C)C   
2          CC/N=c1/cc2oc3cc(NCCCN)c4ccccc4c3nc-2cc1C   
3  O=C(O)Cc1ccc(-c2ccccc2NC(=O)Cc2cccc(-c3ccc(O)c...   
4                CCN1CCSc2ccc(C(=O)NCc3ccccc3Br)cc21   

                                                mol2  \
0         Cc1nc(Cn2nc(C3CCNCC3)n(Cc3ccccc3)c2=O)sc1C   
1  COc1cc(-c2cn(C3C(=O)NC4CCC3C4)nn2)ccc1-n1cnc(C)c1   
2      CC(=O)N1c2ccccc2C(=O)C1N1CCN(c2cccc(Cl)c2)CC1   
3  CC1(C)N2Cc3[nH]c4ccccc4c3C[C@H]2C(=O)N1[C@@H](...   
4  NC(=O)c1cccc2c1CC(N(CCCc1c[nH]c3ccc(F)cc13)C1C...   

                                                rule  
0  [c;r6:1](-[NH1;$(N-[#6]):2]):[c;r6:3](-[NH2:4]...  
1  [NH2,NH3+1:8]-[c:5]1[cH:4][c:3][c:2][c:1][c:6]...  
2  [c:1](-[OH1;$(Oc1ccccc1):2]):[c;r6:3](-[NH2:4]...  
3  [#6:1][C:2]#[#7;D1].[Cl

In [16]:
# Persist the negative pairs built in the preceding cell. With pandas' default
# settings the DataFrame row index is written as the first (unnamed) CSV column.
negative_pairs_df.to_csv(path_or_buf="neg_pairs_val_new.csv")

**Test set: positive and negative pairs**

In [17]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Upper bound on the number of positive pairs to keep
MAX_PAIRS = 100_000


def _reactant_pools(group):
    """Return (pattern-1 matches, pattern-2 matches) for one rule group.

    Each element is the array of unique `molecule` values whose
    `pattern_1` / `pattern_2` flag is True within `group`.
    """
    pattern1 = group[group['pattern_1']]['molecule'].unique()
    pattern2 = group[group['pattern_2']]['molecule'].unique()
    return pattern1, pattern2


# Group by 'rule'
grouped_rules = pattern_matching_test.groupby('rule')

# Step 1: count the candidate pairs per rule
# (every pattern-1 molecule combined with every pattern-2 molecule)
total_possible = 0
rule_pair_counts = {}

print("Calculating possible pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    pattern1_molecules, pattern2_molecules = _reactant_pools(group)
    num_p = len(pattern1_molecules) * len(pattern2_molecules)
    rule_pair_counts[rule] = num_p
    total_possible += num_p

print(f"Total possible pairs across all rules: {total_possible}")

if total_possible <= MAX_PAIRS:
    # Few enough candidates: enumerate every pair of every rule.
    positive_pairs = []

    for rule, group in tqdm(grouped_rules, desc="Generating All Pairs"):
        pattern1_molecules, pattern2_molecules = _reactant_pools(group)

        # Cartesian product of the two pools
        pairs = pd.MultiIndex.from_product(
            [pattern1_molecules, pattern2_molecules],
            names=['mol1', 'mol2']
        ).to_frame(index=False)
        pairs['rule'] = rule

        positive_pairs.append(pairs)

    # pd.concat raises ValueError on an empty list (no rule groups at all),
    # so fall back to an empty frame with the expected columns.
    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} positive pairs.")
else:
    # Too many candidates: sample each rule in proportion to its pair count.
    positive_pairs = []

    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    # Per-rule quota; any rule with at least one candidate pair gets a quota of >= 1
    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # The ">= 1" bump can push the sum above MAX_PAIRS; scale back down.
    # NOTE: int() truncation here can reduce a quota of 1 back to 0.
    total_sampled = sum(sampled_pairs_per_rule.values())
    if total_sampled > MAX_PAIRS:
        scaling_factor = MAX_PAIRS / total_sampled
        for rule in sampled_pairs_per_rule:
            sampled_pairs_per_rule[rule] = int(sampled_pairs_per_rule[rule] * scaling_factor)

    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled}")

    # Step 2: draw the pairs for each rule
    print("Sampling pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        pattern1_molecules, pattern2_molecules = _reactant_pools(group)
        n1 = len(pattern1_molecules)
        n2 = len(pattern2_molecules)

        if n1 == 0 or n2 == 0:
            continue  # No possible pairs for this rule

        if n1 * n2 <= num_to_sample:
            # Quota covers every candidate: take the full Cartesian product
            sampled_mol1 = np.repeat(pattern1_molecules, n2)
            sampled_mol2 = np.tile(pattern2_molecules, n1)
        else:
            # The two sides are drawn independently WITH replacement, so the
            # same (mol1, mol2) pair may occur more than once, and a molecule
            # present in both pools may be paired with itself.
            sampled_indices_mol1 = np.random.choice(n1, size=num_to_sample, replace=True)
            sampled_indices_mol2 = np.random.choice(n2, size=num_to_sample, replace=True)
            sampled_mol1 = pattern1_molecules[sampled_indices_mol1]
            sampled_mol2 = pattern2_molecules[sampled_indices_mol2]

        # Build the frame with its final column names (no rename step needed)
        sampled_df = pd.DataFrame({
            'mol1': sampled_mol1,
            'mol2': sampled_mol2,
            'rule': rule
        })

        positive_pairs.append(sampled_df)

    if positive_pairs:
        positive_pairs_df = pd.concat(positive_pairs, ignore_index=True)
    else:
        positive_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(positive_pairs_df)} sampled positive pairs.")

# Shuffle so that rows are no longer grouped by rule
positive_pairs_df = positive_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Limit to MAX_PAIRS if necessary
if len(positive_pairs_df) > MAX_PAIRS:
    positive_pairs_df = positive_pairs_df.iloc[:MAX_PAIRS]

print(f"Final number of positive pairs: {len(positive_pairs_df)}")

# Display a preview
print(positive_pairs_df.head())


Calculating possible pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:01<00:00, 28.44it/s]


Total possible pairs across all rules: 23004542090
Sampling fraction: 0.000004
Total sampled pairs after adjustment: 99993
Sampling pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:01<00:00, 29.31it/s]

Generated 99993 sampled positive pairs.
Final number of positive pairs: 99993
                                                mol1  \
0          Cc1cc(NCc2cccc(Cl)c2Cl)c2cccc(C(=O)O)c2n1   
1  Cc1ccc(C(=O)N2CC(c3ccc(C#N)cc3)C2)cc1-c1[nH]c(...   
2  CC1(C)C=C2[C@H]3CC[C@@H]4[C@@]5(C)CCC(=O)C(C)(...   
3          CN(C)/C=N/[C@@H](C(=O)[O-])c1ccccc1.[Na+]   
4      C[C@@H]1CCCN1CCc1ccc2nc(-c3ccc(C#N)cc3)ccc2c1   

                                                mol2  \
0  COc1cc(-c2ccc3ncc(C(C)=O)c(N[C@@H]4CCC[C@@H](C...   
1                      COc1ccc2c(C)c(Br)c(=O)oc2c1Br   
2  C=CCO[C@@H]1[C@H](O)[C@H](Oc2ccc(I)cc2)O[C@@H]...   
3  CC(C)CCC[C@@H](C)CCC[C@@H](C)CCC[C@@H](C)CCOC[...   
4            CCOC(=O)c1cc(-c2ccc(Cl)cc2)nc2onc(C)c12   

                                                rule  
0  [Cl,OH,O-:3][C$(C(=O)([CX4,c])),C$([CH](=O)):2...  
1  [#6:1][C:2]#[#7;D1].[Cl,Br,I][#6;$([#6]~[#6]);...  
2  [C$([CH](=C)([CX4])),C$([CH2](=C)):2]=[C$(C(=C...  
3  [OH,O-]-[C$(C(=O)(O)([CX4




In [18]:
# Persist the test-set positive pairs built in the preceding cell. With pandas'
# default settings the DataFrame row index is written as the first (unnamed) CSV column.
positive_pairs_df.to_csv(path_or_buf="pos_pairs_test_new.csv")

In [19]:
RANDOM_SEED = 121274
np.random.seed(RANDOM_SEED)

# Upper bound on the number of negative pairs to keep
MAX_PAIRS = 100_000


def _negative_molecules(group):
    """Return the unique `molecule` values of one rule group that match neither pattern.

    A row qualifies when both its `pattern_1` and `pattern_2` flags are False.
    """
    negative_group = group[(~group['pattern_1']) & (~group['pattern_2'])]
    return negative_group['molecule'].unique()


# 'pattern_matching_test' is read with the columns:
# 'molecule', 'pattern_1', 'pattern_2', 'rule'

# Group by 'rule'
grouped_rules = pattern_matching_test.groupby('rule')

# Step 1: count the candidate negative pairs per rule
total_possible = 0
rule_pair_counts = {}

print("Calculating possible negative pairs per rule...")
for rule, group in tqdm(grouped_rules, desc="Calculating Pairs"):
    negative_molecules = _negative_molecules(group)
    num_neg = len(negative_molecules)

    # Number of ordered pairs without self-pairs
    num_p = num_neg * (num_neg - 1)
    rule_pair_counts[rule] = num_p
    total_possible += num_p

print(f"Total possible negative pairs across all rules: {total_possible:,}")

# Step 2: enumerate everything, or sample down to MAX_PAIRS
if total_possible <= MAX_PAIRS:
    print("Total possible pairs are within the desired maximum. Generating all pairs...")
    negative_pairs = []

    for rule, group in tqdm(grouped_rules, desc="Generating All Negative Pairs"):
        negative_molecules = _negative_molecules(group)

        if len(negative_molecules) < 2:
            continue  # Not enough molecules to form pairs

        # All ordered combinations, then drop the diagonal (self-pairs)
        mol1, mol2 = np.meshgrid(negative_molecules, negative_molecules)
        mol1 = mol1.flatten()
        mol2 = mol2.flatten()

        valid_indices = mol1 != mol2
        mol1 = mol1[valid_indices]
        mol2 = mol2[valid_indices]

        pairs = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        negative_pairs.append(pairs)

    # Every rule may have been skipped above (fewer than two negatives each);
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame with the expected columns.
    if negative_pairs:
        negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)
    else:
        negative_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])
    print(f"Generated {len(negative_pairs_df):,} negative pairs.")
else:
    print("Total possible pairs exceed the desired maximum. Sampling required...")
    negative_pairs = []

    sampling_fraction = MAX_PAIRS / total_possible
    print(f"Sampling fraction: {sampling_fraction:.6f}")

    # Per-rule quota; any rule with at least one candidate pair gets a quota of >= 1
    sampled_pairs_per_rule = {}
    for rule, count in rule_pair_counts.items():
        sampled_pairs = int(count * sampling_fraction)
        if sampled_pairs < 1 and count > 0:
            sampled_pairs = 1
        sampled_pairs_per_rule[rule] = sampled_pairs

    # The ">= 1" bump can push the sum above MAX_PAIRS; scale back down.
    # NOTE: int() truncation here can reduce a quota of 1 back to 0.
    total_sampled = sum(sampled_pairs_per_rule.values())
    if total_sampled > MAX_PAIRS:
        scaling_factor = MAX_PAIRS / total_sampled
        for rule in sampled_pairs_per_rule:
            sampled_pairs_per_rule[rule] = int(sampled_pairs_per_rule[rule] * scaling_factor)

    total_sampled = sum(sampled_pairs_per_rule.values())
    print(f"Total sampled pairs after adjustment: {total_sampled:,}")

    # Step 3: draw the pairs for each rule
    print("Sampling negative pairs per rule...")
    for rule, group in tqdm(grouped_rules, desc="Sampling Pairs"):
        num_to_sample = sampled_pairs_per_rule.get(rule, 0)
        if num_to_sample == 0:
            continue

        negative_molecules = _negative_molecules(group)
        n = len(negative_molecules)

        if n < 2:
            continue  # Not enough molecules to form pairs

        # Both sides are drawn independently WITH replacement, so the same
        # ordered pair may be drawn more than once.
        mol1 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)
        mol2 = np.random.choice(negative_molecules, size=num_to_sample, replace=True)

        # Exclude self-pairs
        mask = mol1 != mol2
        mol1 = mol1[mask]
        mol2 = mol2[mask]

        # Top up whatever the self-pair filter removed until the quota is met.
        # n >= 2 here, so every draw has a chance of producing a non-self pair.
        missing = num_to_sample - len(mol1)
        while missing > 0:
            additional_mol1 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mol2 = np.random.choice(negative_molecules, size=missing, replace=True)
            additional_mask = additional_mol1 != additional_mol2
            mol1 = np.concatenate([mol1, additional_mol1[additional_mask]])
            mol2 = np.concatenate([mol2, additional_mol2[additional_mask]])
            missing = num_to_sample - len(mol1)

        sampled_df = pd.DataFrame({
            'mol1': mol1,
            'mol2': mol2,
            'rule': rule
        })

        negative_pairs.append(sampled_df)

    if negative_pairs:
        negative_pairs_df = pd.concat(negative_pairs, ignore_index=True)
    else:
        negative_pairs_df = pd.DataFrame(columns=['mol1', 'mol2', 'rule'])

    # Shuffle so that rows are no longer grouped by rule
    negative_pairs_df = negative_pairs_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    # Limit to MAX_PAIRS if necessary
    if len(negative_pairs_df) > MAX_PAIRS:
        negative_pairs_df = negative_pairs_df.iloc[:MAX_PAIRS]

    print(f"Generated {len(negative_pairs_df):,} sampled negative pairs.")

# Final DataFrame: negative_pairs_df
print(f"Final number of negative pairs: {len(negative_pairs_df):,}")

# Display a preview
print(negative_pairs_df.head())


Calculating possible negative pairs per rule...


Calculating Pairs: 100%|██████████| 56/56 [00:07<00:00,  7.49it/s]


Total possible negative pairs across all rules: 10,862,728,260,242
Total possible pairs exceed the desired maximum. Sampling required...
Sampling fraction: 0.000000
Total sampled pairs after adjustment: 99,975
Sampling negative pairs per rule...


Sampling Pairs: 100%|██████████| 56/56 [00:06<00:00,  8.05it/s]

Generated 99,975 sampled negative pairs.
Final number of negative pairs: 99,975
                                                mol1  \
0  O=C(NS(=O)(=O)c1nc2ccccc2s1)c1ccc(N2CCN(Cc3ccc...   
1                COc1ccc(NC(=O)c2ccc3c(c2)OCO3)cc1Cl   
2                     CCn1c2ccccc2c2cc(CC(N)=O)ccc21   
3  CCOC(=O)C1=NN(c2ccc(F)cc2)C2=NC(C)=C(C(=O)OC)C...   
4                       N[C@H]1C[C@@H]1c1ccc(Br)cc1F   

                                                mol2  \
0          CCOCc1nc2c(c(NCCNC(C)=O)n1)CCN(C(C)=O)CC2   
1               COc1ccc(C(=O)C(Cn2ccnc2)Cn2ccnc2)cc1   
2  COc1ccc(C[C@H](NC(=O)[C@@H]2CSCC[C@H](NC(C)=O)...   
3  CC(C)=CC[C@]12C[C@H]3C[C@H]4C(C)(C)O[C@@H](C=C...   
4  CCCC1N(CC2CCCCC2)C(=O)OC12CCN(C1CCN(C(=O)c3c(C...   

                                                rule  
0  [Cl:5][S$(S(=O)(=O)(Cl)([CX4])):2](=[O:3])=[O:...  
1  [S;$(S(=O)(=O)[C,N]):1][Cl].[N;$(NC);!$(N=*);!...  
2  [OH:7]-[c:6]1[cH:1][c:2][c:3][c:4][c:5]1.[O$(O...  
3  [#6:1][C:2]#[#7;D1].[Cl




In [20]:
# Persist the test-set negative pairs built in the preceding cell. With pandas'
# default settings the DataFrame row index is written as the first (unnamed) CSV column.
negative_pairs_df.to_csv(path_or_buf="neg_pairs_test_new.csv")
