In [1]:
import pandas as pd

In [None]:
# Load the LIT-PCBA table. The file name is spelled "canonicalization", matching
# the read_csv line echoed in this cell's recorded output.
df = pd.read_csv("lit-pcba_all_data_no_canonicalization.csv")
df.head()

  df = pd.read_csv("lit-pcba_all_data_no_canonicalization.csv")


Unnamed: 0,receptor,mol_id,smiles,type
0,ADRB2,852502,CC(Nc1nc(n2c(C)cc(C)n2)nc(NC(C)C)n1)C,active_T
1,ADRB2,855847,CC[C@H]([C@H](c1c(c2c(O)cc1)ccc(O)n2)O)[NH2+]C...,active_T
2,ADRB2,7976579,CCNc1nc(n2c(C)cc(C)n2)nc(NCC)n1,active_T
3,ADRB2,11532855,CCC[NH2+][C@@H](C(Nc1c(C)cccc1)=O)C,active_T
4,ADRB2,11532990,COc1ccc([C@@H]([NH2+]C[C@@H](c2cc(NC=O)c(O)cc2...,active_T


In [4]:
import collections

# Values of the "type" column that belong to the training (_T) and validation (_V) splits.
TRAIN_GROUPS = frozenset({"active_T", "inactive_T"})
VAL_GROUPS = frozenset({"active_V", "inactive_V"})

# Number of example SMILES listed per finding (the report text says "top 3").
MAX_EXAMPLES = 3

# (value in the "type" column, group name used in the report) for the repeat check.
repeat_types = [
    ("query", "queries"),
    ("active_T", "active_T"),
    ("active_V", "active_V"),
    ("inactive_T", "inactive_T"),
    ("inactive_V", "inactive_V"),
]


def _example_rows(frame, smi):
    """Describe every row of `frame` whose "smiles" value equals `smi`.

    Returns a dict ``{"smi": smi, "rows": [...]}`` where each entry of ``rows``
    is a dict with the keys "group", "type" and "mol_id" of one matching row,
    in the row order of `frame`.
    """
    matches = frame.loc[frame["smiles"] == smi, ["group", "type", "mol_id"]]
    return {"smi": smi, "rows": matches.to_dict("records")}


def _format_examples(examples):
    """Render example dicts produced by `_example_rows` as indented report lines."""
    lines = []
    for example in examples:
        lines.append(f"    SMILES: {example['smi']}")
        for row in example["rows"]:
            lines.append(f"      group: {row['group']}, type: {row['type']}, mol_id: {row['mol_id']}")
        lines.append("")  # blank line between examples
    return lines


# Sets tracking findings across all receptors. They hold SMILES strings, so a
# SMILES flagged for several receptors is counted once in the overall stats.
overall_queries_overlap = set()        # Query molecules that overlap with any train/val group
overall_queries_overlap_train = set()  # Query molecules that overlap with a training group
overall_queries_overlap_val = set()    # Query molecules that overlap with a validation group
overall_train_val_overlaps = {"active": set(), "inactive": set()}  # Molecules in both train and val sets
overall_repeats = {group_name: set() for _, group_name in repeat_types}  # Repeated molecules by group

# Process each receptor separately
receptors = df["receptor"].unique()
for receptor in receptors:
    sub = df[df["receptor"] == receptor].copy()
    # Map type to group name (queries for query type)
    sub["group"] = sub["type"].apply(lambda t: "queries" if t == "query" else t)

    # For every SMILES, the set of groups it appears in for this receptor
    grouped = sub.groupby("smiles")["group"].agg(lambda x: set(x.dropna()))

    # Find query molecules that also appear in a training and/or validation group
    queries_overlap = []
    queries_overlap_train = []
    queries_overlap_val = []
    for smi, groups in grouped.items():
        if "queries" not in groups:
            continue
        in_train = bool(groups & TRAIN_GROUPS)
        in_val = bool(groups & VAL_GROUPS)
        if not (in_train or in_val):
            continue
        queries_overlap.append(smi)
        overall_queries_overlap.add(smi)
        if in_train:
            queries_overlap_train.append(smi)
            overall_queries_overlap_train.add(smi)
        if in_val:
            queries_overlap_val.append(smi)
            overall_queries_overlap_val.add(smi)

    # Only the examples that will be printed are materialised, and each list is
    # drawn from the matching overlap list so the heading and the examples agree.
    queries_overlap_train_examples = [
        _example_rows(sub, smi) for smi in queries_overlap_train[:MAX_EXAMPLES]
    ]
    queries_overlap_val_examples = [
        _example_rows(sub, smi) for smi in queries_overlap_val[:MAX_EXAMPLES]
    ]

    # Find molecules that overlap between train and validation sets
    train_val_overlaps = {}
    train_val_examples = {}
    for label in ["active", "inactive"]:
        t_smiles = set(sub.loc[sub["type"] == f"{label}_T", "smiles"].dropna())
        v_smiles = set(sub.loc[sub["type"] == f"{label}_V", "smiles"].dropna())
        overlap = t_smiles & v_smiles
        train_val_overlaps[label] = overlap
        overall_train_val_overlaps[label].update(overlap)
        # sorted() makes the choice of examples independent of set iteration order
        train_val_examples[label] = [
            _example_rows(sub, smi) for smi in sorted(overlap)[:MAX_EXAMPLES]
        ]

    # Find repeated molecules within each group
    repeats = {}
    repeat_examples = {}
    for t, group_name in repeat_types:
        type_rows = sub[sub["type"] == t]
        # Counter keeps first-seen order, so `repeated` follows the row order of `sub`
        counter = collections.Counter(type_rows["smiles"].dropna())
        repeated = [smi for smi, count in counter.items() if count > 1]
        repeats[group_name] = repeated
        overall_repeats[group_name].update(repeated)
        # Example rows are restricted to this type, unlike the overlap examples above
        repeat_examples[group_name] = [
            _example_rows(type_rows, smi) for smi in repeated[:MAX_EXAMPLES]
        ]

    # Generate the report body for this receptor; header/footer are added only if
    # at least one finding exists.
    body_lines = []

    # Report query/validation overlaps
    if queries_overlap_val:
        body_lines.append(f"For receptor {receptor}, the most outrageous thing happened: {len(queries_overlap_val)} query molecule(s) are also present in the validation set (active_V/inactive_V)!")
        body_lines.append(f"  Example(s) of query/validation overlap (top {MAX_EXAMPLES}):")
        body_lines.extend(_format_examples(queries_overlap_val_examples))

    # Report query/training overlaps (independently of the validation finding)
    if queries_overlap_train:
        body_lines.append(f"For receptor {receptor}, {len(queries_overlap_train)} query molecule(s) are also present in the training set (active_T/inactive_T).")
        body_lines.append(f"  Example(s) of query/training overlap (top {MAX_EXAMPLES}):")
        body_lines.extend(_format_examples(queries_overlap_train_examples))

    # Report train/val overlaps
    for label in ["active", "inactive"]:
        overlap = train_val_overlaps[label]
        if overlap:
            body_lines.append(f"For receptor {receptor}, {len(overlap)} {label} molecule(s) are in both training and validation sets.")
            body_lines.append(f"  Example(s) of {label}_T/{label}_V overlap (top {MAX_EXAMPLES}):")
            body_lines.extend(_format_examples(train_val_examples[label]))

    # Report repeats
    for t, group_name in repeat_types:
        repeated = repeats[group_name]
        if repeated:
            body_lines.append(f"For receptor {receptor}, {len(repeated)} repeating {group_name} molecule(s) found in the set.")
            body_lines.append(f"  Example(s) of repeating {group_name} (top {MAX_EXAMPLES}):")
            body_lines.extend(_format_examples(repeat_examples[group_name]))

    # Print report for this receptor if any issues found
    found_any = bool(body_lines)
    if found_any:
        output_lines = [
            f"\n{'='*80}\n[Receptor: {receptor}] Outrageous SMILES Overlaps",
            "-"*80,
            *body_lines,
            "-"*80 + "\n",
        ]
        print("\n".join(output_lines))

# Order in which the per-group repeat statistics are printed in the overall summary
overall_groups = ['queries', 'active_T', 'inactive_T', 'active_V', 'inactive_V']

# Check if any issues found across all receptors
overall_found = bool(
    overall_queries_overlap
    or overall_queries_overlap_val
    or overall_train_val_overlaps['active']
    or overall_train_val_overlaps['inactive']
    or any(overall_repeats[group] for group in overall_groups)
)

# Print overall statistics if any issues found
if overall_found:
    print("\n" + "="*80)
    print("OVERALL OUTRAGEOUS STATS ACROSS ALL RECEPTORS")
    print("-"*80)
    if overall_queries_overlap_train:
        print(f"Number of query molecules also in training set (overall): {len(overall_queries_overlap_train)}")
    if overall_queries_overlap_val:
        print(f"Number of query molecules also in validation set (overall): {len(overall_queries_overlap_val)}")
    if overall_train_val_overlaps['active']:
        print(f"Number of active molecules in both train and val (overall): {len(overall_train_val_overlaps['active'])}")
    if overall_train_val_overlaps['inactive']:
        print(f"Number of inactive molecules in both train and val (overall): {len(overall_train_val_overlaps['inactive'])}")
    for group in overall_groups:
        if overall_repeats[group]:
            print(f"Number of repeating molecules in {group} (overall): {len(overall_repeats[group])}")
    print("="*80 + "\n")



[Receptor: ADRB2] Outrageous SMILES Overlaps
--------------------------------------------------------------------------------
For receptor ADRB2, 1 repeating queries molecule(s) found in the set.
  Example(s) of repeating queries (top 3):
    SMILES: Cc1ccccc1CC(C)(C)NC[C@@H](c2ccc(c3c2OCC(=O)N3)O)O
      group: queries, type: query, mol_id: 3p0g
      group: queries, type: query, mol_id: 3sn6
      group: queries, type: query, mol_id: 4lde

--------------------------------------------------------------------------------


[Receptor: MTORC1] Outrageous SMILES Overlaps
--------------------------------------------------------------------------------
For receptor MTORC1, 3 repeating queries molecule(s) found in the set.
  Example(s) of repeating queries (top 3):
    SMILES: C[C@@H]1CC[C@H]2C[C@@H](C(=CC=C\C=C\[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)O)C)/C)O)OC)C)C)C)OC
      group: queries, type