# PPI-Inhibitors Dataset Verification Test Notebook

This notebook verifies the specific dataset details described in the research paper and documentation.

## Expected Results Summary:

### External Datasets:
1. **2dyh Dataset**: 72 examples (24 positive, 24 negative same-complex, 24 negative cross-complex)
2. **6m0j Dataset**: 72 examples (24 positive, 24 negative same-complex, 24 negative cross-complex)
3. **External1.txt**: ~21 examples (the verification cell below checks for exactly 21)

### Primary Training Dataset:
- **Positive examples**: 714-857 inhibitors
- **Protein complexes**: 22
- **Total examples**: ~15,695

### PDB Files:
- **2dyh.pdb**: ~264 KB (checked with a ±20% tolerance)
- **6m0j.pdb**: ~584 KB (checked with a ±20% tolerance)

### Model Features:
- **Ligand features**: 2,048 dimensions
- **Sequence features**: 69 dimensions
- **Interface features**: 211 dimensions
- **GNN features**: 512 dimensions
- **Total**: 2,840 dimensions

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Directory layout used by every later cell, relative to the current
# working directory.
base_path = Path()
data_path = base_path.joinpath('Data')
external_data_path = data_path.joinpath('External data')
pdb_path = external_data_path.joinpath('pdb')

# Title banner: a 100-character rule above and below the heading.
print(
    "=" * 100,
    "PPI-INHIBITORS DATASET VERIFICATION TEST",
    "=" * 100,
    sep="\n",
)

## 1. External Dataset 1: MDM2-p53 Complex (2dyh)

In [None]:
# Load 2dyh dataset and check its composition against the expected
# 72 = 24 positive + 24 same-complex negative + 24 cross-complex negative.
file_2dyh = external_data_path / '2dyh_all_External_All_Examples.txt'

print("\n" + "=" * 100)
print("DATASET 1: MDM2-p53 Complex (2dyh)")
print("=" * 100)
print(f"File: {file_2dyh.name}")
print(f"File exists: {file_2dyh.exists()}")

if file_2dyh.exists():
    # Read the file
    with open(file_2dyh, 'r') as f:
        lines_2dyh = f.readlines()

    # Parse data: a line with at least three whitespace-separated tokens is
    # read as "<complex id> <SMILES token(s)> <numeric label>"; the middle
    # tokens are re-joined with single spaces. Shorter lines (including
    # blank ones) are skipped.
    data_2dyh = []
    for line in lines_2dyh:
        parts = line.strip().split()
        if len(parts) >= 3:
            complex_id = parts[0]
            label = float(parts[-1])
            smiles = ' '.join(parts[1:-1])
            data_2dyh.append({'complex': complex_id, 'smiles': smiles, 'label': label})

    # Naming the columns explicitly keeps the frame usable when no line was
    # parsed (e.g. an empty file); without them the column lookups below
    # raise KeyError instead of reporting counts of 0 and a FAIL.
    df_2dyh = pd.DataFrame(data_2dyh, columns=['complex', 'smiles', 'label'])

    # Row masks shared by all counts below
    is_2dyh = df_2dyh['complex'] == '2dyh'
    is_positive = df_2dyh['label'] == 1.0
    is_negative = df_2dyh['label'] == -1.0

    # Count statistics
    total_examples = len(df_2dyh)
    positive_2dyh = len(df_2dyh[is_2dyh & is_positive])
    negative_2dyh_same = len(df_2dyh[is_2dyh & is_negative])
    negative_other = len(df_2dyh[~is_2dyh & is_negative])
    unique_other_complexes = df_2dyh[~is_2dyh]['complex'].nunique()

    print(f"\nTotal examples: {total_examples}")
    print(f"\nBreakdown:")
    print(f"  ✓ Positive (2dyh, label=1.0):           {positive_2dyh}")
    print(f"  ✓ Negative same-complex (2dyh, -1.0):   {negative_2dyh_same}")
    print(f"  ✓ Negative cross-complex (other, -1.0): {negative_other}")
    print(f"  ✓ Number of other complexes used:       {unique_other_complexes}")

    # Verification
    print(f"\n{'✓ PASS' if total_examples == 72 else '✗ FAIL'}: Total should be 72 (got {total_examples})")
    print(f"{'✓ PASS' if positive_2dyh == 24 else '✗ FAIL'}: Positives should be 24 (got {positive_2dyh})")
    print(f"{'✓ PASS' if negative_2dyh_same == 24 else '✗ FAIL'}: Same-complex negatives should be 24 (got {negative_2dyh_same})")
    print(f"{'✓ PASS' if negative_other == 24 else '✗ FAIL'}: Cross-complex negatives should be 24 (got {negative_other})")

    # Show other complexes used (rows whose complex id is not 2dyh)
    other_complexes = df_2dyh[~is_2dyh]['complex'].value_counts()
    print(f"\nOther complexes used as negatives:")
    print(other_complexes.to_string())
else:
    print("✗ FILE NOT FOUND!")

## 2. External Dataset 2: SARS-CoV-2 Spike/ACE2 Complex (6m0j)

In [None]:
# Load 6m0j dataset and check its composition against the expected
# 72 = 24 positive + 24 same-complex negative + 24 cross-complex negative.
file_6m0j = external_data_path / 'HansonACE2hits_External_All_Examples.txt'

print("\n" + "=" * 100)
print("DATASET 2: SARS-CoV-2 Spike/ACE2 Complex (6m0j)")
print("=" * 100)
print(f"File: {file_6m0j.name}")
print(f"File exists: {file_6m0j.exists()}")

if file_6m0j.exists():
    # Read the file
    with open(file_6m0j, 'r') as f:
        lines_6m0j = f.readlines()

    # Parse data: a line with at least three whitespace-separated tokens is
    # read as "<complex id> <SMILES token(s)> <numeric label>"; the middle
    # tokens are re-joined with single spaces. Shorter lines (including
    # blank ones) are skipped.
    data_6m0j = []
    for line in lines_6m0j:
        parts = line.strip().split()
        if len(parts) >= 3:
            complex_id = parts[0]
            label = float(parts[-1])
            smiles = ' '.join(parts[1:-1])
            data_6m0j.append({'complex': complex_id, 'smiles': smiles, 'label': label})

    # Naming the columns explicitly keeps the frame usable when no line was
    # parsed (e.g. an empty file); without them the column lookups below
    # raise KeyError instead of reporting counts of 0 and a FAIL.
    df_6m0j = pd.DataFrame(data_6m0j, columns=['complex', 'smiles', 'label'])

    # Row masks shared by all counts below
    is_6m0j = df_6m0j['complex'] == '6m0j'
    is_positive = df_6m0j['label'] == 1.0
    is_negative = df_6m0j['label'] == -1.0

    # Count statistics
    total_examples = len(df_6m0j)
    positive_6m0j = len(df_6m0j[is_6m0j & is_positive])
    negative_6m0j_same = len(df_6m0j[is_6m0j & is_negative])
    negative_other = len(df_6m0j[~is_6m0j & is_negative])
    unique_other_complexes = df_6m0j[~is_6m0j]['complex'].nunique()

    print(f"\nTotal examples: {total_examples}")
    print(f"\nBreakdown:")
    print(f"  ✓ Positive (6m0j, label=1.0):           {positive_6m0j}")
    print(f"  ✓ Negative same-complex (6m0j, -1.0):   {negative_6m0j_same}")
    print(f"  ✓ Negative cross-complex (other, -1.0): {negative_other}")
    print(f"  ✓ Number of other complexes used:       {unique_other_complexes}")

    # Verification
    print(f"\n{'✓ PASS' if total_examples == 72 else '✗ FAIL'}: Total should be 72 (got {total_examples})")
    print(f"{'✓ PASS' if positive_6m0j == 24 else '✗ FAIL'}: Positives should be 24 (got {positive_6m0j})")
    print(f"{'✓ PASS' if negative_6m0j_same == 24 else '✗ FAIL'}: Same-complex negatives should be 24 (got {negative_6m0j_same})")
    print(f"{'✓ PASS' if negative_other == 24 else '✗ FAIL'}: Cross-complex negatives should be 24 (got {negative_other})")

    # Show sample positive compounds: first five positive 6m0j rows, with the
    # SMILES string cut to its first 80 characters for display.
    print(f"\nSample positive compounds (putative SARS-CoV-2 inhibitors):")
    positive_samples = df_6m0j[is_6m0j & is_positive].head(5)
    for idx, row in positive_samples.iterrows():
        print(f"  - {row['smiles'][:80]}...")
else:
    print("✗ FILE NOT FOUND!")

## 3. External Dataset 3: External1.txt

In [None]:
# Load External1.txt and check how many examples it holds.
file_external1 = external_data_path / 'External1.txt'

print("\n" + "=" * 100)
print("DATASET 3: External1.txt (Curated subset)")
print("=" * 100)
print(f"File: {file_external1.name}")
print(f"File exists: {file_external1.exists()}")

if file_external1.exists():
    with open(file_external1, 'r') as f:
        lines_ext1 = f.readlines()

    # Parse to check complexes: the first whitespace-separated token of each
    # non-blank line is taken as the complex id.
    complexes_ext1 = [
        parts[0]
        for parts in (line.split() for line in lines_ext1)
        if parts
    ]

    # Count only lines that actually carry an example. Counting raw lines
    # would let blank or whitespace-only lines inflate the total and disagree
    # with the complex ids collected above.
    total_examples = len(complexes_ext1)

    unique_complexes = set(complexes_ext1)

    print(f"\nTotal examples: {total_examples}")
    print(f"Unique complexes: {unique_complexes}")

    # Verification (exact match on 21, although the message says "~21")
    print(f"\n{'✓ PASS' if total_examples == 21 else '✗ FAIL'}: Total should be ~21 (got {total_examples})")
    print(f"\nDescription: Curated subset of 2dyh inhibitors for focused testing")
else:
    print("✗ FILE NOT FOUND!")

## 4. PDB Structure Files

In [None]:
# Section banner
print("\n" + "=" * 100, "PDB STRUCTURE FILES", "=" * 100, sep="\n")

# (file name, expected size in KB)
pdb_files = [
    ('2dyh.pdb', 264),
    ('6m0j.pdb', 584),
]

for pdb_name, expected_kb in pdb_files:
    pdb_file = pdb_path.joinpath(pdb_name)
    found = pdb_file.exists()
    print(f"\nFile: {pdb_name}")
    print(f"  Exists: {found}")

    # Nothing to measure when the file is missing
    if not found:
        print("  ✗ FILE NOT FOUND!")
        continue

    size_bytes = pdb_file.stat().st_size
    size_kb = size_bytes / 1024
    print(f"  Size: {size_kb:.2f} KB")

    # Accept any size within ±20% of the expected value
    lower_bound = expected_kb * 0.8
    upper_bound = expected_kb * 1.2
    within_range = not (size_kb < lower_bound or size_kb > upper_bound)

    verdict = '✓ PASS' if within_range else '✗ FAIL'
    print(f"  {verdict}: Expected ~{expected_kb} KB (got {size_kb:.2f} KB)")

## 5. Primary Training Dataset Analysis

In [None]:
# Load main training dataset and report its size, label balance and the
# number of distinct protein complexes.
train_file = data_path / 'WriteAllexamplesRandomBindersIdsAll_24JAN_Binary.txt'

print("\n" + "=" * 100)
print("PRIMARY TRAINING DATASET")
print("=" * 100)
print(f"File: {train_file.name}")
print(f"File exists: {train_file.exists()}")

if train_file.exists():
    # Read the file
    with open(train_file, 'r') as f:
        lines_train = f.readlines()

    # Parse data: a line with at least four whitespace-separated tokens is
    # read as "<root complex> <target complex> <compound token(s)> <label>".
    # Shorter lines (including blank ones) are skipped.
    train_data = []
    for line in lines_train:
        parts = line.strip().split()
        if len(parts) >= 4:
            root_complex = parts[0]
            target_complex = parts[1]
            # Everything between the target complex and the label; for a
            # four-token line this is just parts[2].
            compound = ' '.join(parts[2:-1])
            label = float(parts[-1])

            # Extract base complex name (text before the first underscore)
            base_complex = target_complex.split('_')[0]

            train_data.append({
                'root_complex': root_complex,
                'target_complex': target_complex,
                'base_complex': base_complex,
                'compound': compound,
                'label': label
            })

    # Naming the columns explicitly keeps the frame usable when no line was
    # parsed (e.g. an empty file); without them the column lookups below
    # raise KeyError.
    df_train = pd.DataFrame(
        train_data,
        columns=['root_complex', 'target_complex', 'base_complex', 'compound', 'label'],
    )

    # Statistics
    # NOTE(review): negatives are counted as label 0.0 here, whereas the
    # external-dataset cells count -1.0 — confirm this file's label encoding.
    total_examples = len(df_train)
    positive_examples = len(df_train[df_train['label'] == 1.0])
    negative_examples = len(df_train[df_train['label'] == 0.0])
    unique_complexes = df_train['base_complex'].nunique()
    unique_compounds = df_train['compound'].nunique()

    # Guard the ratio: with no positive rows the division would raise
    # ZeroDivisionError and abort the rest of the report.
    if positive_examples:
        ratio_text = f"1:{negative_examples/positive_examples:.1f}"
    else:
        ratio_text = "N/A (no positive examples)"

    print(f"\nDataset Statistics:")
    print(f"  Total examples: {total_examples:,}")
    print(f"  Positive examples (inhibitors): {positive_examples:,}")
    print(f"  Negative examples: {negative_examples:,}")
    print(f"  Unique protein complexes: {unique_complexes}")
    print(f"  Unique compounds: {unique_compounds:,}")
    print(f"  Positive:Negative ratio: {ratio_text}")

    # Verification
    print(f"\nVerification:")
    print(f"  {'✓ PASS' if unique_complexes == 22 else '✗ FAIL'}: Should have 22 unique complexes (got {unique_complexes})")
    print(f"  {'✓ PASS' if 714 <= positive_examples <= 857 else '✗ FAIL'}: Should have 714-857 positive examples (got {positive_examples})")
    print(f"  ✓ INFO: Total negative examples: {negative_examples:,}")

    # Show complex distribution
    print(f"\nComplex Distribution (positive examples):")
    complex_dist = df_train[df_train['label'] == 1.0]['base_complex'].value_counts().sort_index()
    print(complex_dist.to_string())

    # Label distribution (the loop body never runs for an empty frame, so
    # the division by total_examples is safe)
    print(f"\nLabel Distribution:")
    label_counts = df_train['label'].value_counts().sort_index()
    for label, count in label_counts.items():
        percentage = (count / total_examples) * 100
        print(f"  Label {label}: {count:,} ({percentage:.2f}%)")
else:
    print("✗ FILE NOT FOUND!")

## 6. Model Feature Dimensions Verification

In [None]:
# Section banner
print("\n" + "=" * 100, "MODEL FEATURE DIMENSIONS", "=" * 100, sep="\n")

# Expected width of each feature group that makes up the model input
features = {
    'Ligand Features (ECFP)': 2048,
    'Protein Sequence Features': 69,
    'Interface Features': 211,
    'GNN Features': 512,
}

total_expected = sum(dims for dims in features.values())

# One row per feature group: name padded to 35 columns, width right-aligned
# in 5 columns.
print("\nFeature Breakdown:")
for feature_name, dims in features.items():
    print("  " + feature_name.ljust(35) + " " + str(dims).rjust(5) + " dimensions")

print("\n  " + "─" * 42)
print("  " + "TOTAL".ljust(35) + " " + str(total_expected).rjust(5) + " dimensions")

# Verification: the group widths must add up to 2,840
print("\nVerification:")
if total_expected == 2840:
    print(f"  ✓ PASS: Total should be 2,840 dimensions (got {total_expected})")
else:
    print(f"  ✗ FAIL: Total should be 2,840 dimensions (got {total_expected})")

# Additional details (informational text only, nothing is computed here)
print(
    "\nFeature Details:",
    "  - Ligand: Extended-Connectivity Fingerprint (ECFP/Morgan) with radius=2",
    "  - Sequence: AAC (20) + Grouped k-mer composition k=2 (49)",
    "  - Interface: Amino acid pair frequencies at PPI interface (≤8Å)",
    "  - GNN: 3-layer GNN (512→1024→512) with 10 neighbors per residue",
    sep="\n",
)

## 7. Model Architecture Verification

In [None]:
# Section banner
print("\n" + "=" * 100, "MODEL ARCHITECTURE", "=" * 100, sep="\n")

# Architecture summary, printed as-is below (component -> description)
architecture = dict([
    ('GNN Layers', '3 layers (512 → 1,024 → 512)'),
    ('Neighbors per atom', '10 same-residue + 10 different-residue'),
    ('GNN Activation', 'ReLU'),
    ('MLP Input', '2,840 dimensions (concatenated features)'),
    ('MLP Hidden Layers', '2 layers (1,024 → 512 → 100)'),
    ('MLP Activations', 'tanh (first two) + ReLU (third)'),
    ('Output', '1 neuron (binary classification)'),
    ('Loss Function', 'Binary Cross Entropy (weighted)'),
    ('Optimizer', 'Adam (lr=0.0001)'),
])

# Component names are padded to 25 columns so the descriptions line up
print("\nArchitecture Details:")
for component, detail in architecture.items():
    print("  " + component.ljust(25) + " " + detail)

print(
    "\nGraph Representation:",
    "  - Nodes: Atoms in protein complex",
    "  - Edges: Atomic contacts (distance < 6Å)",
    "  - Node features: One-hot encoding (13 atom types + 21 amino acids)",
    sep="\n",
)

## 8. Expected Model Performance

In [None]:
# Section banner
print("\n" + "=" * 100, "EXPECTED MODEL PERFORMANCE", "=" * 100, sep="\n")

# (evaluation method, AUC-ROC, AUC-PR) — values are display strings
results = [
    ('Leave-One-Complex-Out (LOCO)', '0.86', '0.39'),
    ('External Validation (2dyh)', '0.82', 'N/A'),
    ('SARS-CoV-2 Test (6m0j)', '0.78', 'N/A'),
]

# Table: method padded to 40 columns, AUC-ROC to 12, AUC-PR unpadded
print("\nProposed GNN Model:")
print("  " + "Evaluation Method".ljust(40) + " " + "AUC-ROC".ljust(12) + " " + "AUC-PR")
print("  " + "─" * 65)
for method, aucroc, aucpr in results:
    print("  " + method.ljust(40) + " " + aucroc.ljust(12) + " " + aucpr)

# (model name, AUC-ROC display string)
print("\nBaseline Comparisons:")
comparisons = [
    ('SVM (kernel-based)', '0.74'),
    ('GearNet-Edge (pre-trained)', '0.79'),
    ('Proposed GNN', '0.86 (best)'),
]

for model, aucroc in comparisons:
    print("  " + model.ljust(35) + " AUC-ROC: " + aucroc)

## 9. Summary and Final Verification

In [None]:
# Section banner
print("\n" + "=" * 100, "SUMMARY - VERIFICATION CHECKLIST", "=" * 100, sep="\n")

# Every expectation listed in this notebook, in display order. This cell only
# prints the list; it does not re-evaluate any of the checks.
checklist = [
    "2dyh dataset has 72 examples (24+24+24)",
    "6m0j dataset has 72 examples (24+24+24)",
    "External1.txt has ~21 examples",
    "2dyh.pdb size is ~264 KB",
    "6m0j.pdb size is ~584 KB",
    "Primary dataset has 22 unique complexes",
    "Primary dataset has 714-857 positive examples",
    "Total feature dimensions = 2,840",
    "Ligand features = 2,048 dimensions",
    "Sequence features = 69 dimensions",
    "Interface features = 211 dimensions",
    "GNN features = 512 dimensions",
    "Expected LOCO AUC-ROC: 0.86",
    "Expected External AUC-ROC: 0.82 (2dyh)",
    "Expected COVID-19 AUC-ROC: 0.78 (6m0j)",
]

# Numbered from 1, with the index right-aligned in two columns
print("\nAll Checks:")
for i, check in enumerate(checklist, start=1):
    print("  %2d. %s" % (i, check))

print("\n" + "=" * 100, "VERIFICATION COMPLETE", "=" * 100, sep="\n")
print(
    "\nRun all cells above to verify each aspect of the dataset and model.",
    "All checks should show ✓ PASS if the data matches the research paper description.",
    sep="\n",
)