# Pathogenicity Prediction Benchmark

This notebook demonstrates how to use the pathogenicity prediction benchmark to compare masked nucleotide prediction across different genomic foundation models.

The benchmark evaluates how well models can distinguish pathogenic from benign variants based on their raw nucleotide predictions.

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from genebeddings.benchmarks.pathogenicity_predictions import PathogenicityBenchmark

# Import model wrappers that support masked nucleotide prediction
from genebeddings.wrappers.nt_wrapper import NTWrapper
from genebeddings.wrappers.caduceus_wrapper import CaduceusWrapper
from genebeddings.wrappers.specieslm_wrapper import SpeciesLMWrapper
from genebeddings.wrappers.convnova_wrapper import ConvNOVAWrapper
from genebeddings.wrappers.rinalmo_wrapper import RiNALMoWrapper

## Initialize Benchmark

Provide the path to your ClinVar CSV file. The CSV should contain the following columns:
- `chrom`: Chromosome (e.g., 'chr1', '1')
- `pos`: Genomic position (1-based)
- `ref`: Reference allele (single nucleotide)
- `alt`: Alternate allele (single nucleotide)
- `clinical_significance`: Pathogenic/Benign classification

In [None]:
# Benchmark configuration -- edit these values for your environment.
CLINVAR_CSV = "path/to/clinvar.csv"  # placeholder: replace with the path to your ClinVar CSV
FASTA_REFERENCE = "hg38"             # value passed as `fasta_reference`
CONTEXT_SIZE = 512                   # bases on each side of variant

# Build the benchmark object from the configuration above.
benchmark = PathogenicityBenchmark(
    clinvar_csv_path=CLINVAR_CSV,
    fasta_reference=FASTA_REFERENCE,
    context_size=CONTEXT_SIZE,
)

## Add Models to Compare

Add the models you want to benchmark. Each model must support masked nucleotide prediction.

In [None]:
# Nucleotide Transformer: construct the wrapper and register it under the key "nt".
NT_MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

print("Loading Nucleotide Transformer...")
nt_wrapper = NTWrapper(model_id=NT_MODEL_ID)
benchmark.add_model("nt", nt_wrapper)

In [None]:
# Caduceus: construct the wrapper and register it under the key "caduceus".
CADUCEUS_MODEL_ID = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"

print("Loading Caduceus...")
caduceus_wrapper = CaduceusWrapper(model_id=CADUCEUS_MODEL_ID)
benchmark.add_model("caduceus", caduceus_wrapper)

In [None]:
# Optional: uncomment any of the lines below to add more models
# benchmark.add_model("specieslm", SpeciesLMWrapper())
# benchmark.add_model("convnova", ConvNOVAWrapper())
# benchmark.add_model("rinalmo", RiNALMoWrapper())

## Run Benchmark

Run the benchmark on all variants. Use `max_variants` to test on a subset first.

In [None]:
# Start with a small subset as a quick test before running on everything.
MAX_VARIANTS = 100       # Set to None to use all variants
BALANCE_CLASSES = True   # Balance pathogenic and benign variants

results_df = benchmark.run_benchmark(
    max_variants=MAX_VARIANTS,
    balance_classes=BALANCE_CLASSES,
)

## Analyze Results

In [None]:
# Preview the first five rows of the benchmark output (the cell's last
# expression is rendered by the notebook).
results_df.head(n=5)

In [None]:
# Compute summary statistics
# `analysis` is reused by the "Save Results" cell, which iterates over it
# as a mapping of model name -> statistics.
analysis = benchmark.analyze_results(results_df)
# Print the benchmark's own summary of those statistics.
benchmark.print_summary(analysis)

## Visualize Results

In [None]:
# Plot probability ratio distributions for each model.

# Suffix that marks a model's probability-ratio column in results_df.
RATIO_SUFFIX = '_prob_ratio'
# Displayed ratio range and number of bins. Both classes share these bin edges
# so the overlaid histograms are directly comparable.
RATIO_RANGE = (0, 3)
N_BINS = 30

# Strip only the trailing suffix; str.replace would also remove the substring
# if it happened to occur in the middle of a model name.
model_names = [
    col[:-len(RATIO_SUFFIX)]
    for col in results_df.columns
    if col.endswith(RATIO_SUFFIX)
]
if not model_names:
    raise ValueError(
        f"results_df has no columns ending in '{RATIO_SUFFIX}'; nothing to plot."
    )

# Fixed bin edges over the displayed range. Letting hist() auto-range the bins
# spreads them over the full data range, so a few extreme ratios leave almost
# nothing visible inside the x-limits. With explicit edges, values outside the
# range are simply not binned, and density is normalized over the binned values.
bin_edges = np.linspace(RATIO_RANGE[0], RATIO_RANGE[1], N_BINS + 1)

# squeeze=False always returns a 2-D array of axes, so a single model needs
# no special-casing.
fig, axes = plt.subplots(
    1, len(model_names), figsize=(6 * len(model_names), 5), squeeze=False
)

# (label value in results_df, legend text, colour)
class_styles = [
    ('pathogenic', 'Pathogenic', 'red'),
    ('benign', 'Benign', 'blue'),
]

for ax, model_name in zip(axes[0], model_names):
    ratio_col = f'{model_name}{RATIO_SUFFIX}'

    for label, legend_text, color in class_styles:
        # Rows without a ratio for this model are dropped before plotting.
        ratios = results_df.loc[results_df['label'] == label, ratio_col].dropna()
        if ratios.empty:
            # Nothing to draw for this class; skip rather than plot an empty histogram.
            continue
        ax.hist(ratios, bins=bin_edges, alpha=0.5, label=legend_text, color=color, density=True)

    ax.axvline(1.0, color='black', linestyle='--', linewidth=1, alpha=0.5, label='Ratio = 1')
    ax.set_xlabel('Alt Prob / Ref Prob')
    ax.set_ylabel('Density')
    ax.set_title(f'{model_name.upper()}\nProbability Ratios')
    ax.legend()
    ax.set_xlim(*RATIO_RANGE)

plt.tight_layout()
plt.show()

In [None]:
# Compare models side-by-side.
# Build one long-format record per (model, label, ratio) observation, skipping
# rows where the model has no ratio.
comparison_data = [
    {'model': model_name, 'label': label, 'prob_ratio': ratio}
    for model_name in model_names
    for label in ['pathogenic', 'benign']
    for ratio in results_df.loc[
        results_df['label'] == label, f'{model_name}_prob_ratio'
    ].dropna()
]
comparison_df = pd.DataFrame(comparison_data)

# One box per model and label, with a dashed reference line at ratio 1.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=comparison_df, x='model', y='prob_ratio', hue='label', ax=ax)
ax.axhline(1.0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax.set_ylabel('Alt Prob / Ref Prob')
ax.set_xlabel('Model')
ax.set_title('Pathogenic vs Benign: Probability Ratios Across Models')
ax.set_ylim(0, 2.5)
ax.legend(title='Variant Type')
fig.tight_layout()
plt.show()

## Model Consensus Analysis

Check how often the models agree that a variant is deleterious (i.e., `prob_ratio < 1.0`).

In [None]:
# For each variant, check if models predict alt_prob < ref_prob (deleterious).
# Consensus is only meaningful when at least two models were benchmarked.
if len(model_names) > 1:
    ratio_cols = [f'{m}_prob_ratio' for m in model_names]
    deleterious_cols = [f'{m}_predicts_deleterious' for m in model_names]

    # Per-model boolean flags, written back to results_df so they are included
    # when the detailed results are saved.
    for ratio_col, deleterious_col in zip(ratio_cols, deleterious_cols):
        results_df[deleterious_col] = results_df[ratio_col] < 1.0

    # Count agreements
    results_df['n_models_agree_deleterious'] = results_df[deleterious_cols].sum(axis=1)

    # A missing ratio compares as False (NaN < 1.0), which would silently count
    # the variant as "not deleterious" for that model. Restrict the consensus
    # summary and plots to variants that every model actually scored.
    has_all_ratios = results_df[ratio_cols].notna().all(axis=1)
    consensus_df = results_df[has_all_ratios]
    n_excluded = int((~has_all_ratios).sum())

    # Show consensus
    print("\nModel Consensus:")
    print("="*50)
    if n_excluded:
        print(f"Excluded {n_excluded} variant(s) with a missing ratio for at least one model.")
    consensus_summary = (
        consensus_df
        .groupby(['label', 'n_models_agree_deleterious'])
        .size()
        .unstack(fill_value=0)
    )
    print(consensus_summary)

    # Visualize: one bar chart per class, x = number of models flagging the variant.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    for ax, label in zip(axes, ['pathogenic', 'benign']):
        subset = consensus_df[consensus_df['label'] == label]
        counts = subset['n_models_agree_deleterious'].value_counts().sort_index()

        ax.bar(counts.index, counts.values, color='red' if label == 'pathogenic' else 'blue', alpha=0.6)
        ax.set_xlabel('Number of Models Predicting Deleterious')
        ax.set_ylabel('Count')
        ax.set_title(f'{label.capitalize()} Variants')
        # Ticks for every possible count, 0 .. number of models.
        ax.set_xticks(range(len(model_names) + 1))

    plt.tight_layout()
    plt.show()

## Save Results

In [None]:
# Output locations (relative to the notebook's working directory).
RESULTS_CSV = "pathogenicity_benchmark_results.csv"
SUMMARY_CSV = "pathogenicity_benchmark_summary.csv"

# Per-variant results, written without the index column.
results_df.to_csv(RESULTS_CSV, index=False)
print(f"Results saved to: {RESULTS_CSV}")

# Flatten the per-model statistics into one row per model. Column order is:
# model, the three pathogenic means, the three benign means, then cohens_d
# (NaN when the analysis does not provide it).
summary_rows = [
    {
        'model': model_name,
        **{
            f'{label}_{metric}': stats[label][metric]
            for label in ('pathogenic', 'benign')
            for metric in ('mean_ref_prob', 'mean_alt_prob', 'mean_prob_ratio')
        },
        'cohens_d': stats.get('cohens_d', np.nan),
    }
    for model_name, stats in analysis.items()
]

summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(SUMMARY_CSV, index=False)
print(f"Summary saved to: {SUMMARY_CSV}")

## Interpretation

**Expected patterns:**
- **Pathogenic variants**: Models should assign lower probability to the alternate allele compared to the reference (prob_ratio < 1.0), indicating the mutation is deleterious
- **Benign variants**: Models should assign similar or higher probability to the alternate allele (prob_ratio >= 1.0), indicating the mutation is neutral or beneficial

**Metrics:**
- **Cohen's d**: Standardized effect size measuring how well the model separates pathogenic from benign variants. Larger positive values indicate better discrimination.
- **Probability ratio**: alt_prob / ref_prob. Values < 1 suggest deleterious, values >= 1 suggest neutral/beneficial.