# Norovirus Receptor Discovery: Results Analysis

This notebook demonstrates how to analyze results from the receptor discovery pipeline.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults applied to every figure created later in the notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Input/output locations, relative to the directory the notebook runs from
RESULTS_DIR = Path("..") / "results"
DATA_DIR = Path("..") / "data"

## 1. Load Results

In [None]:
# Load the ranked candidates written by the pipeline.
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform instead of falling back to the locale's default text encoding.
with open(RESULTS_DIR / "ranked_candidates.json", encoding="utf-8") as f:
    candidates = json.load(f)

# Build a DataFrame from the loaded records
# (presumably a list of per-candidate dicts -- confirm against the pipeline output)
df = pd.DataFrame(candidates)

print(f"Total candidates: {len(df)}")
# One count line per tier label used throughout this notebook
for tier in ("high", "medium", "low"):
    print(f"{tier.title()} confidence: {(df['confidence_tier'] == tier).sum()}")

df.head()

## 2. Top Candidates

In [None]:
# Columns to show in the overview table
top_columns = [
    'rank',
    'gene_name',
    'protein_id',
    'overall_score',
    'ipTM_score',
    'structural_confidence',
    'biological_relevance',
    'confidence_tier',
]

# First 20 rows of df in their loaded order
# (assumes the results file is already sorted by rank -- confirm)
top_20 = df[top_columns].head(20)

top_20

## 3. High Confidence Candidates (ipTM > 0.8)

In [None]:
# Keep only rows whose ipTM exceeds 0.8, ordered by overall score (best first)
iptm_mask = df['ipTM_score'] > 0.8
high_iptm = df.loc[iptm_mask].sort_values('overall_score', ascending=False)

print(f"\nCandidates with ipTM > 0.8: {len(high_iptm)}\n")

# Short text report for at most ten of the selected rows; when the selection
# is empty the loop body never runs, so nothing further is printed.
for _index, row in high_iptm.head(10).iterrows():
    report_lines = [
        f"{row['rank']}. {row['gene_name']} ({row['protein_id']})",
        f"   Overall Score: {row['overall_score']:.3f}",
        f"   ipTM: {row['ipTM_score']:.3f}",
        # Scaled by 100 for display; presumably stored as a 0-1 fraction -- confirm
        f"   Interface pLDDT: {row['interface_pLDDT_score']*100:.1f}",
        f"   Structural: {row['structural_confidence']:.3f}",
        f"   Biological: {row['biological_relevance']:.3f}",
        "",  # trailing blank line separating candidates
    ]
    print("\n".join(report_lines))

## 4. Score Distributions

In [None]:
# One entry per panel, listed in row-major grid order:
# (column, axis label, bar colour, draw median line?, extra threshold line)
# A colour of None lets matplotlib pick its default bar colour.
panels = [
    ('overall_score', 'Overall Score', None, True, None),
    ('ipTM_score', 'ipTM Score', 'green', True, 0.8),
    ('structural_confidence', 'Structural Confidence', 'orange', False, None),
    ('biological_relevance', 'Biological Relevance', 'purple', False, None),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, (column, label, colour, show_median, threshold) in zip(axes.flat, panels):
    ax.hist(df[column], bins=30, edgecolor='black', alpha=0.7, color=colour)
    if show_median:
        ax.axvline(df[column].median(), color='red', linestyle='--', label='Median')
    if threshold is not None:
        ax.axvline(threshold, color='blue', linestyle='--', label='High threshold')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    ax.set_title(f'{label} Distribution')
    # Only panels that drew labelled reference lines get a legend
    if show_median:
        ax.legend()

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'score_distributions.png', dpi=300)
plt.show()

## 5. Correlation Analysis

In [None]:
# Score columns included in the pairwise correlation matrix
score_cols = [
    'ipTM_score',
    'interface_pLDDT_score',
    'interface_area_score',
    'interaction_score',
    'geometry_score',
    'consistency_score',
    'expression_score',
    'localization_score',
    'structural_confidence',
    'biological_relevance',
    'overall_score',
]

# Pairwise correlation between the selected columns (pandas default method)
corr = df.loc[:, score_cols].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Score Correlation Matrix', fontsize=14, weight='bold')
fig.tight_layout()
fig.savefig(RESULTS_DIR / 'score_correlations.png', dpi=300)
plt.show()

## 6. Confidence Tiers

In [None]:
# Pie chart of confidence tiers.
# value_counts() orders the tiers by frequency, most common first.
tier_counts = df['confidence_tier'].value_counts()

colors = {'high': '#2ecc71', 'medium': '#f39c12', 'low': '#e74c3c'}
# Any tier label outside the three known ones falls back to neutral grey
# instead of raising KeyError and aborting the cell.
color_list = [colors.get(tier, '#95a5a6') for tier in tier_counts.index]

# str() keeps label formatting working even if a tier label is not a string
wedge_labels = [f"{str(tier).title()}\n({count})" for tier, count in tier_counts.items()]

plt.figure(figsize=(8, 8))
plt.pie(tier_counts.values, labels=wedge_labels,
        colors=color_list, autopct='%1.1f%%', startangle=90,
        textprops={'fontsize': 12, 'weight': 'bold'})
plt.title('Confidence Tier Distribution', fontsize=14, weight='bold')
plt.savefig(RESULTS_DIR / 'confidence_tiers.png', dpi=300)
plt.show()

## 7. Export Top Candidates for Validation

In [None]:
# Export the first 10 rows of df for experimental validation
# (assumes df is already in rank order -- confirm against the results file)
validation_list = df.head(10)[[
    'rank', 'gene_name', 'protein_id', 'protein_class',
    'overall_score', 'ipTM_score', 'interface_pLDDT_score',
    'structural_confidence', 'biological_relevance',
    'confidence_tier'
]]

# Single source of truth for the output file, so the message cannot drift from the path
validation_path = RESULTS_DIR / 'top10_for_validation.csv'
validation_list.to_csv(validation_path, index=False)
# The check mark is written as an escape (U+2713) so it survives re-encoding of
# the notebook file; the original literal had been garbled into mojibake.
print(f"\u2713 Exported top 10 candidates to {validation_path.name}")

validation_list

## 8. Summary Statistics

In [None]:
# Plain-text summary of the loaded results: tier counts, ipTM ranges and
# overall-score statistics. The printed text is unchanged; each tier count is
# computed once and a placeholder-less f-string has become a plain string.
rule = "=" * 60
total = len(df)
iptm = df['ipTM_score']
overall = df['overall_score']

print(rule)
print("PIPELINE SUMMARY STATISTICS")
print(rule)
print()
print(f"Total Candidates: {total}")
print()
print("Confidence Tiers:")
for tier in ('high', 'medium', 'low'):
    n_tier = (df['confidence_tier'] == tier).sum()
    label = f"{tier.title()}:"
    # Labels are left-aligned to 8 characters so the counts line up in a column
    print(f"  {label:<8}{n_tier} ({n_tier/total*100:.1f}%)")
print()
print("ipTM Score Ranges:")
print(f"  > 0.8 (High):   {(iptm > 0.8).sum()}")
# between() includes both endpoints, matching the >= 0.6 and <= 0.8 bounds
print(f"  0.6-0.8 (Med):  {iptm.between(0.6, 0.8).sum()}")
print(f"  < 0.6 (Low):    {(iptm < 0.6).sum()}")
print()
print("Overall Score Statistics:")
print(f"  Mean:   {overall.mean():.3f}")
print(f"  Median: {overall.median():.3f}")
print(f"  Std:    {overall.std():.3f}")
# idxmax() gives the index label of the top-scoring row, used to look up its gene name
print(f"  Max:    {overall.max():.3f} ({df.loc[overall.idxmax(), 'gene_name']})")
print()
print("Recommended for Experimental Validation:")
print("  Top 3-5 candidates with ipTM > 0.8")
print(rule)