In [1]:
#!/usr/bin/env python
# coding: utf-8



In [2]:
# # Prefix Consistency Gene-Level Accuracy Distribution Analysis
#
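# This notebook analyzes how prefix consistency thresholds affect accuracy for individual genes within each gene type.
# For every gene it computes sensitivity, false positive rate, PPV and F-beta (beta = 0.5) at each prefix consistency
# threshold, selects the F-beta-optimal threshold per gene, and summarizes the results in one table across gene types.
# (An earlier version also generated per-gene bar charts of these distributions; that plotting code has been removed.)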



In [3]:
# ## Setup and Imports

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from collections import defaultdict
import os
import warnings
from IPython.display import display, Markdown
# NOTE(review): this silences *every* warning category for the whole kernel session,
# including pandas/matplotlib deprecation notices. Consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Set visualization style
# The 'seaborn-v0_8-*' style names exist in Matplotlib 3.6 and later; older releases
# use the un-prefixed 'seaborn-whitegrid' name instead.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Set display options
# Show all columns, cap printed rows at 50, and print floats with 4 digits of precision.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 4)

# Define base directories (UPDATE THESE PATHS if necessary)
# Both paths are relative, so they resolve against the kernel's current working directory.
# PREFIX_CONSISTENCY_BASE_DIR is expected to hold one lower-cased sub-directory per gene type,
# each containing `all_samples_<GENE_TYPE>_prefix_consistency.csv`.
PREFIX_CONSISTENCY_BASE_DIR = "prefix-consistency/"
# GROUND_TRUTH_BASE_DIR is expected to hold one sub-directory per sample, each containing
# `<prefix>_functional_alleles.txt` files (prefix = gene type with the 'V' removed, e.g. "IGH").
GROUND_TRUTH_BASE_DIR = "../../HPRC-assembly-benchmarking/digger-functional-annotations-all"

# List of gene types to process
# Each entry drives one full load / compare / summarize pass in the main processing loop.
GENE_TYPES_TO_PROCESS = ["IGHV", "IGLV", "IGKV", "TRAV", "TRGV", "TRDV"]



In [4]:
# ## Helper Functions for Data Loading
# 
# The ground-truth loader below is reused from the original code. Prefix consistency data is
# read directly with `pd.read_csv` inside `process_gene_type`, so it needs no separate helper.



In [5]:
def load_ground_truth_data(gene_type, ground_truth_base_dir):
    """
    Collect the ground truth functional alleles of every sample for one gene type.

    Each sub-directory of ``ground_truth_base_dir`` is treated as one sample; the
    sample ID is the directory name up to its first '.'. Inside each sample
    directory the file ``<prefix>_functional_alleles.txt`` is read, where
    ``<prefix>`` is ``gene_type`` with every 'V' removed (e.g. "IGLV" -> "IGL").
    Lines starting with '#' and blank lines are ignored; every other line,
    stripped of surrounding whitespace, is one allele ID.

    Progress, warnings and read errors are reported with ``print``. A sample whose
    file cannot be read is skipped rather than aborting the whole load.

    Args:
        gene_type (str): Gene type (e.g., "IGLV")
        ground_truth_base_dir (str): Path to the base directory containing sample ground truth folders.

    Returns:
        pd.DataFrame: One row per (sample_id, allele_id) pair. The frame is empty
        (but has both columns) when no sample directory or no allele was found.
        None is returned when the base directory does not exist or cannot be listed.
    """
    # The allele file name uses the locus prefix without the 'V' (replace is a no-op without a 'V').
    locus_prefix = gene_type.replace('V', '')
    allele_file_name = f"{locus_prefix}_functional_alleles.txt"

    if not os.path.isdir(ground_truth_base_dir):
        print(f"❌ Error: Ground truth base directory not found: {ground_truth_base_dir}")
        return None

    # Every sub-directory is a candidate sample.
    try:
        sample_dirs = [
            entry for entry in os.listdir(ground_truth_base_dir)
            if os.path.isdir(os.path.join(ground_truth_base_dir, entry))
        ]
    except FileNotFoundError:
        print(f"❌ Error: Could not list directories in {ground_truth_base_dir}")
        return None

    print(f"Found {len(sample_dirs)} potential sample directories in ground truth data.")
    if not sample_dirs:
        print(f"⚠️ Warning: No sample directories found in {ground_truth_base_dir}")
        return pd.DataFrame(columns=['sample_id', 'allele_id'])

    records = []
    samples_with_alleles = 0
    for dir_name in sample_dirs:
        allele_path = os.path.join(ground_truth_base_dir, dir_name, allele_file_name)
        if not os.path.exists(allele_path):
            continue

        # The sample ID is the directory name up to the first '.'.
        sample_id = dir_name.split('.')[0]

        try:
            with open(allele_path, 'r') as handle:
                # Drop '#' header lines and blank lines; keep the stripped allele IDs.
                alleles = [
                    raw_line.strip()
                    for raw_line in handle.readlines()
                    if not raw_line.startswith('#') and raw_line.strip()
                ]
        except Exception as e:
            print(f"  ⚠️ Error reading {allele_path}: {str(e)}")
            continue

        records.extend({'sample_id': sample_id, 'allele_id': allele} for allele in alleles)
        # Only samples that contributed at least one allele count as processed.
        if alleles:
            samples_with_alleles += 1

    if not records:
        print(f"⚠️ Warning: No ground truth data loaded for gene type {gene_type}.")
        return pd.DataFrame(columns=['sample_id', 'allele_id'])

    print(f"Successfully processed ground truth for {samples_with_alleles} samples for {gene_type}.")
    return pd.DataFrame(records)



In [6]:
# ## Helper Functions for Calculating Gene-Level Metrics



In [7]:
def extract_gene_from_allele(allele_id):
    """
    Return the gene part of an allele ID, i.e. everything before the first '*'.

    Args:
        allele_id (str): Allele ID string (e.g., 'IGHV1-2*01')

    Returns:
        str: Gene name (e.g., 'IGHV1-2'); the input is returned unchanged when it has no '*'.
    """
    # partition yields the whole string as its first element when '*' is absent.
    gene_name, _separator, _allele_suffix = allele_id.partition('*')
    return gene_name

def calculate_gene_level_metrics(prefix_df, thresholds=range(6), min_calls=5):
    """
    Count TP/FP/TN/FN per gene for each prefix consistency threshold.

    A call "passes" a threshold when its ``prefix_consistency`` is >= the threshold.
    For every gene and threshold the counts are:

    * ``tp``: calls with status 'TP' that pass
    * ``fp``: calls with status 'FP' that pass
    * ``tn``: calls with status 'FP' that do not pass (correctly filtered out)
    * ``fn``: calls with status 'TP' that do not pass (wrongly filtered out)

    The input frame is not modified; the gene of each row is derived on the fly
    from ``allele_id`` via ``extract_gene_from_allele``.

    Args:
        prefix_df (pd.DataFrame): Calls with at least the columns ``allele_id``,
            ``status`` ('TP' or 'FP') and ``prefix_consistency``.
        thresholds (iterable): Thresholds to evaluate. Defaults to 0-5.
        min_calls (int): Genes with fewer rows than this are skipped. Defaults to 5.

    Returns:
        dict: Nested mapping ``{gene: {threshold: {'tp', 'fp', 'tn', 'fn'}}}``
        (a defaultdict of defaultdicts, as before). Genes are inserted in order of
        first appearance in ``prefix_df``.
    """
    gene_metrics = defaultdict(lambda: defaultdict(lambda: {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}))

    # Materialize once so a one-shot iterator can be reused for every gene.
    thresholds = list(thresholds)

    # Derive the gene labels without writing a new column into the caller's frame.
    gene_labels = prefix_df['allele_id'].apply(extract_gene_from_allele).to_numpy()

    # sort=False keeps first-appearance order, matching a loop over Series.unique().
    for gene, gene_df in prefix_df.groupby(gene_labels, sort=False):
        # Skip genes with too few calls for the rates to be meaningful.
        if len(gene_df) < min_calls:
            continue

        consistency = gene_df['prefix_consistency']
        is_tp = gene_df['status'] == 'TP'
        is_fp = gene_df['status'] == 'FP'

        for threshold in thresholds:
            # Two explicit comparisons (rather than ~passes) so that rows with a
            # missing prefix_consistency fall into neither group.
            passes = consistency >= threshold
            fails = consistency < threshold

            gene_metrics[gene][threshold] = {
                'tp': int((is_tp & passes).sum()),
                'fp': int((is_fp & passes).sum()),
                'tn': int((is_fp & fails).sum()),
                'fn': int((is_tp & fails).sum()),
            }

    return gene_metrics

def calculate_performance_metrics(gene_metrics, beta=0.5):
    """
    Calculate sensitivity, false positive rate, PPV, and F-beta score for each gene at each threshold.

    Every ratio whose denominator is zero is reported as 0 rather than NaN, so a
    gene/threshold with no passing calls gets PPV = 0 and F-beta = 0.

    Args:
        gene_metrics (dict): Nested dictionary ``{gene: {threshold: counts}}`` where
            ``counts`` has the integer keys 'tp', 'fp', 'tn' and 'fn'.
        beta (float): Weight of sensitivity relative to PPV in the F-beta score.
            Defaults to 0.5, which prioritizes PPV over sensitivity.

    Returns:
        pd.DataFrame: One row per (gene, threshold) with the columns gene, threshold,
        sensitivity, fpr, ppv, f_beta, tp, fp, tn, fn, total_samples, passing_calls
        and passing_proportion. The columns are present even when ``gene_metrics``
        is empty, so downstream column lookups do not fail.
    """
    columns = [
        'gene', 'threshold', 'sensitivity', 'fpr', 'ppv', 'f_beta',
        'tp', 'fp', 'tn', 'fn',
        'total_samples', 'passing_calls', 'passing_proportion',
    ]

    def safe_ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.
        return numerator / denominator if denominator > 0 else 0

    beta_squared = beta * beta
    results = []

    for gene, thresholds in gene_metrics.items():
        for threshold, counts in thresholds.items():
            tp, fp, tn, fn = counts['tp'], counts['fp'], counts['tn'], counts['fn']
            passing_calls = tp + fp
            total_calls = tp + fp + tn + fn

            sensitivity = safe_ratio(tp, tp + fn)
            fpr = safe_ratio(fp, fp + tn)
            ppv = safe_ratio(tp, passing_calls)
            f_beta = safe_ratio(
                (1 + beta_squared) * (ppv * sensitivity),
                beta_squared * ppv + sensitivity,
            )

            results.append({
                'gene': gene,
                'threshold': threshold,
                'sensitivity': sensitivity,
                'fpr': fpr,
                'ppv': ppv,
                'f_beta': f_beta,
                'tp': tp,
                'fp': fp,
                'tn': tn,
                'fn': fn,
                'total_samples': total_calls,
                'passing_calls': passing_calls,
                'passing_proportion': safe_ratio(passing_calls, total_calls),
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=columns)



In [8]:
# ## New Functions for Finding Optimal Thresholds and Calculating Summary Metrics

def find_optimal_thresholds(metrics_df):
    """
    Find the threshold that maximizes F-beta score (beta=0.5) for each gene.
    Threshold 0 is excluded from consideration as an optimal threshold.

    When several thresholds tie on the maximum F-beta score, the highest
    threshold wins. As a consequence, a gene whose F-beta is 0 at every
    threshold is assigned the largest threshold evaluated.

    Args:
        metrics_df (pd.DataFrame): Per-gene, per-threshold metrics with the columns
            gene, threshold, f_beta, ppv, sensitivity, fpr and passing_proportion.

    Returns:
        pd.DataFrame: One row per gene with the columns gene, optimal_threshold,
        f_beta, ppv, sensitivity, fpr and passing_proportion. The columns are
        present even when no gene qualifies (or ``metrics_df`` is empty).
    """
    columns = [
        'gene', 'optimal_threshold', 'f_beta', 'ppv',
        'sensitivity', 'fpr', 'passing_proportion',
    ]

    # An empty frame may not even carry the expected columns; nothing to select from.
    if metrics_df.empty:
        return pd.DataFrame(columns=columns)

    optimal_thresholds = []

    # Iterate genes in order of first appearance.
    for gene in metrics_df['gene'].unique():
        gene_df = metrics_df[metrics_df['gene'] == gene]

        # Threshold 0 keeps every call, so it is never a candidate.
        candidates = gene_df[gene_df['threshold'] > 0]
        if candidates.empty:
            continue

        max_f_beta = candidates['f_beta'].max()
        tied = candidates[candidates['f_beta'] == max_f_beta]
        # All-NaN F-beta values compare unequal to the (NaN) maximum; skip such genes.
        if tied.empty:
            continue

        # Break ties in favor of the strictest (highest) threshold.
        best_row = tied.sort_values('threshold', ascending=False).iloc[0]

        optimal_thresholds.append({
            'gene': gene,
            'optimal_threshold': best_row['threshold'],
            'f_beta': best_row['f_beta'],
            'ppv': best_row['ppv'],
            'sensitivity': best_row['sensitivity'],
            'fpr': best_row['fpr'],
            'passing_proportion': best_row['passing_proportion'],
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(optimal_thresholds, columns=columns)

def calculate_gene_type_metrics(optimal_thresholds_df):
    """
    Aggregate the per-gene optimal-threshold metrics of one gene type.

    Args:
        optimal_thresholds_df (pd.DataFrame): One row per gene with the columns
            ppv, sensitivity, f_beta, passing_proportion and optimal_threshold.

    Returns:
        dict: ``num_genes`` (row count), the mean and median of ppv, sensitivity,
        f_beta and passing_proportion (keys ``mean_<name>`` / ``median_<name>``),
        and ``threshold_distribution`` mapping each optimal threshold to the
        number of genes that selected it.
    """
    # (source column, key for its mean, key for its median)
    aggregated_columns = (
        ('ppv', 'mean_ppv', 'median_ppv'),
        ('sensitivity', 'mean_sensitivity', 'median_sensitivity'),
        ('f_beta', 'mean_f_beta', 'median_f_beta'),
        ('passing_proportion', 'mean_passing_proportion', 'median_passing_proportion'),
    )

    summary = {'num_genes': len(optimal_thresholds_df)}
    for column, mean_key, median_key in aggregated_columns:
        values = optimal_thresholds_df[column]
        summary[mean_key] = values.mean()
        summary[median_key] = values.median()

    threshold_counts = optimal_thresholds_df['optimal_threshold'].value_counts()
    summary['threshold_distribution'] = threshold_counts.to_dict()

    return summary

def create_summary_table(gene_types_metrics):
    """
    Build the cross-gene-type summary table.

    Args:
        gene_types_metrics (dict): Maps each gene type to its aggregated metrics
            dictionary (keys such as 'num_genes', 'mean_ppv', 'median_ppv', ...).

    Returns:
        pd.DataFrame: One row per gene type, in the iteration order of the input.
        All mean/median columns are formatted as strings with four decimals;
        'Number of Genes' keeps its original value.
    """
    # Display label -> key in the per-gene-type metrics dictionary.
    # These are exactly the columns rendered as fixed-precision strings.
    formatted_columns = {
        'Mean PPV': 'mean_ppv',
        'Median PPV': 'median_ppv',
        'Mean Sensitivity': 'mean_sensitivity',
        'Median Sensitivity': 'median_sensitivity',
        'Mean F-beta (beta=0.5)': 'mean_f_beta',
        'Median F-beta (beta=0.5)': 'median_f_beta',
        'Mean Passing Proportion': 'mean_passing_proportion',
        'Median Passing Proportion': 'median_passing_proportion',
    }

    rows = []
    for gene_type, metrics in gene_types_metrics.items():
        row = {
            'Gene Type': gene_type,
            'Number of Genes': metrics['num_genes'],
        }
        for label, metric_key in formatted_columns.items():
            row[label] = metrics[metric_key]
        rows.append(row)

    table = pd.DataFrame(rows)

    # Render every mean/median column with four decimals.
    for label in formatted_columns:
        table[label] = table[label].map('{:.4f}'.format)

    return table



In [9]:
# ## Removed per-gene plotting functionality as it's not needed
# ## We'll focus only on the summary table generation



In [10]:
# ## Main Processing Function

def _label_calls_against_truth(calls_df, truth_df):
    """Return a copy of calls_df with a 'status' column: 'TP' if its (sample_id, allele_id) pair is in truth_df, else 'FP'."""
    # Tuple keys avoid the ambiguity of concatenating sample and allele IDs into one string.
    truth_pairs = set(zip(truth_df['sample_id'], truth_df['allele_id']))
    labelled = calls_df.copy()
    labelled['status'] = [
        'TP' if pair in truth_pairs else 'FP'
        for pair in zip(labelled['sample_id'], labelled['allele_id'])
    ]
    return labelled


def _display_gene_type_metrics(gene_type_metrics):
    """Render the aggregated metrics dictionary of one gene type as Markdown bullets."""
    display(Markdown("## Gene Type Metrics Summary"))
    display(Markdown(f"* Number of genes analyzed: {gene_type_metrics['num_genes']}"))

    labelled_keys = [
        ("Mean PPV", 'mean_ppv'),
        ("Median PPV", 'median_ppv'),
        ("Mean Sensitivity", 'mean_sensitivity'),
        ("Median Sensitivity", 'median_sensitivity'),
        ("Mean F-beta (beta=0.5)", 'mean_f_beta'),
        ("Median F-beta (beta=0.5)", 'median_f_beta'),
        ("Mean passing proportion", 'mean_passing_proportion'),
        ("Median passing proportion", 'median_passing_proportion'),
    ]
    for label, key in labelled_keys:
        display(Markdown(f"* {label}: {gene_type_metrics[key]:.4f}"))

    display(Markdown("* Optimal threshold distribution:"))
    display(pd.Series(gene_type_metrics['threshold_distribution']).sort_index())


def process_gene_type(gene_type, collect_metrics=True):
    """
    Run the full analysis for one gene type and report progress as Markdown.

    Steps: load ``all_samples_<gene_type>_prefix_consistency.csv`` from
    ``PREFIX_CONSISTENCY_BASE_DIR/<gene_type lower-cased>/``, keep rows where both
    ``is_functional`` and ``in_optimal`` are true, load the ground truth, restrict
    both to their common samples, label every remaining call as TP or FP, compute
    per-gene metrics per threshold, pick each gene's optimal threshold, and
    aggregate across genes.

    NOTE(review): only alleles that were actually called are labelled. Ground truth
    alleles that never appear in the prefix consistency file are not counted as
    false negatives, so sensitivity here is relative to the called true positives.

    Any exception raised while processing is caught, displayed, and turned into a
    ``None`` result so that the remaining gene types can still be processed.

    Args:
        gene_type (str): Gene type to process (e.g., "IGHV")
        collect_metrics (bool): Whether to compute, display and return the
            aggregated metrics for the summary table.

    Returns:
        dict: Aggregated metrics from ``calculate_gene_type_metrics`` when
        ``collect_metrics`` is True and every step succeeded; otherwise None.
    """
    display(Markdown(f"## Processing Gene Type: {gene_type}"))

    # Define paths
    results_dir = os.path.join(PREFIX_CONSISTENCY_BASE_DIR, f"{gene_type.lower()}/")
    all_samples_file = os.path.join(results_dir, f"all_samples_{gene_type}_prefix_consistency.csv")

    if not os.path.exists(all_samples_file):
        display(Markdown(f"⚠️ **Warning**: Prefix consistency file not found for {gene_type}. Please check the path."))
        display(Markdown(f"  *Expected path: `{all_samples_file}`*"))
        return None

    try:
        # Load prefix consistency data
        prefix_df = pd.read_csv(all_samples_file)
        display(Markdown(f"✅ Loaded prefix consistency data for **{gene_type}**"))
        display(Markdown(f"* Total rows: {len(prefix_df)}"))
        display(Markdown(f"* Number of samples: {prefix_df['sample_id'].nunique()}"))
        display(Markdown(f"* Number of unique alleles: {prefix_df['allele_id'].nunique()}"))

        # Keep only calls that are functional AND flagged in_optimal.
        prefix_df = prefix_df[prefix_df['is_functional']].copy()
        prefix_df = prefix_df[prefix_df['in_optimal']].copy()
        display(Markdown(f"* Filtered to {len(prefix_df)} functional alleles with `in_optimal` set"))

        # Load ground truth data
        truth_df = load_ground_truth_data(gene_type, GROUND_TRUTH_BASE_DIR)
        if truth_df is None or truth_df.empty:
            display(Markdown("❌ Failed to load ground truth data or no data found."))
            return None
        display(Markdown(f"✅ Loaded ground truth data with {len(truth_df)} entries"))

        # Only samples present on both sides can be compared.
        common_samples = set(prefix_df['sample_id'].unique()) & set(truth_df['sample_id'].unique())
        display(Markdown(f"* Found {len(common_samples)} common samples for comparison"))

        calls_common = prefix_df[prefix_df['sample_id'].isin(common_samples)]
        truth_common = truth_df[truth_df['sample_id'].isin(common_samples)]
        labelled_calls = _label_calls_against_truth(calls_common, truth_common)

        if labelled_calls.empty:
            display(Markdown("❌ No TP or FP calls identified after comparison. Cannot proceed with analysis."))
            return None

        display(Markdown(f"* Identified {(labelled_calls['status'] == 'TP').sum()} True Positives"))
        display(Markdown(f"* Identified {(labelled_calls['status'] == 'FP').sum()} False Positives"))

        # Calculate gene-level metrics
        display(Markdown("## Calculating Gene-Level Metrics"))
        gene_metrics = calculate_gene_level_metrics(labelled_calls)
        metrics_df = calculate_performance_metrics(gene_metrics)

        # Every gene can be skipped for having too few calls; report that explicitly
        # instead of failing later on a missing column.
        if metrics_df.empty:
            display(Markdown("❌ No gene met the minimum call count used by `calculate_gene_level_metrics`. Cannot proceed with analysis."))
            return None

        display(Markdown(
            f"* Calculated metrics for {metrics_df['gene'].nunique()} genes "
            f"across {metrics_df['threshold'].nunique()} thresholds"
        ))

        # Display sample of metrics for inspection
        display(Markdown("### Sample of Calculated Metrics"))
        display(metrics_df.head(10))

        # Find optimal threshold for each gene based on F-beta score (beta=0.5)
        display(Markdown("## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)"))
        optimal_thresholds = find_optimal_thresholds(metrics_df)
        display(Markdown(f"* Identified optimal thresholds for {len(optimal_thresholds)} genes"))
        if optimal_thresholds.empty:
            display(Markdown("❌ No optimal threshold could be determined for any gene. Cannot proceed with analysis."))
            return None
        display(optimal_thresholds.head())

        if not collect_metrics:
            return None

        gene_type_metrics = calculate_gene_type_metrics(optimal_thresholds)
        _display_gene_type_metrics(gene_type_metrics)
        return gene_type_metrics

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller continue with
        # the next gene type. The exception type is shown because e.g. a bare KeyError
        # message is only the missing key.
        display(Markdown(f"❌ **Error**: An error occurred during processing: {type(e).__name__}: {str(e)}"))
        return None



In [11]:
# ## Main Processing Loop
# Process all gene types and generate summary table

# Initialize dictionary to store metrics for all gene types
# Maps gene type -> aggregated metrics dict returned by process_gene_type.
gene_types_metrics = {}

# Process each gene type
# process_gene_type displays its own progress and returns None when a gene type cannot be
# processed (missing file, no ground truth, error), so such gene types are left out of the summary.
for gene_type in GENE_TYPES_TO_PROCESS:
    gene_type_metrics = process_gene_type(gene_type)
    if gene_type_metrics:
        gene_types_metrics[gene_type] = gene_type_metrics

# Create and display summary table
# Skipped entirely when no gene type produced metrics.
if gene_types_metrics:
    display(Markdown("# Summary Table of Gene Type Metrics"))
    summary_table = create_summary_table(gene_types_metrics)
    display(summary_table)
    
    # Generate LaTeX code for the table
    # The metric columns are already formatted as 4-decimal strings by create_summary_table,
    # so to_latex emits them verbatim; index=False drops the row index column.
    latex_table = summary_table.to_latex(index=False)
    display(Markdown("## LaTeX code for the table:"))
    display(Markdown(f"```latex\n{latex_table}\n```"))
    


## Processing Gene Type: IGHV

✅ Loaded prefix consistency data for **IGHV**

* Total rows: 12693

* Number of samples: 39

* Number of unique alleles: 643

* Filtered to 2356 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 94 samples for IGHV.


✅ Loaded ground truth data with 4009 entries

* Found 39 common samples for comparison

* Identified 1964 True Positives

* Identified 392 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 52 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,IGHV3-64D,0,1.0,1.0,0.8462,0.873,11,2,0,0,13,13,1.0
1,IGHV3-64D,1,0.2727,0.0,1.0,0.6522,3,0,2,8,13,3,0.2308
2,IGHV3-64D,2,0.2727,0.0,1.0,0.6522,3,0,2,8,13,3,0.2308
3,IGHV3-64D,3,0.0909,0.0,1.0,0.3333,1,0,2,10,13,1,0.0769
4,IGHV3-64D,4,0.0,0.0,0.0,0.0,0,0,2,11,13,0,0.0
5,IGHV3-64D,5,0.0,0.0,0.0,0.0,0,0,2,11,13,0,0.0
6,IGHV4-38-2,0,1.0,1.0,0.8857,0.9064,31,4,0,0,35,35,1.0
7,IGHV4-38-2,1,0.3548,0.0,1.0,0.7333,11,0,4,20,35,11,0.3143
8,IGHV4-38-2,2,0.2581,0.0,1.0,0.6349,8,0,4,23,35,8,0.2286
9,IGHV4-38-2,3,0.2258,0.0,1.0,0.5932,7,0,4,24,35,7,0.2


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 52 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,IGHV3-64D,2,0.6522,1.0,0.2727,0.0,0.2308
1,IGHV4-38-2,1,0.7333,1.0,0.3548,0.0,0.3143
2,IGHV3-13,1,0.5455,0.6667,0.3158,0.375,0.3333
3,IGHV3-21,1,0.5333,0.8889,0.2051,0.25,0.2093
4,IGHV1-2,1,0.625,0.9,0.2812,0.2,0.2703


## Gene Type Metrics Summary

* Number of genes analyzed: 52

* Mean PPV: 0.8771

* Median PPV: 0.9310

* Mean Sensitivity: 0.3637

* Median Sensitivity: 0.3588

* Mean F-beta (beta=0.5): 0.6677

* Median F-beta (beta=0.5): 0.6948

* Mean passing proportion: 0.3390

* Median passing proportion: 0.3359

* Optimal threshold distribution:

1    47
2     3
3     1
5     1
dtype: int64

## Processing Gene Type: IGLV

✅ Loaded prefix consistency data for **IGLV**

* Total rows: 5135

* Number of samples: 40

* Number of unique alleles: 173

* Filtered to 1540 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 94 samples for IGLV.


✅ Loaded ground truth data with 3047 entries

* Found 40 common samples for comparison

* Identified 1470 True Positives

* Identified 70 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 33 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,IGLV3-12,0,1.0,1.0,0.7586,0.7971,22,7,0,0,29,29,1.0
1,IGLV3-12,1,1.0,0.8571,0.7857,0.8209,22,6,1,0,29,28,0.9655
2,IGLV3-12,2,0.8636,0.8571,0.76,0.7787,19,6,1,3,29,25,0.8621
3,IGLV3-12,3,0.7273,0.7143,0.7619,0.7547,16,5,2,6,29,21,0.7241
4,IGLV3-12,4,0.6364,0.5714,0.7778,0.7447,14,4,3,8,29,18,0.6207
5,IGLV3-12,5,0.0,0.0,0.0,0.0,0,0,7,22,29,0,0.0
6,IGLV4-69,0,1.0,1.0,0.9756,0.9804,40,1,0,0,41,41,1.0
7,IGLV4-69,1,1.0,0.0,1.0,1.0,40,0,1,0,41,40,0.9756
8,IGLV4-69,2,0.625,0.0,1.0,0.8929,25,0,1,15,41,25,0.6098
9,IGLV4-69,3,0.35,0.0,1.0,0.7292,14,0,1,26,41,14,0.3415


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 33 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,IGLV3-12,1,0.8209,0.7857,1.0,0.8571,0.9655
1,IGLV4-69,1,1.0,1.0,1.0,0.0,0.9756
2,IGLV7-43,1,1.0,1.0,1.0,0.0,1.0
3,IGLV3-9,1,1.0,1.0,1.0,0.0,1.0
4,IGLV10-54,1,0.9697,0.9846,0.9143,0.3333,0.8904


## Gene Type Metrics Summary

* Number of genes analyzed: 33

* Mean PPV: 0.9633

* Median PPV: 1.0000

* Mean Sensitivity: 0.9701

* Median Sensitivity: 0.9778

* Mean F-beta (beta=0.5): 0.9635

* Median F-beta (beta=0.5): 0.9877

* Mean passing proportion: 0.9621

* Median passing proportion: 0.9756

* Optimal threshold distribution:

1    31
2     2
dtype: int64

## Processing Gene Type: IGKV

✅ Loaded prefix consistency data for **IGKV**

* Total rows: 5009

* Number of samples: 40

* Number of unique alleles: 154

* Filtered to 1621 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 94 samples for IGKV.


✅ Loaded ground truth data with 3043 entries

* Found 40 common samples for comparison

* Identified 1305 True Positives

* Identified 316 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 37 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,IGKV6D-21,0,1.0,0.0,1.0,1.0,40,0,0,0,40,40,1.0
1,IGKV6D-21,1,0.8,0.0,1.0,0.9524,32,0,0,8,40,32,0.8
2,IGKV6D-21,2,0.4,0.0,1.0,0.7692,16,0,0,24,40,16,0.4
3,IGKV6D-21,3,0.225,0.0,1.0,0.5921,9,0,0,31,40,9,0.225
4,IGKV6D-21,4,0.15,0.0,1.0,0.4688,6,0,0,34,40,6,0.15
5,IGKV6D-21,5,0.0,0.0,0.0,0.0,0,0,0,40,40,0,0.0
6,IGKV1-27,0,1.0,1.0,0.6133,0.6647,46,29,0,0,75,75,1.0
7,IGKV1-27,1,0.7609,0.6552,0.6481,0.6679,35,19,10,11,75,54,0.72
8,IGKV1-27,2,0.3696,0.4483,0.5667,0.512,17,13,16,29,75,30,0.4
9,IGKV1-27,3,0.2609,0.3103,0.5714,0.4615,12,9,20,34,75,21,0.28


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 37 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,IGKV6D-21,1,0.9524,1.0,0.8,0.0,0.8
1,IGKV1-27,1,0.6679,0.6481,0.7609,0.6552,0.72
2,IGKV1D-13,5,0.0,0.0,0.0,0.0,0.0
3,IGKV3D-15,1,0.8768,0.9024,0.7872,0.8,0.7885
4,IGKV3D-20,1,0.9593,1.0,0.825,0.0,0.825


## Gene Type Metrics Summary

* Number of genes analyzed: 37

* Mean PPV: 0.8203

* Median PPV: 0.8966

* Mean Sensitivity: 0.7196

* Median Sensitivity: 0.8043

* Mean F-beta (beta=0.5): 0.7943

* Median F-beta (beta=0.5): 0.8599

* Mean passing proportion: 0.6982

* Median passing proportion: 0.7692

* Optimal threshold distribution:

1    32
2     2
4     1
5     2
dtype: int64

## Processing Gene Type: TRAV

✅ Loaded prefix consistency data for **TRAV**

* Total rows: 3655

* Number of samples: 40

* Number of unique alleles: 132

* Filtered to 2268 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 94 samples for TRAV.


✅ Loaded ground truth data with 4007 entries

* Found 40 common samples for comparison

* Identified 1783 True Positives

* Identified 485 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 45 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,TRAV8-2,0,1.0,0.0,1.0,1.0,52,0,0,0,52,52,1.0
1,TRAV8-2,1,0.9615,0.0,1.0,0.9921,50,0,0,2,52,50,0.9615
2,TRAV8-2,2,0.5962,0.0,1.0,0.8807,31,0,0,21,52,31,0.5962
3,TRAV8-2,3,0.4038,0.0,1.0,0.7721,21,0,0,31,52,21,0.4038
4,TRAV8-2,4,0.2308,0.0,1.0,0.6,12,0,0,40,52,12,0.2308
5,TRAV8-2,5,0.0,0.0,0.0,0.0,0,0,0,52,52,0,0.0
6,TRAV4,0,0.0,1.0,0.0,0.0,0,40,0,0,40,40,1.0
7,TRAV4,1,0.0,1.0,0.0,0.0,0,40,0,0,40,40,1.0
8,TRAV4,2,0.0,0.925,0.0,0.0,0,37,3,0,40,37,0.925
9,TRAV4,3,0.0,0.9,0.0,0.0,0,36,4,0,40,36,0.9


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 45 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,TRAV8-2,1,0.9921,1.0,0.9615,0.0,0.9615
1,TRAV4,5,0.0,0.0,0.0,0.0,0.0
2,TRAV8-4,1,0.5837,0.5455,0.8108,0.8065,0.8088
3,TRAV12-2,1,0.4307,0.3833,0.8519,0.8043,0.8219
4,TRAV21,1,0.7054,0.6667,0.9189,0.85,0.8947


## Gene Type Metrics Summary

* Number of genes analyzed: 45

* Mean PPV: 0.8263

* Median PPV: 1.0000

* Mean Sensitivity: 0.8770

* Median Sensitivity: 0.9762

* Mean F-beta (beta=0.5): 0.8320

* Median F-beta (beta=0.5): 0.9921

* Mean passing proportion: 0.8786

* Median passing proportion: 0.9615

* Optimal threshold distribution:

1    40
3     1
4     1
5     3
dtype: int64

## Processing Gene Type: TRGV

✅ Loaded prefix consistency data for **TRGV**

* Total rows: 866

* Number of samples: 40

* Number of unique alleles: 24

* Filtered to 266 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 94 samples for TRGV.


✅ Loaded ground truth data with 596 entries

* Found 40 common samples for comparison

* Identified 245 True Positives

* Identified 21 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 6 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,TRGV2,0,1.0,1.0,0.7167,0.7597,43,17,0,0,60,60,1.0
1,TRGV2,1,0.9302,1.0,0.7018,0.738,40,17,0,3,60,57,0.95
2,TRGV2,2,0.5349,0.4706,0.7419,0.6886,23,8,9,20,60,31,0.5167
3,TRGV2,3,0.2558,0.2353,0.7333,0.534,11,4,13,32,60,15,0.25
4,TRGV2,4,0.093,0.1176,0.6667,0.2985,4,2,15,39,60,6,0.1
5,TRGV2,5,0.0,0.0,0.0,0.0,0,0,17,43,60,0,0.0
6,TRGV9,0,1.0,1.0,0.9524,0.9615,40,2,0,0,42,42,1.0
7,TRGV9,1,1.0,1.0,0.9524,0.9615,40,2,0,0,42,42,1.0
8,TRGV9,2,0.9,1.0,0.9474,0.9375,36,2,0,4,42,38,0.9048
9,TRGV9,3,0.625,1.0,0.9259,0.8446,25,2,0,15,42,27,0.6429


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 6 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,TRGV2,1,0.738,0.7018,0.9302,1.0,0.95
1,TRGV9,1,0.9615,0.9524,1.0,1.0,1.0
2,TRGV3,1,0.9606,0.9512,1.0,1.0,1.0
3,TRGV4,1,0.9955,1.0,0.9778,0.0,0.9778
4,TRGV8,1,1.0,1.0,1.0,0.0,1.0


## Gene Type Metrics Summary

* Number of genes analyzed: 6

* Mean PPV: 0.9342

* Median PPV: 0.9762

* Mean Sensitivity: 0.9847

* Median Sensitivity: 1.0000

* Mean F-beta (beta=0.5): 0.9426

* Median F-beta (beta=0.5): 0.9785

* Mean passing proportion: 0.9880

* Median passing proportion: 1.0000

* Optimal threshold distribution:

1    6
dtype: int64

## Processing Gene Type: TRDV

✅ Loaded prefix consistency data for **TRDV**

* Total rows: 191

* Number of samples: 40

* Number of unique alleles: 6

* Filtered to 145 functional alleles

Found 94 potential sample directories in ground truth data.
Successfully processed ground truth for 40 samples for TRDV.


✅ Loaded ground truth data with 40 entries

* Found 30 common samples for comparison

* Identified 31 True Positives

* Identified 77 False Positives

## Calculating Gene-Level Metrics

* Calculated metrics for 3 genes across 6 thresholds

### Sample of Calculated Metrics

Unnamed: 0,gene,threshold,sensitivity,fpr,ppv,f_beta,tp,fp,tn,fn,total_samples,passing_calls,passing_proportion
0,TRDV1,0,0.0,1.0,0.0,0.0,0,30,0,0,30,30,1.0
1,TRDV1,1,0.0,1.0,0.0,0.0,0,30,0,0,30,30,1.0
2,TRDV1,2,0.0,0.9333,0.0,0.0,0,28,2,0,30,28,0.9333
3,TRDV1,3,0.0,0.9333,0.0,0.0,0,28,2,0,30,28,0.9333
4,TRDV1,4,0.0,0.9333,0.0,0.0,0,28,2,0,30,28,0.9333
5,TRDV1,5,0.0,0.0,0.0,0.0,0,0,30,0,30,0,0.0
6,TRDV3,0,0.0,1.0,0.0,0.0,0,35,0,0,35,35,1.0
7,TRDV3,1,0.0,1.0,0.0,0.0,0,35,0,0,35,35,1.0
8,TRDV3,2,0.0,0.8,0.0,0.0,0,28,7,0,35,28,0.8
9,TRDV3,3,0.0,0.6571,0.0,0.0,0,23,12,0,35,23,0.6571


## Finding Optimal Thresholds Based on F-beta Score (beta=0.5)

* Identified optimal thresholds for 3 genes

Unnamed: 0,gene,optimal_threshold,f_beta,ppv,sensitivity,fpr,passing_proportion
0,TRDV1,5,0.0,0.0,0.0,0.0,0.0
1,TRDV3,5,0.0,0.0,0.0,0.0,0.0
2,TRDV2,1,0.7635,0.7209,1.0,1.0,1.0


## Gene Type Metrics Summary

* Number of genes analyzed: 3

* Mean PPV: 0.2403

* Median PPV: 0.0000

* Mean Sensitivity: 0.3333

* Median Sensitivity: 0.0000

* Mean F-beta (beta=0.5): 0.2545

* Median F-beta (beta=0.5): 0.0000

* Mean passing proportion: 0.3333

* Median passing proportion: 0.0000

* Optimal threshold distribution:

1    1
5    2
dtype: int64

# Summary Table of Gene Type Metrics

Unnamed: 0,Gene Type,Number of Genes,Mean PPV,Median PPV,Mean Sensitivity,Median Sensitivity,Mean F-beta (beta=0.5),Median F-beta (beta=0.5),Mean Passing Proportion,Median Passing Proportion
0,IGHV,52,0.8771,0.931,0.3637,0.3588,0.6677,0.6948,0.339,0.3359
1,IGLV,33,0.9633,1.0,0.9701,0.9778,0.9635,0.9877,0.9621,0.9756
2,IGKV,37,0.8203,0.8966,0.7196,0.8043,0.7943,0.8599,0.6982,0.7692
3,TRAV,45,0.8263,1.0,0.877,0.9762,0.832,0.9921,0.8786,0.9615
4,TRGV,6,0.9342,0.9762,0.9847,1.0,0.9426,0.9785,0.988,1.0
5,TRDV,3,0.2403,0.0,0.3333,0.0,0.2545,0.0,0.3333,0.0


## LaTeX code for the table:

```latex
\begin{tabular}{lrllllllll}
\toprule
Gene Type & Number of Genes & Mean PPV & Median PPV & Mean Sensitivity & Median Sensitivity & Mean F-beta (beta=0.5) & Median F-beta (beta=0.5) & Mean Passing Proportion & Median Passing Proportion \\
\midrule
IGHV & 52 & 0.8771 & 0.9310 & 0.3637 & 0.3588 & 0.6677 & 0.6948 & 0.3390 & 0.3359 \\
IGLV & 33 & 0.9633 & 1.0000 & 0.9701 & 0.9778 & 0.9635 & 0.9877 & 0.9621 & 0.9756 \\
IGKV & 37 & 0.8203 & 0.8966 & 0.7196 & 0.8043 & 0.7943 & 0.8599 & 0.6982 & 0.7692 \\
TRAV & 45 & 0.8263 & 1.0000 & 0.8770 & 0.9762 & 0.8320 & 0.9921 & 0.8786 & 0.9615 \\
TRGV & 6 & 0.9342 & 0.9762 & 0.9847 & 1.0000 & 0.9426 & 0.9785 & 0.9880 & 1.0000 \\
TRDV & 3 & 0.2403 & 0.0000 & 0.3333 & 0.0000 & 0.2545 & 0.0000 & 0.3333 & 0.0000 \\
\bottomrule
\end{tabular}

```

In [12]:
# ## Simplified Summary Table
# Condensed view of the per-gene-type summary: only PPV and passing proportion
# (mean and median) are kept, then shown both as a DataFrame and as LaTeX.

if gene_types_metrics:
    display(Markdown("# Simplified Summary Table"))

    # Displayed column header -> key read from each gene type's metrics dict.
    # Insertion order here is the column order of the resulting table.
    metric_columns = {
        'Mean PPV': 'mean_ppv',
        'Median PPV': 'median_ppv',
        'Mean Passing Proportion': 'mean_passing_proportion',
        'Median Passing Proportion': 'median_passing_proportion',
    }

    # One row per gene type, following the iteration order of gene_types_metrics
    simplified_data = []
    for gene_type, metrics in gene_types_metrics.items():
        row = {'Gene Type': gene_type}
        row.update((label, metrics[key]) for label, key in metric_columns.items())
        simplified_data.append(row)

    simplified_df = pd.DataFrame(simplified_data)

    # Convert every metric column to a fixed 4-decimal string so the notebook
    # display and the LaTeX export show identical text
    for col in metric_columns:
        simplified_df[col] = simplified_df[col].map(lambda value: '{:.4f}'.format(value))

    display(simplified_df)

    # Emit the same table as LaTeX (row index omitted), wrapped in a fenced block
    latex_simplified = simplified_df.to_latex(index=False)
    display(Markdown("## LaTeX code for simplified table:"))
    display(Markdown("```latex\n" + latex_simplified + "\n```"))

# Simplified Summary Table

Unnamed: 0,Gene Type,Mean PPV,Median PPV,Mean Passing Proportion,Median Passing Proportion
0,IGHV,0.8771,0.931,0.339,0.3359
1,IGLV,0.9633,1.0,0.9621,0.9756
2,IGKV,0.8203,0.8966,0.6982,0.7692
3,TRAV,0.8263,1.0,0.8786,0.9615
4,TRGV,0.9342,0.9762,0.988,1.0
5,TRDV,0.2403,0.0,0.3333,0.0


## LaTeX code for simplified table:

```latex
\begin{tabular}{lllll}
\toprule
Gene Type & Mean PPV & Median PPV & Mean Passing Proportion & Median Passing Proportion \\
\midrule
IGHV & 0.8771 & 0.9310 & 0.3390 & 0.3359 \\
IGLV & 0.9633 & 1.0000 & 0.9621 & 0.9756 \\
IGKV & 0.8203 & 0.8966 & 0.6982 & 0.7692 \\
TRAV & 0.8263 & 1.0000 & 0.8786 & 0.9615 \\
TRGV & 0.9342 & 0.9762 & 0.9880 & 1.0000 \\
TRDV & 0.2403 & 0.0000 & 0.3333 & 0.0000 \\
\bottomrule
\end{tabular}

```