In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob
import pandas as pd

In [3]:
from benchmark_utils import *

# No CNV

In [4]:
call_base_dir="../immunotyper-output/20x"

In [7]:
gene_types = ['IGHV', 'IGLV', 'IGKV', 'TRAV', 'TRBV', 'TRGV', 'TRDV']

first = True

for gene_type in gene_types:
    functional_ground_truths = load_assembly_annotations(gene_type, path="../../digger-functional-annotations-all/")
    print(f"Gene type: {gene_type}")
    print(f"Number of samples: {len(functional_ground_truths)}")
    print(f"Number of unique functional alleles: {len(set([allele for alleles in functional_ground_truths.values() for allele in alleles]))}")
    print()

    # Create empty lists to store results
    results = []

    for sample, functional_ground_truth_set in functional_ground_truths.items():

        try:
            genotypes = load_genotype_calls(sample, gene_type, call_base_dir=call_base_dir)
        except FileNotFoundError:
            # log.warning(f"Genotype calls not found for {sample}")
            continue

        
        tp, fp, fn = calculate_accuracy_metrics(list(set(genotypes)), list(set(functional_ground_truth_set)))
        precision, recall = calculate_precision_recall(tp, fp, fn)
        
        # Store results for this sample
        results.append({
            'sample': sample,
            'precision': precision,
            'recall': recall,
            'true_positives': tp,
            'false_positives': fp,
            'false_negatives': fn,
            'genotypes': list(set(genotypes)),
            'ground_truth': list(set(functional_ground_truth_set)),
            'gene_type': gene_type
        })

    gene_results_df = pd.DataFrame(results)
    if first:
        results_df = gene_results_df
        first = False
    else:    
        results_df = pd.concat([results_df, gene_results_df], ignore_index=True)


ERROR:benchmark_utils:No genotype calls file found for HG002. Pattern: ../immunotyper-output/20x/ighv/HG002.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG005. Pattern: ../immunotyper-output/20x/ighv/HG005.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG01123. Pattern: ../immunotyper-output/20x/ighv/HG01123.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/ighv/HG02109.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02486. Pattern: ../immunotyper-output/20x/ighv/HG02486.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02559. Pattern: ../immunotyper-output/20x/ighv/HG02559.final*IGHV_functional_allele_calls.txt


Loading annotations for IGHV from ../../digger-functional-annotations-all/
Gene type: IGHV
Number of samples: 47
Number of unique functional alleles: 170



ERROR:benchmark_utils:No genotype calls file found for NA21309. Pattern: ../immunotyper-output/20x/ighv/NA21309.final*IGHV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG002. Pattern: ../immunotyper-output/20x/iglv/HG002.final*IGLV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG005. Pattern: ../immunotyper-output/20x/iglv/HG005.final*IGLV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG01123. Pattern: ../immunotyper-output/20x/iglv/HG01123.final*IGLV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/iglv/HG02109.final*IGLV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02486. Pattern: ../immunotyper-output/20x/iglv/HG02486.final*IGLV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02559. Pattern: ../immunotyper-output/20x/igl

Loading annotations for IGLV from ../../digger-functional-annotations-all/
Gene type: IGLV
Number of samples: 47
Number of unique functional alleles: 69

Loading annotations for IGKV from ../../digger-functional-annotations-all/


ERROR:benchmark_utils:No genotype calls file found for HG002. Pattern: ../immunotyper-output/20x/igkv/HG002.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG005. Pattern: ../immunotyper-output/20x/igkv/HG005.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG01123. Pattern: ../immunotyper-output/20x/igkv/HG01123.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/igkv/HG02109.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02486. Pattern: ../immunotyper-output/20x/igkv/HG02486.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02559. Pattern: ../immunotyper-output/20x/igkv/HG02559.final*IGKV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for NA21309. Pattern: ../immunotyper-output/20x/igk

Gene type: IGKV
Number of samples: 47
Number of unique functional alleles: 80

Loading annotations for TRAV from ../../digger-functional-annotations-all/
Gene type: TRAV
Number of samples: 47
Number of unique functional alleles: 59



ERROR:benchmark_utils:No genotype calls file found for HG002. Pattern: ../immunotyper-output/20x/trav/HG002.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG005. Pattern: ../immunotyper-output/20x/trav/HG005.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG01123. Pattern: ../immunotyper-output/20x/trav/HG01123.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/trav/HG02109.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02486. Pattern: ../immunotyper-output/20x/trav/HG02486.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02559. Pattern: ../immunotyper-output/20x/trav/HG02559.final*TRAV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for NA21309. Pattern: ../immunotyper-output/20x/tra

Loading annotations for TRBV from ../../digger-functional-annotations-all/
Gene type: TRBV
Number of samples: 47
Number of unique functional alleles: 61

Loading annotations for TRGV from ../../digger-functional-annotations-all/


ERROR:benchmark_utils:No genotype calls file found for HG002. Pattern: ../immunotyper-output/20x/trgv/HG002.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG005. Pattern: ../immunotyper-output/20x/trgv/HG005.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG01123. Pattern: ../immunotyper-output/20x/trgv/HG01123.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/trgv/HG02109.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02486. Pattern: ../immunotyper-output/20x/trgv/HG02486.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for HG02559. Pattern: ../immunotyper-output/20x/trgv/HG02559.final*TRGV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for NA21309. Pattern: ../immunotyper-output/20x/trg

Gene type: TRGV
Number of samples: 47
Number of unique functional alleles: 9

Loading annotations for TRDV from ../../digger-functional-annotations-all/
Gene type: TRDV
Number of samples: 33
Number of unique functional alleles: 2



ERROR:benchmark_utils:No genotype calls file found for HG02109. Pattern: ../immunotyper-output/20x/trdv/HG02109.final*TRDV_functional_allele_calls.txt
ERROR:benchmark_utils:No genotype calls file found for NA21309. Pattern: ../immunotyper-output/20x/trdv/NA21309.final*TRDV_functional_allele_calls.txt


In [8]:
import pandas as pd
import numpy as np

# Define all gene types
gene_types = ['IGHV', 'IGLV', 'IGKV', 'TRAV', 'TRBV', 'TRGV', 'TRDV']

# Calculate statistics for each gene type
stats = results_df.groupby('gene_type').agg({
    'precision': ['mean', 'median'],
    'recall': ['mean', 'median']
}).round(3)

# Prepare data for DataFrame
df_data = []
latex_table = """\\multirow{2}{*}{Gene type}             & Precision       & Recall          & Precision     & Recall       \\\\
                      & (mean)          & (mean)          & (median)      & (median)     \\\\
\\midrule"""

# Process each gene type
for gene_type in gene_types:
    try:
        # Check if gene type exists in the stats dataframe
        if gene_type in stats.index:
            # Get values for LaTeX table
            precision_mean = stats.loc[gene_type, ('precision', 'mean')]
            recall_mean = stats.loc[gene_type, ('recall', 'mean')]
            precision_median = stats.loc[gene_type, ('precision', 'median')]
            recall_median = stats.loc[gene_type, ('recall', 'median')]
            
            # Add to LaTeX table
            row = f"""
\\textit{{{gene_type}}} & {precision_mean} & {recall_mean} & {precision_median} & {recall_median} \\\\"""
            latex_table += row
            
            # Add to DataFrame data
            df_data.append({
                'gene_type': gene_type,
                'precision_mean': precision_mean,
                'recall_mean': recall_mean,
                'precision_median': precision_median,
                'recall_median': recall_median
            })
        else:
            # Gene type not in data, add row with N/A values
            row = f"""
\\textit{{{gene_type}}} & N/A & N/A & N/A & N/A \\\\"""
            latex_table += row
            
            # Add to DataFrame data with NaN values
            df_data.append({
                'gene_type': gene_type,
                'precision_mean': np.nan,
                'recall_mean': np.nan,
                'precision_median': np.nan,
                'recall_median': np.nan
            })
    except Exception as e:
        # Handle any exceptions (missing data, etc.)
        row = f"""
\\textit{{{gene_type}}} & -- & -- & -- & -- \\\\"""
        latex_table += row
        
        # Add to DataFrame data with NaN values
        df_data.append({
            'gene_type': gene_type,
            'precision_mean': np.nan,
            'recall_mean': np.nan,
            'precision_median': np.nan,
            'recall_median': np.nan
        })
        print(f"Warning: Error processing {gene_type}: {e}")

# Create DataFrame from collected data
results_summary_df = pd.DataFrame(df_data)

# Display both the DataFrame and LaTeX table
print("Results Summary DataFrame:")
display(results_summary_df)

print("\nLaTeX Table:")
print(latex_table)

# Optionally, you can save the DataFrame to CSV
# results_summary_df.to_csv('gene_type_statistics.csv', index=False)

Results Summary DataFrame:


Unnamed: 0,gene_type,precision_mean,recall_mean,precision_median,recall_median
0,IGHV,0.822,0.8,0.821,0.803
1,IGLV,0.955,0.899,0.95,0.901
2,IGKV,0.814,0.72,0.825,0.724
3,TRAV,0.781,0.924,0.777,0.924
4,TRBV,0.873,0.929,0.889,0.938
5,TRGV,0.922,0.832,1.0,0.857
6,TRDV,0.289,0.983,0.292,1.0



LaTeX Table:
\multirow{2}{*}{Gene type}             & Precision       & Recall          & Precision     & Recall       \\
                      & (mean)          & (mean)          & (median)      & (median)     \\
\midrule
\textit{IGHV} & 0.822 & 0.8 & 0.821 & 0.803 \\
\textit{IGLV} & 0.955 & 0.899 & 0.95 & 0.901 \\
\textit{IGKV} & 0.814 & 0.72 & 0.825 & 0.724 \\
\textit{TRAV} & 0.781 & 0.924 & 0.777 & 0.924 \\
\textit{TRBV} & 0.873 & 0.929 & 0.889 & 0.938 \\
\textit{TRGV} & 0.922 & 0.832 & 1.0 & 0.857 \\
\textit{TRDV} & 0.289 & 0.983 & 0.292 & 1.0 \\


# CNV Sensitive

In [6]:
gene_type = 'IGLV'

# Create empty lists to store results
results = []

for sample_path in glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*'):
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)
    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        continue
    
    tp, fp, fn = calculate_accuracy_metrics(genotypes, functional_ground_truth_set, cnv=True)
    precision, recall = calculate_precision_recall(tp, fp, fn)
    
    # Store results for this sample
    results.append({
        'sample': sample,
        'precision': precision,
        'recall': recall,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'genotypes': list(set(genotypes)),
        'ground_truth': list(set(functional_ground_truth_set)),
        'gene_type': gene_type
    })

# Create DataFrame from results
results_df = pd.DataFrame(results)

In [7]:
gene_type = 'TRAV'

# Create empty lists to store results
results = []

for sample_path in glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*'):
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)
    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        continue
    
    tp, fp, fn = calculate_accuracy_metrics(genotypes, functional_ground_truth_set, cnv=True)
    precision, recall = calculate_precision_recall(tp, fp, fn)
    
    # Store results for this sample
    results.append({
        'sample': sample,
        'precision': precision,
        'recall': recall,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'genotypes': list(set(genotypes)),
        'ground_truth': list(set(functional_ground_truth_set)),
        'gene_type': gene_type
    })

trav_results_df = pd.DataFrame(results)

# Concatenate the two DataFrames
results_df = pd.concat([results_df, trav_results_df], ignore_index=True)

In [8]:
# Calculate statistics for each gene type
stats = results_df.groupby('gene_type').agg({
    'precision': ['mean', 'median'],
    'recall': ['mean', 'median']
}).round(3)

# Generate LaTeX table
latex_table = f"""\\multirow{{2}}{{*}}{{Gene type}}             & Precision       & Recall          & Precision     & Recall       \\\\
                      & (mean)          & (mean)          & (median)      & (median)     \\\\
\\midrule
\\textit{{IGLV}} & {stats.loc['IGLV', ('precision', 'mean')]}                & {stats.loc['IGLV', ('recall', 'mean')]}                & {stats.loc['IGLV', ('precision', 'median')]}                & {stats.loc['IGLV', ('recall', 'median')]} \\\\
\\textit{{TRAV}} & {stats.loc['TRAV', ('precision', 'mean')]}                & {stats.loc['TRAV', ('recall', 'mean')]}                & {stats.loc['TRAV', ('precision', 'median')]}                & {stats.loc['TRAV', ('recall', 'median')]}\\\\"""

print(latex_table)

KeyError: 'gene_type'