In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
import os
from glob import glob
import pandas as pd

In [3]:
# NOTE(review): wildcard import. The cells below rely on it for
# load_assembly_annotations, load_genotype_calls, calculate_accuracy_metrics and
# calculate_precision_recall (presumably all defined in benchmark_utils - confirm,
# and consider importing these names explicitly).
from benchmark_utils import *

# No CNV

In [109]:
gene_type = 'IGLV'

# One record per sample that has both assembly annotations and genotype calls
results = []

for sample_path in glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*'):
    # The sample name is the last component of the annotation path
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)
    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        # No genotype calls for this sample: leave it out of the benchmark
        continue

    # CNV-insensitive comparison: collapse repeated alleles on both sides before
    # scoring, as the TRAV cell of this section does, so both gene types are
    # evaluated the same way.
    tp, fp, fn = calculate_accuracy_metrics(
        list(set(genotypes)),
        list(set(functional_ground_truth_set)),
    )
    precision, recall = calculate_precision_recall(tp, fp, fn)

    # Store results for this sample
    results.append({
        'sample': sample,
        'precision': precision,
        'recall': recall,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'genotypes': list(set(genotypes)),
        'ground_truth': list(set(functional_ground_truth_set)),
        'gene_type': gene_type
    })

# Create DataFrame from results (one row per benchmarked sample)
results_df = pd.DataFrame(results)

ERROR:benchmark_utils:No genotype calls file found for HG01965. Pattern: data/genotypes/IGLV/HG01965*/HG01965*IGLV_functional_allele_calls.txt
 'IGLV4-3*unknown_truncated']
 'IGLV(VI)-25-1*unknown_truncated']
 'IGLV(VI)-25-1*unknown_truncated']


In [110]:
gene_type = 'TRAV'

# One record per sample that has both assembly annotations and genotype calls
results = []

for sample_path in glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*'):
    # The sample name is the last component of the annotation path
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)
    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        # No genotype calls for this sample: leave it out of the benchmark
        continue

    # CNV-insensitive comparison: repeated alleles are collapsed on both sides
    tp, fp, fn = calculate_accuracy_metrics(list(set(genotypes)), list(set(functional_ground_truth_set)))
    precision, recall = calculate_precision_recall(tp, fp, fn)

    # Store results for this sample
    results.append({
        'sample': sample,
        'precision': precision,
        'recall': recall,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'genotypes': list(set(genotypes)),
        'ground_truth': list(set(functional_ground_truth_set)),
        'gene_type': gene_type
    })

trav_results_df = pd.DataFrame(results)

# Concatenate the two DataFrames. Rows of this gene type that are already in
# results_df (i.e. this cell has been run before) are dropped first, so that
# re-running the cell does not append the same samples a second time.
previous_results_df = results_df
if 'gene_type' in previous_results_df.columns:
    previous_results_df = previous_results_df[previous_results_df['gene_type'] != gene_type]
results_df = pd.concat([previous_results_df, trav_results_df], ignore_index=True)



In [111]:
# Calculate statistics for each gene type: mean and median of the per-sample
# precision and recall stored in results_df, rounded to three decimals.
stats = results_df.groupby('gene_type').agg({
    'precision': ['mean', 'median'],
    'recall': ['mean', 'median']
}).round(3)

# Generate LaTeX table.
# Every value is formatted with a fixed ".3f" spec: plain interpolation of a
# rounded float drops trailing zeros (1.0 instead of 1.000), which leaves the
# table with an uneven number of decimals.
latex_table = f"""\\multirow{{2}}{{*}}{{Gene type}}             & Precision       & Recall          & Precision     & Recall       \\\\
                      & (mean)          & (mean)          & (median)      & (median)     \\\\
\\midrule
\\textit{{IGLV}} & {stats.loc['IGLV', ('precision', 'mean')]:.3f}                & {stats.loc['IGLV', ('recall', 'mean')]:.3f}                & {stats.loc['IGLV', ('precision', 'median')]:.3f}                & {stats.loc['IGLV', ('recall', 'median')]:.3f} \\\\
\\textit{{TRAV}} & {stats.loc['TRAV', ('precision', 'mean')]:.3f}                & {stats.loc['TRAV', ('recall', 'mean')]:.3f}                & {stats.loc['TRAV', ('precision', 'median')]:.3f}                & {stats.loc['TRAV', ('recall', 'median')]:.3f}\\\\"""

print(latex_table)

\multirow{2}{*}{Gene type}             & Precision       & Recall          & Precision     & Recall       \\
                      & (mean)          & (mean)          & (median)      & (median)     \\
\midrule
\textit{IGLV} & 0.928                & 0.934                & 0.971                & 0.971 \\
\textit{TRAV} & 0.916                & 0.991                & 0.925                & 1.0\\


# CNV Sensitive

In [114]:
gene_type = 'IGLV'

# Accumulates one record per benchmarked sample
results = []

annotation_paths = glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*')
for sample_path in annotation_paths:
    # The sample name is the last component of the annotation path
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)

    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        # No genotype calls for this sample: leave it out of the benchmark
        continue

    # CNV-sensitive scoring: both inputs are handed over as loaded (not
    # de-duplicated) and the helper is called with cnv=True.
    tp, fp, fn = calculate_accuracy_metrics(genotypes, functional_ground_truth_set, cnv=True)
    precision, recall = calculate_precision_recall(tp, fp, fn)

    # Keyword order fixes the column order of the resulting DataFrame
    record = dict(
        sample=sample,
        precision=precision,
        recall=recall,
        true_positives=tp,
        false_positives=fp,
        false_negatives=fn,
        genotypes=list(set(genotypes)),
        ground_truth=list(set(functional_ground_truth_set)),
        gene_type=gene_type,
    )
    results.append(record)

# One row per benchmarked sample
results_df = pd.DataFrame(results)

ERROR:benchmark_utils:No genotype calls file found for HG01965. Pattern: data/genotypes/IGLV/HG01965*/HG01965*IGLV_functional_allele_calls.txt
 'IGLV4-3*unknown_truncated']
 'IGLV(VI)-25-1*unknown_truncated']
 'IGLV(VI)-25-1*unknown_truncated']


In [115]:
gene_type = 'TRAV'

# One record per sample that has both assembly annotations and genotype calls
results = []

for sample_path in glob(f'data/assembly-annotations/{gene_type}-reference-annotations/*'):
    # The sample name is the last component of the annotation path
    sample = os.path.basename(sample_path)
    df, functional_ground_truth_set = load_assembly_annotations(sample, gene_type)
    try:
        genotypes = load_genotype_calls(sample, gene_type)
    except FileNotFoundError:
        # No genotype calls for this sample: leave it out of the benchmark
        continue

    # CNV-sensitive scoring: both inputs are handed over as loaded (not
    # de-duplicated) and the helper is called with cnv=True.
    tp, fp, fn = calculate_accuracy_metrics(genotypes, functional_ground_truth_set, cnv=True)
    precision, recall = calculate_precision_recall(tp, fp, fn)

    # Store results for this sample
    results.append({
        'sample': sample,
        'precision': precision,
        'recall': recall,
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'genotypes': list(set(genotypes)),
        'ground_truth': list(set(functional_ground_truth_set)),
        'gene_type': gene_type
    })

trav_results_df = pd.DataFrame(results)

# Concatenate the two DataFrames. Rows of this gene type that are already in
# results_df (i.e. this cell has been run before) are dropped first, so that
# re-running the cell does not append the same samples a second time.
previous_results_df = results_df
if 'gene_type' in previous_results_df.columns:
    previous_results_df = previous_results_df[previous_results_df['gene_type'] != gene_type]
results_df = pd.concat([previous_results_df, trav_results_df], ignore_index=True)



In [116]:
# Calculate statistics for each gene type: mean and median of the per-sample
# precision and recall stored in results_df, rounded to three decimals.
stats = results_df.groupby('gene_type').agg({
    'precision': ['mean', 'median'],
    'recall': ['mean', 'median']
}).round(3)

# Generate LaTeX table.
# Every value is formatted with a fixed ".3f" spec: plain interpolation of a
# rounded float drops trailing zeros (e.g. 0.9 instead of 0.900), which would
# leave the table with an uneven number of decimals.
latex_table = f"""\\multirow{{2}}{{*}}{{Gene type}}             & Precision       & Recall          & Precision     & Recall       \\\\
                      & (mean)          & (mean)          & (median)      & (median)     \\\\
\\midrule
\\textit{{IGLV}} & {stats.loc['IGLV', ('precision', 'mean')]:.3f}                & {stats.loc['IGLV', ('recall', 'mean')]:.3f}                & {stats.loc['IGLV', ('precision', 'median')]:.3f}                & {stats.loc['IGLV', ('recall', 'median')]:.3f} \\\\
\\textit{{TRAV}} & {stats.loc['TRAV', ('precision', 'mean')]:.3f}                & {stats.loc['TRAV', ('recall', 'mean')]:.3f}                & {stats.loc['TRAV', ('precision', 'median')]:.3f}                & {stats.loc['TRAV', ('recall', 'median')]:.3f}\\\\"""

print(latex_table)

\multirow{2}{*}{Gene type}             & Precision       & Recall          & Precision     & Recall       \\
                      & (mean)          & (mean)          & (median)      & (median)     \\
\midrule
\textit{IGLV} & 0.902                & 0.758                & 0.937                & 0.762 \\
\textit{TRAV} & 0.883                & 0.875                & 0.874                & 0.882\\
