# Label Mapping Analysis

This notebook analyzes how the labels predicted by various NER models (CyNER, SecureBERT) correlate with the DNRTI ground-truth labels, in order to check whether a single model label should map to multiple DNRTI labels.

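The heavy lifting (running each model on a sample of 2000 sentences) is done ahead of time by `analyze_labels.py`, which saves time here: this notebook only loads the resulting `label_correlation_<model>.csv` files.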

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def analyze_model_results(model_name: str) -> None:
    """Plot and print how one model's predicted labels line up with DNRTI labels.

    Reads ``label_correlation_<model_name lowercased>.csv`` from the current
    working directory. The file must provide ``Predicted`` and ``True``
    columns; the value ``'O'`` in either column is treated as "no entity".

    Output, in order:
      1. A heatmap of predicted-vs-true label counts, restricted to rows
         where at least one of the two labels is not ``'O'``.
      2. For every non-``'O'`` predicted label (most frequent first), the
         counts and percentages of the true labels it coincided with.
      3. Counts of the true labels on rows the model predicted as ``'O'``.

    If the CSV file does not exist, a hint is printed and nothing else
    happens. If no row carries an entity label on either side, the heatmap
    is skipped (an empty matrix cannot be drawn) but the textual sections
    are still printed.

    Args:
        model_name: Display name of the model; its lowercase form selects
            the CSV file to load.
    """
    csv_file = f"label_correlation_{model_name.lower()}.csv"
    if not os.path.exists(csv_file):
        print(f"Results for {model_name} not found at {csv_file}. Please run 'python analyze_labels.py' first.")
        return

    # Load the results generated by the background script
    df_corr = pd.read_csv(csv_file)

    # Computed once; reused for the heatmap filter and the per-label breakdown.
    predicted_is_entity = df_corr['Predicted'] != 'O'

    # Filter out rows where both sides are 'O' to see entity correlations
    # more clearly.
    df_entities = df_corr[predicted_is_entity | (df_corr['True'] != 'O')]

    # Create heatmap data. Skip the crosstab entirely when nothing survived
    # the filter; the matrix can also come out empty when the remaining rows
    # have missing labels, so it is checked again below.
    matrix = None
    if not df_entities.empty:
        matrix = pd.crosstab(df_entities['Predicted'], df_entities['True'])

    if matrix is None or matrix.empty:
        # Drawing an empty matrix raises inside the plotting code, which
        # would also prevent the textual report below from being printed.
        print(f"No entity labels found for {model_name}; skipping heatmap.")
    else:
        plt.figure(figsize=(14, 10))
        sns.heatmap(matrix, annot=True, fmt="d", cmap="YlGnBu")
        plt.title(f"Correlation Heatmap: {model_name} Predicted vs DNRTI True Labels (Sample of 2000 sentences)")
        plt.xlabel("DNRTI Label (Ground Truth)")
        plt.ylabel(f"{model_name} Label (Predicted)")
        plt.show()

    print(f"--- Detailed breakdown for {model_name} ---")

    # Labels predicted by the model, most frequent first
    top_preds = df_corr[predicted_is_entity]['Predicted'].value_counts()
    for pred_label in top_preds.index:
        print(f"\nBreakdown for {model_name} label '{pred_label}':")
        breakdown = df_corr[df_corr['Predicted'] == pred_label]['True'].value_counts()
        print(breakdown)
        print("Percentage breakdown:")
        print(breakdown / breakdown.sum() * 100)

    print(f"\n--- DNRTI labels missed by {model_name} (Predicted 'O', but DNRTI had entity) ---")
    missed_labels = df_corr[(df_corr['Predicted'] == 'O') & (df_corr['True'] != 'O')]['True'].value_counts()
    print(missed_labels)

# Report on CyNER first, then SecureBERT, with a ruled divider
# (blank line, 50 '=' characters, blank line) between the two reports.
analyze_model_results("CyNER")
print()
print("=" * 50)
print()
analyze_model_results("SecureBERT")