In [None]:
# ===================================================================
# 2_analyse_results.ipynb
#
# This notebook loads the results from the benchmark runs, performs
# detailed analysis on performance and accuracy, and generates
# publication-ready tables and figures.
# ===================================================================

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import yaml
import time
import warnings
from pathlib import Path
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    cohen_kappa_score
)

# --- Plot styling ---
# The seaborn theme is applied first; the matplotlib defaults below then
# override the figure size and resolution that the theme sets.
sns.set_theme(style="whitegrid", context="talk")
plt.rcParams.update({
    'figure.figsize': (12, 7),
    'figure.dpi': 100,
})

# --- Warning policy ---
# Every warning is silenced for the session. Swap the active line with one of
# the commented alternatives to see warnings again.
warnings.simplefilter("ignore")   #to disable warnings
#warnings.simplefilter("default") #to enable warnings
#warnings.filterwarnings('ignore', category=UserWarning, module='matplotlib') #for matplotlib only

print("INFO: Imports and settings loaded successfully.")


In [None]:
# === Analysis Configuration ===

# run_id strings whose results are combined and analysed together.
# Different runs (e.g. GPT, Gemini, and Ollama) can be imported separately.
RUN_IDS_TO_ANALYSE = [
    "run_20250814_150749",
    "run_20250814_211348",
    "run_20250814_154132",
]

# Models left out of the analysis for the time being
# (significantly faster than any other but not reliable).
MODELS_TO_EXCLUDE = ["gemini-2.5-flash-lite"]

# Source of the 'golden standard' labels used for accuracy and F1-score calculations.
# Options:
#   - "Manual":       use the manually labelled golden dataset file (measures true accuracy).
#   - "Consensus":    use the majority vote among all models (measures agreement with the crowd).
#   - "<model_name>": use a specific model's output, e.g. "gpt-5" as the best performing one
#                     (measures alignment with that model).
#   - None:           skip the accuracy-based analysis.
GOLDEN_DATASET = "gpt-5"

# --- Paths to configuration files ---
# Config file containing pricing and model info
CONFIG_FILE = "configurations/config.yaml"
# Original dataset (used if manual labels are needed); columns named
# 'Golden_Maint_Type' and 'Golden_Issue_Cat' would be used as the ground truth if present
MANUAL_LABELS_FILE = "input_dataset.csv"
# Initial prompt and labels
PROMPTS_FILE = "prompts.json"


# Every execution of this notebook writes into its own timestamped folder
analysis_id = "analysis_" + time.strftime('%Y%m%d_%H%M%S')
ANALYSIS_OUTPUT_DIR = Path("outputs") / "analysis_results" / analysis_id
ANALYSIS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"INFO: Analysis outputs will be saved to: {ANALYSIS_OUTPUT_DIR}")


# --- Helper function to load the config ---
def load_config(config_path="config.yaml"):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file. Defaults to "config.yaml".

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    # Read as UTF-8 explicitly so that parsing does not depend on the
    # platform's default locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

# --- Read the configuration once; the notebook cannot continue without it ---
try:
    config = load_config(CONFIG_FILE)
except FileNotFoundError:
    print(f"ERROR: Configuration file is not found at '{CONFIG_FILE}'. The default config.yaml is absent. Aborting.")
    raise
else:
    print(f"INFO: Configuration loaded from '{CONFIG_FILE}'.")

In [None]:
# === Data Loading and Merging ===

def load_and_merge_runs_safely(run_ids: list, base_data_file: str) -> pd.DataFrame:
    """
    Build the master DataFrame from the base dataset plus the per-model columns of each run.

    Every source is de-duplicated on 'WONUM' before it is merged, and only the
    columns belonging to one model are merged at a time, so the left joins
    cannot multiply rows.

    Args:
        run_ids: Run identifiers; each is looked up at
            outputs/runs/<run_id>/Master_Benchmark_Results.csv. Runs whose
            file does not exist are skipped with a warning.
        base_data_file: CSV holding the base rows; must contain a 'WONUM' column.

    Returns:
        The base data with one group of '<model>_*' columns per loaded model.

    Raises:
        ValueError: If ``run_ids`` is empty, or the same model appears in more than one run.
        FileNotFoundError: If ``base_data_file`` does not exist.
    """
    if not run_ids:
        raise ValueError("ERROR: run_ids list cannot be empty.")

    # 'Golden_*' columns hold ground-truth labels, not model output. Without this
    # exclusion a 'Golden_Maint_Type' column in a run file would be treated as a
    # model called "Golden" and merged on top of the base columns of the same name.
    reserved_prefixes = {"Golden"}

    # 1. Load the base data and immediately de-duplicate it.
    try:
        base_df = pd.read_csv(base_data_file)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"The base data file '{base_data_file}' was not found.") from exc
    master_df = base_df.drop_duplicates(subset=['WONUM'], keep='first')
    print(f"INFO: Loaded and de-duplicated clean base data from '{base_data_file}'.")

    loaded_models = set()

    # 2. Loop through each run file.
    for run_id in run_ids:
        run_path = Path("outputs") / "runs" / run_id / "Master_Benchmark_Results.csv"
        if not run_path.exists():
            print(f"WARNING: Results file for run_id '{run_id}' not found. Skipping.")
            continue

        print(f"INFO: Processing run '{run_id}'...")
        run_df = pd.read_csv(run_path, on_bad_lines='skip')

        # 3. De-duplicate this run's data immediately after loading
        run_df = run_df.drop_duplicates(subset=['WONUM'], keep='first')

        # Identify models present in this run file (the prefix before '_Maint_Type')
        detected_prefixes = {
            col.split('_Maint_Type')[0] for col in run_df.columns if '_Maint_Type' in col
        }
        models_in_run = sorted(detected_prefixes - reserved_prefixes)

        for model_name in models_in_run:
            if model_name in loaded_models:
                raise ValueError(f"Duplicate model '{model_name}' found. Please provide run_ids with unique models.")

            # Grab only the columns for this specific model
            cols_for_this_model = [col for col in run_df.columns if col.startswith(model_name + '_')]
            model_data_to_merge = run_df[['WONUM'] + cols_for_this_model]

            # 4. Merge the lean, de-duplicated model data into the master DataFrame.
            master_df = pd.merge(master_df, model_data_to_merge, on="WONUM", how="left")
            loaded_models.add(model_name)

    if loaded_models:
        print("\nINFO: All run data successfully loaded and merged.")
    else:
        # Do not claim success when every run was skipped or contained no model columns.
        print("\nWARNING: No model results were loaded; the returned DataFrame contains only the base data.")
    return master_df

# --- Load the data using the safe function ---
# The base table is MANUAL_LABELS_FILE (the original input dataset); each run listed in
# RUN_IDS_TO_ANALYSE contributes its per-model columns, left-joined on 'WONUM'.
master_df = load_and_merge_runs_safely(RUN_IDS_TO_ANALYSE, MANUAL_LABELS_FILE)
print("INFO: Raw master DataFrame loaded. Cleanup will be performed in the next cell.")

In [None]:
# === Final Data Cleaning and Preparation ===

# 1. Trim columns the analysis never reads, to keep the frame small
cols_to_drop = ['Farm ID', 'Farm Name', 'Farm Number', 'Turbine No', 'Age at Event', 'Event Date'] # Add any other columns that are not needed for analysis
master_df = master_df.drop(columns=cols_to_drop, errors='ignore')
print("INFO: Dropped unnecessary columns.")

# 2. Fold merge-generated '<name>_x' / '<name>_y' pairs back into a single '<name>' column
cols_x = [name for name in master_df.columns if name.endswith('_x')]
if cols_x:
    print(f"INFO: Found {len(cols_x)} columns with '_x' suffix. Consolidating...")
    cols_to_drop_after_cleanup = []
    for left_name in cols_x:
        stem = left_name[:-2]
        right_name = stem + '_y'

        if right_name not in master_df.columns:
            # No partner column: the '_x' column simply takes the plain name.
            master_df = master_df.rename(columns={left_name: stem})
            continue

        # Prefer the '_x' value and fall back to '_y' where '_x' is missing.
        master_df[stem] = master_df[left_name].combine_first(master_df[right_name])
        cols_to_drop_after_cleanup += [left_name, right_name]

    master_df = master_df.drop(columns=cols_to_drop_after_cleanup, errors='ignore')
    print("INFO: Suffix cleanup complete.")

# 3. Keep one row per 'WONUM' so that later sums are not inflated
initial_rows = len(master_df)
master_df = master_df.drop_duplicates(subset=['WONUM'], keep='first')
final_rows = len(master_df)
if final_rows < initial_rows:
    print(f"INFO: Removed {initial_rows - final_rows} duplicate rows based on 'WONUM'.")

# Finalize the list of model names, applying the exclusion list.
# A model is recognised by its '<model>_Maint_Type' column. Two safeguards:
#   * the names are collected in a set, so a model that matches more than one
#     column (e.g. a leftover suffixed copy) is listed only once;
#   * 'Golden' is reserved for the ground-truth columns ('Golden_Maint_Type'),
#     which exist when the input dataset carries manual labels or when the
#     ground-truth cell has already run, and must never be analysed as a model.
RESERVED_LABEL_PREFIXES = {"Golden"}
all_models_found = sorted(
    {col.split('_Maint_Type')[0] for col in master_df.columns if '_Maint_Type' in col}
    - RESERVED_LABEL_PREFIXES
)
model_names = [model for model in all_models_found if model not in MODELS_TO_EXCLUDE]

print(f"INFO: Found {len(all_models_found)} total models in the data.")
if MODELS_TO_EXCLUDE:
    print(f"INFO: Excluding {len(MODELS_TO_EXCLUDE)} models: {MODELS_TO_EXCLUDE}")
print(f"INFO: Final {len(model_names)} models for analysis: {model_names}")

# Display a sample of the cleaned, merged data
display(master_df.head())

In [None]:
# === Ground Truth Preparation ===

# Outcome of this cell:
#   run_accuracy_analysis - False when no usable ground truth could be prepared
#   ground_truth_source   - descriptive name used in table/figure titles
#   master_df             - gains 'Golden_Maint_Type' / 'Golden_Issue_Cat' and loses
#                           the rows where either of them is missing
run_accuracy_analysis = True
ground_truth_source = "" # This variable stores a descriptive name for the ground truth source to use in table/figure titles.

if GOLDEN_DATASET is None:
    run_accuracy_analysis = False
    print("INFO: `GOLDEN_DATASET` is None. Skipping accuracy-based analysis.")

elif GOLDEN_DATASET == "Manual":
    print("INFO: Using 'Manual' labels as the ground truth.")
    try:
        golden_cols = ['Golden_Maint_Type', 'Golden_Issue_Cat']
        manual_df = pd.read_csv(MANUAL_LABELS_FILE)
        ground_truth_df = manual_df[['WONUM'] + golden_cols].copy()
        ground_truth_source = "Manual Labels"

        # De-duplicate the manual labels to ensure a clean merge.
        initial_rows = len(ground_truth_df)
        ground_truth_df = ground_truth_df.drop_duplicates(subset=['WONUM'], keep='first')
        if len(ground_truth_df) < initial_rows:
            print(f"INFO: Removed {initial_rows - len(ground_truth_df)} duplicate entries from the manual labels file.")

        # master_df is built from this same file (and this cell may be re-run), so it can
        # already contain the golden columns. Merging on top of them would produce
        # 'Golden_*_x' / 'Golden_*_y' and break the dropna below, so any existing copies
        # are removed from the left side first.
        master_df = pd.merge(
            master_df.drop(columns=golden_cols, errors='ignore'),
            ground_truth_df,
            on="WONUM",
            how="left",
        )

    except Exception as e:
        # Covers a missing file as well as missing 'WONUM' / golden columns.
        print(f"ERROR: Failed to load or merge manual labels. {e}")
        run_accuracy_analysis = False

elif GOLDEN_DATASET == "Consensus":
    print("INFO: Using 'Consensus' (majority vote) as the ground truth.")
    maint_type_cols = [f"{m}_Maint_Type" for m in model_names]
    issue_cat_cols = [f"{m}_Issue_Cat" for m in model_names]

    # --- Create columns directly instead of merging ---
    # mode(axis=1) returns the most frequent label(s) per row; column 0 is the first of them.
    master_df['Golden_Maint_Type'] = master_df[maint_type_cols].mode(axis=1)[0]
    master_df['Golden_Issue_Cat'] = master_df[issue_cat_cols].mode(axis=1)[0]
    ground_truth_source = "Consensus Label"

else: # A specific model name was given
    model_name = GOLDEN_DATASET
    print(f"INFO: Using '{model_name}' output as the ground truth.")
    maint_col = f"{model_name}_Maint_Type"
    issue_col = f"{model_name}_Issue_Cat"
    if maint_col not in master_df.columns or issue_col not in master_df.columns:
        print(f"ERROR: Model '{model_name}' not found in the loaded data. Cannot use as ground truth.")
        run_accuracy_analysis = False
    else:
        # --- Create columns directly instead of merging ---
        master_df['Golden_Maint_Type'] = master_df[maint_col]
        master_df['Golden_Issue_Cat'] = master_df[issue_col]
        ground_truth_source = f"{model_name} Labels"

if run_accuracy_analysis:
    print(f"INFO: Ground truth from '{ground_truth_source}' prepared successfully.")
    # Drop rows where ground truth is missing, as they cannot be evaluated
    master_df = master_df.dropna(subset=['Golden_Maint_Type', 'Golden_Issue_Cat'])
    print(f"INFO: After dropping rows with missing ground truth, {len(master_df)} rows remain for analysis.")

In [None]:
# === Data Validation: Check for Invalid (Hallucinated) Labels ===

print("Data Validation: Checking for Invalid Labels and Errors\n")

# 1. Get the master list of all valid labels from the prompts file
#    (read as UTF-8 explicitly so the result does not depend on the platform locale)
with open(PROMPTS_FILE, "r", encoding="utf-8") as f:
    prompts_config = json.load(f)

valid_maint_types = set(prompts_config["maintenance_types"].keys())
valid_issue_cats = set(prompts_config["issue_categories"].keys())
technical_error_strings = {"CLIENT_ERROR", "FORMATTING_ERROR"}

any_issue_found = False
# 2. Loop through each model and check its output
for model in model_names:
    model_had_issue = False

    maint_predictions = master_df[f"{model}_Maint_Type"]
    issue_predictions = master_df[f"{model}_Issue_Cat"]

    # Unique labels produced by this model. Missing values are excluded here and
    # reported separately below: NaN is not a label, and leaving it in the sets
    # made the "invalid" count disagree with the list printed underneath it.
    produced_maint_types = set(maint_predictions.dropna().unique())
    produced_issue_cats = set(issue_predictions.dropna().unique())

    # --- Check for missing predictions ---
    missing_maint = int(maint_predictions.isna().sum())
    missing_issue = int(issue_predictions.isna().sum())
    if missing_maint or missing_issue:
        any_issue_found = model_had_issue = True
        print(f"🟡 WARNING: Model '{model}' has missing predictions: {missing_maint} Maintenance Type, {missing_issue} Issue Category.")

    # --- Check for technical errors ---
    found_maint_techn_errors = produced_maint_types.intersection(technical_error_strings)
    if found_maint_techn_errors:
        any_issue_found = model_had_issue = True
        print(f"🔵 INFO: Model '{model}' contains technical Maintenance Type errors: {', '.join(sorted(found_maint_techn_errors))}")

    found_issue_techn_errors = produced_issue_cats.intersection(technical_error_strings)
    if found_issue_techn_errors:
        any_issue_found = model_had_issue = True
        print(f"🔵 INFO: Model '{model}' contains technical Issue Category errors: {', '.join(sorted(found_issue_techn_errors))}")

    # --- Check for content errors (hallucinations) ---
    # Anything that is neither a valid label nor a known technical error string.
    invalid_maint = produced_maint_types - valid_maint_types - technical_error_strings
    invalid_issue = produced_issue_cats - valid_issue_cats - technical_error_strings

    if invalid_maint:
        any_issue_found = model_had_issue = True
        print(f"🔴 WARNING: Model '{model}' produced {len(invalid_maint)} invalid Maintenance Types:")
        print(f"   - {', '.join(sorted(str(i) for i in invalid_maint))}")

    if invalid_issue:
        any_issue_found = model_had_issue = True
        print(f"🔴 WARNING: Model '{model}' produced {len(invalid_issue)} invalid Issue Categories:")
        print(f"   - {', '.join(sorted(str(i) for i in invalid_issue))}")

    if model_had_issue:
        print("") # Add a newline for readability between models with issues

if not any_issue_found:
    print("🟢 All models adhered to the provided labels and produced no errors.")

In [None]:
# === Performance Metrics Analysis ===

print("Table 1: Overall Performance Summary")

# Get the master list of valid labels once before the loop
valid_maint_types = set(prompts_config["maintenance_types"].keys())
valid_issue_cats = set(prompts_config["issue_categories"].keys())

# Loop-invariant inputs: the performance log is read once instead of once per model.
perf_log = pd.read_csv("outputs/performance_log.csv")
total_logs = len(master_df)

perf_columns = ["Model", "Throughput (logs/s)", "Total Tokens", "Estimated Cost ($)", "Error Rate (%)"]
perf_results = []
for model in model_names:
    # --- Comprehensive Error Rate Calculation ---
    maint_col = master_df[f"{model}_Maint_Type"]

    # 1. Count technical errors (CLIENT_ERROR, FORMATTING_ERROR)
    technical_errors = maint_col.isin(["CLIENT_ERROR", "FORMATTING_ERROR"])

    # 2. Count content errors (hallucinated labels)
    # A row is a content error if it's NOT a technical error AND it's NOT a valid label.
    # A missing (NaN) prediction is not a valid label either, so it is counted here too.
    content_errors = ~technical_errors & ~maint_col.isin(valid_maint_types)

    # 3. Total error is the sum of both types (the two masks are disjoint)
    total_error_count = technical_errors.sum() + content_errors.sum()
    error_rate = (total_error_count / total_logs) * 100 if total_logs > 0 else 0

    # Tokens and Cost
    prompt_tokens = master_df[f"{model}_Prompt_Tokens"].sum()
    comp_tokens = master_df[f"{model}_Comp_Tokens"].sum()

    # Find model config to get pricing. Column prefixes use '_' where the
    # config's model_name uses ':'.
    config_model_name = model.replace('_', ':')
    model_cfg = next((item for item in config["models_to_benchmark"] if item["model_name"] == config_model_name), None)

    if model_cfg is None:
        print(f"WARNING: No configuration found for model '{model}' in {CONFIG_FILE}. Skipping performance calculation.")
        continue

    # Prices are per one million tokens. `or {}` / `or 0.0` also cover keys that are
    # present in the YAML but empty (parsed as None), which .get() defaults do not.
    individual_pricing = (model_cfg.get("pricing") or {}).get("individual") or {}
    price_input = individual_pricing.get("input", 0.0) or 0.0
    price_output = individual_pricing.get("output", 0.0) or 0.0
    cost = ((prompt_tokens / 1_000_000) * price_input) + ((comp_tokens / 1_000_000) * price_output)

    # Throughput: the most recent (last) log entry for this model, 0 if it has none
    model_perf_rows = perf_log[perf_log["Model"] == config_model_name]
    if not model_perf_rows.empty:
        model_perf = model_perf_rows.iloc[-1]
        throughput = model_perf["Performance (logs/s)"]
    else:
        throughput = 0

    perf_results.append({
        "Model": model,
        "Throughput (logs/s)": throughput,
        "Total Tokens": prompt_tokens + comp_tokens,
        "Estimated Cost ($)": cost,
        "Error Rate (%)": error_rate
    })

# Explicit columns keep set_index("Model") working even when every model was skipped.
perf_summary_df = pd.DataFrame(perf_results, columns=perf_columns).set_index("Model")
if perf_summary_df.empty:
    print("WARNING: No model produced a performance row; Table 1 is empty.")
display(perf_summary_df.style.format({
    "Throughput (logs/s)": "{:.2f}",
    "Estimated Cost ($)": "${:.4f}",
    "Error Rate (%)": "{:.2f}%"
}))

# --- Figure 1: Throughput (Processing Speed) ---
print("\nFigure 1: Processing Speed Comparison")

# Fastest model first; one viridis colour per bar
throughput_ranked = perf_summary_df.sort_values("Throughput (logs/s)", ascending=False)['Throughput (logs/s)']
bar_colors = sns.color_palette("viridis", len(perf_summary_df))

fig_throughput, ax_throughput = plt.subplots(figsize=(12, 7))
throughput_ranked.plot(kind='bar', color=bar_colors, ax=ax_throughput)
ax_throughput.set_title("Model Throughput (Processing Speed)")
ax_throughput.set_ylabel("Logs Processed per Second")
ax_throughput.set_xlabel("Model")
plt.setp(ax_throughput.get_xticklabels(), rotation=45, ha='right')
fig_throughput.tight_layout()
#plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_1.png", dpi=300, bbox_inches='tight')
plt.show()



# --- Figure 2: Estimated Cost ---
print("\nFigure 2: Estimated Cost Comparison")

# Only models with a cost greater than zero are charted
has_cost = perf_summary_df["Estimated Cost ($)"] > 0
cost_df = perf_summary_df[has_cost]

if cost_df.empty:
    print("INFO: No models with a non-zero cost were found to plot.")
else:
    # Most expensive model first; one plasma colour per bar
    cost_ranked = cost_df.sort_values("Estimated Cost ($)", ascending=False)['Estimated Cost ($)']

    fig_cost, ax_cost = plt.subplots(figsize=(12, 7))
    cost_ranked.plot(kind='bar', color=sns.color_palette("plasma", len(cost_df)), ax=ax_cost)
    ax_cost.set_title("Estimated Cost to Process Dataset (API Models Only)")
    ax_cost.set_ylabel("Cost (USD)")
    ax_cost.set_xlabel("Model")
    plt.setp(ax_cost.get_xticklabels(), rotation=45, ha='right')
    fig_cost.tight_layout()
    #plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_2.png", dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# === Classification Accuracy Analysis ===

if not run_accuracy_analysis:
    print("INFO: Skipping Classification Accuracy Analysis as `GOLDEN_DATASET` was not set to a valid source.")
else:
    print(f"Table 2: Classification Metrics (vs. {ground_truth_source})")
    
    # Create a new list of models to evaluate, excluding the ground truth model if it's one of them
    # (it would trivially score 1.0 against itself).
    models_to_evaluate = model_names.copy()
    if GOLDEN_DATASET in models_to_evaluate:
        print(f"INFO: Excluding '{GOLDEN_DATASET}' from the comparison as it is the selected ground truth.")
        models_to_evaluate.remove(GOLDEN_DATASET)
    
    accuracy_columns = ["Model", "Maint. Type F1", "Maint. Type Acc.", "Issue Cat. F1", "Issue Cat. Acc."]
    accuracy_results = []

    # The loop now iterates over the filtered list
    for model in models_to_evaluate:
        # Filter out error rows for this model before calculating accuracy.
        # NOTE(review): only the Maint_Type column is checked for error strings, and the
        # Issue Category metrics below are computed on the rows that survive the
        # Maint_Type NaN drop as well - confirm this coupling is intended.
        eval_df = master_df[~master_df[f"{model}_Maint_Type"].isin(["CLIENT_ERROR", "FORMATTING_ERROR"])].copy()
        
        # Drop rows with NaNs in either the prediction or true label for fair comparison
        eval_df.dropna(subset=['Golden_Maint_Type', f"{model}_Maint_Type"], inplace=True)
        y_true_maint = eval_df['Golden_Maint_Type']
        y_pred_maint = eval_df[f"{model}_Maint_Type"]
        maint_f1 = f1_score(y_true_maint, y_pred_maint, average='weighted', zero_division=0)
        maint_acc = accuracy_score(y_true_maint, y_pred_maint)

        # Do the same for Issue Category
        eval_df.dropna(subset=['Golden_Issue_Cat', f"{model}_Issue_Cat"], inplace=True)
        y_true_issue = eval_df['Golden_Issue_Cat']
        y_pred_issue = eval_df[f"{model}_Issue_Cat"]
        issue_f1 = f1_score(y_true_issue, y_pred_issue, average='weighted', zero_division=0)
        issue_acc = accuracy_score(y_true_issue, y_pred_issue)
        
        accuracy_results.append({
            "Model": model,
            "Maint. Type F1": maint_f1,
            "Maint. Type Acc.": maint_acc,
            "Issue Cat. F1": issue_f1,
            "Issue Cat. Acc.": issue_acc
        })
        
    # Explicit columns: with an empty result list a bare pd.DataFrame([]) has no "Model"
    # column, set_index would raise KeyError, and the "No models available" branch
    # below could never be reached.
    accuracy_summary_df = pd.DataFrame(accuracy_results, columns=accuracy_columns).set_index("Model")
    if not accuracy_summary_df.empty:
        display(accuracy_summary_df.style.format("{:.3f}"))

        # --- Figure 3: F1-Score Comparison ---
        print(f"\nFigure 3: F1-Score Comparison (vs. {ground_truth_source})")
        accuracy_summary_df[['Maint. Type F1', 'Issue Cat. F1']].sort_values('Maint. Type F1', ascending=False).plot(
            kind='bar', figsize=(14, 7)
        )
        plt.title("Weighted F1-Scores for Classification Tasks")
        plt.ylabel("F1-Score")
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0, 1)
        plt.legend(title="Task")
        plt.tight_layout()
        #plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_3.png", dpi=300, bbox_inches='tight')
        plt.show()

        # --- Figure 4: Confusion Matrices (Separate Figures) ---
        # Drawn for the three models with the highest Maintenance Type F1.
        print(f"\nFigure 4: Confusion Matrices (vs. {ground_truth_source})")
        top_3_models = accuracy_summary_df.sort_values('Maint. Type F1', ascending=False).head(3).index

        for model in top_3_models:
            print(f"\n--- Generating confusion matrices for: {model} ---")
            
            # --- Plot 1: Maintenance Type ---
            plt.figure(figsize=(16, 14)) # Create a dedicated figure for this plot
            
            # Labels are every distinct value seen in either column, in order of first
            # appearance, so the matrix is square and includes invalid/error labels.
            temp_df_maint = master_df[['Golden_Maint_Type', f"{model}_Maint_Type"]].dropna()
            maint_labels = pd.unique(temp_df_maint[['Golden_Maint_Type', f"{model}_Maint_Type"]].values.ravel('K'))
            cm_maint = confusion_matrix(temp_df_maint['Golden_Maint_Type'], temp_df_maint[f"{model}_Maint_Type"], labels=maint_labels)
            
            sns.heatmap(cm_maint, annot=True, fmt='d', cmap='Blues', 
                        xticklabels=maint_labels, yticklabels=maint_labels,
                        annot_kws={"size": 10})
                        
            plt.title(f"Maintenance Type Confusion Matrix for: {model}", fontsize=18)
            plt.ylabel("True Label", fontsize=14)
            plt.xlabel("Predicted Label", fontsize=14)
            plt.xticks(rotation=45, ha='right')
            plt.yticks(rotation=0)
            plt.tight_layout()
            # The model name is part of the file name so the three models do not overwrite each other.
            #plt.savefig(ANALYSIS_OUTPUT_DIR / f"figure_4_1_{model}.png", dpi=300, bbox_inches='tight')
            plt.show()
            
            
            # --- Plot 2: Issue Category ---
            plt.figure(figsize=(22, 18)) # Create a second, larger figure for this plot
            
            temp_df_issue = master_df[['Golden_Issue_Cat', f"{model}_Issue_Cat"]].dropna()
            issue_labels = pd.unique(temp_df_issue[['Golden_Issue_Cat', f"{model}_Issue_Cat"]].values.ravel('K'))
            cm_issue = confusion_matrix(temp_df_issue['Golden_Issue_Cat'], temp_df_issue[f"{model}_Issue_Cat"], labels=issue_labels)

            sns.heatmap(cm_issue, annot=True, fmt='d', cmap='Reds',
                    xticklabels=issue_labels, yticklabels=issue_labels,
                    annot_kws={"size": 10})

            plt.title(f"Issue Category Confusion Matrix for: {model}", fontsize=18)
            plt.ylabel("Assumed Ground Truth Label", fontsize=14)
            plt.xlabel("Predicted Label", fontsize=14)
            plt.xticks(rotation=45, ha='right')
            plt.yticks(rotation=0)
            plt.tight_layout()
            #plt.savefig(ANALYSIS_OUTPUT_DIR / f"figure_4_2_{model}.png", dpi=300, bbox_inches='tight')
            plt.show()
    else:
        print("INFO: No models available for accuracy comparison after excluding the ground truth model.")

In [None]:
# === Confidence Level Analysis ===

print("Figure 5: Model Self-Reported Confidence Distribution")

# One column per model, named after the model itself
confidence_cols = [f"{m}_Certainty" for m in model_names]
confidence_df = master_df[confidence_cols].copy()
confidence_df.columns = model_names

# Share of each confidence level per model: rows are models, columns are levels.
# Levels a model never reported show up as 0.
level_shares = confidence_df.apply(lambda x: x.value_counts(normalize=True))
confidence_dist = level_shares.T.reindex(columns=['High', 'Medium', 'Low']).fillna(0)

fig_conf, ax_conf = plt.subplots(figsize=(14, 7))
confidence_dist.plot(
    kind='bar', stacked=True, ax=ax_conf,
    color=['#2ca02c', '#ff7f0e', '#d62728'] # Green, Orange, Red
)
ax_conf.set_title("Distribution of Self-Reported Confidence Levels")
ax_conf.set_ylabel("Proportion of Predictions")
ax_conf.set_xlabel("Model")
plt.setp(ax_conf.get_xticklabels(), rotation=45, ha='right')
ax_conf.legend(title="Confidence Level")
fig_conf.tight_layout()
#plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_5.png", dpi=300, bbox_inches='tight')
plt.show()


# --- F1-Score vs. Confidence ---
# For every model (except the one used as ground truth) the weighted F1 is computed
# separately on the rows the model labelled High, Medium and Low confidence.
if run_accuracy_analysis:
    print(f"\nTables 3 & 4 and Figures 6 & 7: F1-Score vs. Confidence (Ground Truth: {ground_truth_source})")

    confidence_levels = ['High', 'Medium', 'Low']

    confidence_accuracy_results = []
    for model in model_names:
        if model == GOLDEN_DATASET:
            continue
            
        for level in confidence_levels:
            subset_df = master_df[master_df[f"{model}_Certainty"] == level]
            if not subset_df.empty:
                # Calculate F1 for Maintenance Type
                df_maint = subset_df.dropna(subset=['Golden_Maint_Type', f"{model}_Maint_Type"])
                f1_maint = f1_score(df_maint['Golden_Maint_Type'], df_maint[f"{model}_Maint_Type"], average='weighted', zero_division=0)
                
                # --- Calculate F1 for Issue Category ---
                df_issue = subset_df.dropna(subset=['Golden_Issue_Cat', f"{model}_Issue_Cat"])
                f1_issue = f1_score(df_issue['Golden_Issue_Cat'], df_issue[f"{model}_Issue_Cat"], average='weighted', zero_division=0)
                
                confidence_accuracy_results.append({
                    "Model": model,
                    "Confidence": level,
                    "F1 Score (Maint. Type)": f1_maint,
                    "F1 Score (Issue Cat.)": f1_issue # Add to results
                })
    
    if confidence_accuracy_results:
        conf_acc_df = pd.DataFrame(confidence_accuracy_results)

        # pivot() sorts its columns alphabetically (High, Low, Medium). Re-order them to
        # High -> Medium -> Low, matching the confidence distribution figure, and keep
        # only the levels that actually occur so no all-empty column is introduced.
        observed_levels = set(conf_acc_df['Confidence'])
        ordered_levels = [lvl for lvl in confidence_levels if lvl in observed_levels]
        
        # --- Create and display table/plot for Maintenance Type ---
        print("\nTable 3: F1-Score (Maint. Type) vs. Confidence")
        conf_acc_pivot_maint = conf_acc_df.pivot(index='Model', columns='Confidence', values='F1 Score (Maint. Type)')
        conf_acc_pivot_maint = conf_acc_pivot_maint.loc[:, ordered_levels]
        display(conf_acc_pivot_maint.style.format("{:.3f}", na_rep="-").background_gradient(cmap='viridis'))
        
        conf_acc_pivot_maint.plot(kind='bar', figsize=(14,7))
        plt.title(f"F1-Score (Maint. Type) Stratified by Confidence (vs. {ground_truth_source})")
        plt.ylabel("Weighted F1-Score")
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0,1)
        #plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_6.png", dpi=300, bbox_inches='tight')
        plt.show()
        

        # --- Create and display table/plot for Issue Category ---
        print("\nTable 4: F1-Score (Issue Cat.) vs. Confidence")
        conf_acc_pivot_issue = conf_acc_df.pivot(index='Model', columns='Confidence', values='F1 Score (Issue Cat.)')
        conf_acc_pivot_issue = conf_acc_pivot_issue.loc[:, ordered_levels]
        display(conf_acc_pivot_issue.style.format("{:.3f}", na_rep="-").background_gradient(cmap='plasma'))

        conf_acc_pivot_issue.plot(kind='bar', figsize=(14,7))
        plt.title(f"F1-Score (Issue Cat.) Stratified by Confidence (vs. {ground_truth_source})")
        plt.ylabel("Weighted F1-Score")
        plt.xticks(rotation=45, ha='right')
        plt.ylim(0,1)
        #plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_7.png", dpi=300, bbox_inches='tight')
        plt.show()
        
        
    else:
        print("INFO: No data available for confidence vs. accuracy analysis.")
else:
    print("INFO: Skipping F1-Score vs. Confidence analysis because no ground truth is set.")

In [None]:
# === Consensus and Inter-Model Agreement ===

print("Table 5: Model Agreement with Consensus")

maint_type_cols = [f"{m}_Maint_Type" for m in model_names]
issue_cat_cols = [f"{m}_Issue_Cat" for m in model_names]

# Row-wise majority label across all analysed models (first mode per row)
consensus_maint = master_df[maint_type_cols].mode(axis=1)[0]
consensus_issue = master_df[issue_cat_cols].mode(axis=1)[0]


def _agreement_row(model):
    """Return one Table 5 record: the share of rows where `model` matches the consensus."""
    # All four columns side by side; a row is kept only when none of them is NaN
    paired = pd.DataFrame({
        'consensus_maint': consensus_maint,
        'model_maint': master_df[f"{model}_Maint_Type"],
        'consensus_issue': consensus_issue,
        'model_issue': master_df[f"{model}_Issue_Cat"]
    }).dropna()

    return {
        "Model": model,
        "Agreement with Consensus (Maint.)": accuracy_score(paired['consensus_maint'], paired['model_maint']),
        "Agreement with Consensus (Issue)": accuracy_score(paired['consensus_issue'], paired['model_issue'])
    }


agreement_results = [_agreement_row(model) for model in model_names]

agreement_df = pd.DataFrame(agreement_results).set_index("Model")
display(agreement_df.style.format("{:.2%}"))

def _pairwise_kappa_matrix(df, models, column_suffix):
    """Return a models x models DataFrame of Cohen's kappa for the '<model><column_suffix>' columns.

    Args:
        df: DataFrame holding one '<model><column_suffix>' column per model.
        models: Model names; used as both index and columns of the result.
        column_suffix: Column suffix selecting the task, e.g. "_Maint_Type".

    Returns:
        A float DataFrame. The diagonal is 1.0 (a model compared with itself);
        a pair with no rows where both models have a value is NaN.
    """
    models = list(models)
    matrix = pd.DataFrame(index=models, columns=models, dtype=float)
    for position, model1 in enumerate(models):
        # If we are comparing a model to itself, the score is always a perfect 1.0
        matrix.loc[model1, model1] = 1.0

        # Cohen's kappa is symmetric, so each unordered pair is scored once
        # and written to both cells.
        for model2 in models[position + 1:]:
            col1 = f"{model1}{column_suffix}"
            col2 = f"{model2}{column_suffix}"
            # Keep only rows where both models produced a value
            pair_df = df[[col1, col2]].dropna()

            if pair_df.empty:
                # If there's no overlap, the agreement score is undefined (NaN)
                score = np.nan
            else:
                # Force to string type AND extract the raw NumPy array with .values
                score = cohen_kappa_score(
                    pair_df[col1].astype(str).values,
                    pair_df[col2].astype(str).values,
                )

            matrix.loc[model1, model2] = score
            matrix.loc[model2, model1] = score
    return matrix


# --- Figure 8: Inter-Model Agreement Heatmap (Maintenance Type) ---
print("\nFigure 8: Inter-Model Agreement Heatmap (Maintenance Type)")

kappa_matrix = _pairwise_kappa_matrix(master_df, model_names, "_Maint_Type")

plt.figure(figsize=(12, 10))
sns.heatmap(kappa_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Pairwise Inter-Model Agreement (Cohen's Kappa for Maintenance Type)")
#plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_8.png", dpi=300, bbox_inches='tight')
plt.show()

# --- Figure 9: Inter-Model Agreement Heatmap (Issue Category) ---
print("\nFigure 9: Inter-Model Agreement Heatmap (Issue Category)")

# Same computation on the '_Issue_Cat' columns
kappa_matrix_issue = _pairwise_kappa_matrix(master_df, model_names, "_Issue_Cat")

plt.figure(figsize=(12, 10))
# Use the kappa_matrix_issue DataFrame and a different color map
sns.heatmap(kappa_matrix_issue, annot=True, cmap='viridis', fmt='.2f')
plt.title("Pairwise Inter-Model Agreement (Cohen's Kappa for Issue Category)")
#plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_9.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# === Summary and Export ===
#
# Writes the summary tables produced by earlier cells to ANALYSIS_OUTPUT_DIR as CSV.
# Each table is looked up and saved on its own, so a single missing DataFrame no
# longer prevents the remaining tables from being exported; every missing name is
# collected and reported once at the end.

print("INFO: Saving all generated tables...")

# (variable name, output file, only saved when the accuracy analysis ran, may be absent without error)
table_exports = [
    ("perf_summary_df", "table_1_performance_summary.csv", False, False),
    ("accuracy_summary_df", "table_2_accuracy_summary.csv", True, False),
    ("conf_acc_pivot_maint", "table_3_f1_vs_confidence_maint.csv", True, True),
    ("conf_acc_pivot_issue", "table_4_f1_vs_confidence_issue.csv", True, True),
    ("agreement_df", "table_5_consensus_agreement.csv", False, False),
]

# At the top level of a notebook cell, globals() is the namespace that holds the
# variables created by previous cells.
notebook_vars = globals()

# Names the export cannot do without; anything absent is reported instead of raising.
missing_names = [
    required_name
    for required_name in ("ANALYSIS_OUTPUT_DIR", "run_accuracy_analysis")
    if required_name not in notebook_vars
]
export_dir = notebook_vars.get("ANALYSIS_OUTPUT_DIR")
accuracy_was_run = bool(notebook_vars.get("run_accuracy_analysis", False))

for var_name, file_name, needs_accuracy, may_be_absent in table_exports:
    if needs_accuracy and not accuracy_was_run:
        continue
    if var_name not in notebook_vars:
        # The confidence pivot tables only exist if that analysis step was executed.
        if not may_be_absent:
            missing_names.append(var_name)
        continue
    if export_dir is not None:
        notebook_vars[var_name].to_csv(export_dir / file_name)

if missing_names:
    print(
        "\nERROR: A DataFrame was not found, so not all tables could be saved. "
        "Please check for errors in previous cells. "
        f"Details: missing {', '.join(missing_names)}"
    )
else:
    print(f"\nAnalysis complete. All tables and figures saved to: {ANALYSIS_OUTPUT_DIR}")

In [None]:
# ===================================================================
# === Addition: Visualisations used in the Academic Paper ===
# ===================================================================

print("--- Generating Curated Tables and Figures for Publication ---")

# Define the desired, consistent order for models in all assets
PAPER_MODEL_ORDER = [
    'gpt-5', 'gpt-5-mini', 'gpt-5-nano', 'o3', 'o4-mini', 
    'gemini-2.5-pro', 'gemini-2.5-flash', 
    'phi4_14b', 'gemma3_12b', 'llama3.1_8b', 'mistral_7b'
]

# --- 1. Final Summary Table: Overall Performance and Alignment ---
print("\n## Final Summary Table: Overall Performance and Alignment ##")

# Start from a copy of the performance summary so perf_summary_df itself is untouched.
final_summary_df = perf_summary_df.copy()

# When the reference is itself one of the models and has no row yet, give it a
# row with F1 = 1.0 for both tasks.
if GOLDEN_DATASET in model_names and GOLDEN_DATASET not in accuracy_summary_df.index:
    accuracy_summary_df.loc[GOLDEN_DATASET] = {"Maint. Type F1": 1.0, "Issue Cat. F1": 1.0}
# NOTE: this column is added to accuracy_summary_df in place; the F1 comparison
# chart further down in this cell sorts on it.
accuracy_summary_df['Avg F1 Score'] = accuracy_summary_df[['Maint. Type F1', 'Issue Cat. F1']].mean(axis=1)
final_summary_df = final_summary_df.join(accuracy_summary_df[['Avg F1 Score']])

# Average of the two consensus-agreement columns (also added to agreement_df in place).
agreement_df['Avg Consensus Agreement'] = agreement_df[['Agreement with Consensus (Maint.)', 'Agreement with Consensus (Issue)']].mean(axis=1)
final_summary_df = final_summary_df.join(agreement_df[['Avg Consensus Agreement']])

# Rows follow PAPER_MODEL_ORDER; models not in that list are left out, and listed
# models with no data at all are dropped.
final_summary_df = final_summary_df.reindex(PAPER_MODEL_ORDER).dropna(how='all')

# na_rep keeps missing values (e.g. a model with no cost figure) from being pushed
# through the format strings and rendered as "$nan" / "nan%".
display(final_summary_df.style.format({
    "Throughput (logs/s)": "{:.2f}",
    "Estimated Cost ($)": "${:.2f}",
    "Error Rate (%)": "{:.2f}%",
    "Avg F1 Score": "{:.2f}",
    "Avg Consensus Agreement": "{:.2%}"
}, na_rep="N/A"))
# Save the final summary table
final_summary_df.to_csv(ANALYSIS_OUTPUT_DIR / "final_summary_table.csv")


# --- 2. Combined Chart: Throughput vs. Cost (Tornado Chart) ---
print("\n## Figure: Throughput vs. Cost Trade-off ##")

# All models, sorted by ascending throughput; this index fixes the bar order for both panels.
throughput_df = perf_summary_df.sort_values('Throughput (logs/s)')
ordered_index = throughput_df.index

# Costs are aligned to that order. A missing cost becomes 0, and every zero-cost
# row is flagged so its bars can be drawn dimmed.
cost_series = perf_summary_df['Estimated Cost ($)'].reindex(ordered_index).fillna(0)
is_local = cost_series.eq(0)

fig, axes = plt.subplots(figsize=(12, 8), ncols=2, sharey=True)
throughput_ax, cost_ax = axes
fig.suptitle('Throughput vs. Cost for API and Local Models', fontsize=18)

# Left panel: throughput, with the x axis reversed so bars grow towards the left.
left_bars = throughput_ax.barh(ordered_index, throughput_df['Throughput (logs/s)'], color='seagreen')
throughput_ax.set_xlabel('Throughput (logs/s)')
throughput_ax.invert_xaxis()

# Right panel: cost, in the same row order.
right_bars = cost_ax.barh(ordered_index, cost_series, color='crimson')
cost_ax.set_xlabel('Estimated Cost ($)')

# One pass over both panels: zero-cost rows are dimmed on the left, and dimmed,
# hatched and outlined on the right.
for zero_cost, speed_rect, cost_rect in zip(is_local, left_bars.patches, right_bars.patches):
    if not zero_cost:
        continue
    speed_rect.set_alpha(0.4)
    cost_rect.set_alpha(0.4)
    cost_rect.set_hatch('///')
    cost_rect.set_edgecolor('dimgray')
    cost_rect.set_linewidth(0.5)

# The y axis is shared (sharey=True), so inverting it on the left panel flips both.
throughput_ax.invert_yaxis()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_1.png", dpi=300, bbox_inches='tight')
plt.show()



# --- 3. F1-Score Comparison Bar Chart (Sorted by Average, gpt-5 excluded) ---
print("\n## Figure: F1-Score Comparison (Sorted by Average) ##")

# Leave out the reference row (if there is one) and rank the rest by average F1, best first.
plot_df_f1 = accuracy_summary_df.drop(index=GOLDEN_DATASET, errors='ignore')
plot_df_f1_sorted = plot_df_f1.sort_values('Avg F1 Score', ascending=False)

# Grouped bars: one group per model, one bar per task.
f1_columns = ['Maint. Type F1', 'Issue Cat. F1']
f1_ax = plot_df_f1_sorted[f1_columns].plot(kind='bar', figsize=(14, 7))
f1_ax.set_title(f"F1-Score for Alignment with {ground_truth_source} (Sorted by Average)")
f1_ax.set_ylabel("Weighted F1-Score")
f1_ax.set_xlabel("Model")
plt.xticks(rotation=45, ha='right')
f1_ax.set_ylim(0, 1)
f1_ax.legend()
plt.tight_layout()
plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_2.png", dpi=300, bbox_inches='tight')
plt.show()



# --- 4. Master Chart for F1-Score vs. Confidence ---
print("\n## Figure: Model Calibration: Average F1-Score vs. Confidence ##")

# Per-confidence F1 is recomputed here for every model in model_names, so the
# GOLDEN_DATASET model is included when it is one of them.
conf_rows = []
for model in model_names:
    certainty_col = f"{model}_Certainty"
    # A model without a certainty column contributes no rows at all.
    if certainty_col not in master_df.columns:
        continue

    maint_col = f"{model}_Maint_Type"
    issue_col = f"{model}_Issue_Cat"

    for level in ('High', 'Medium', 'Low'):
        level_rows = master_df.loc[master_df[certainty_col] == level]
        if level_rows.empty:
            continue

        # Each task is scored only on rows where both the golden and the model label exist.
        maint_pairs = level_rows.dropna(subset=['Golden_Maint_Type', maint_col])
        issue_pairs = level_rows.dropna(subset=['Golden_Issue_Cat', issue_col])
        if maint_pairs.empty or issue_pairs.empty:
            continue

        task_scores = [
            f1_score(maint_pairs['Golden_Maint_Type'], maint_pairs[maint_col], average='weighted', zero_division=0),
            f1_score(issue_pairs['Golden_Issue_Cat'], issue_pairs[issue_col], average='weighted', zero_division=0),
        ]
        conf_rows.append({
            "Model": model,
            "Confidence": level,
            "Avg F1 Score": float(np.mean(task_scores)),
        })

conf_acc_df_plot = pd.DataFrame(conf_rows)

if conf_acc_df_plot.empty:
    print("INFO: Confidence vs. F1-Score data not available to plot.")
else:
    conf_pivot_avg = conf_acc_df_plot.pivot(index='Model', columns='Confidence', values='Avg F1 Score')

    # Columns in High / Medium / Low order, limited to the levels that actually occur.
    desired_order = ['High', 'Medium', 'Low']
    present_levels = [lvl for lvl in desired_order if lvl in conf_pivot_avg.columns]
    conf_pivot_avg = conf_pivot_avg.reindex(columns=present_levels)

    # Rank models on 'High' when present; otherwise on the first level that is
    # (present_levels starts with 'High' whenever it exists).
    sort_col = present_levels[0]
    conf_pivot_avg = conf_pivot_avg.sort_values(sort_col, ascending=False)

    # Fixed colour per confidence level
    confidence_colors = {
        'High':   '#1f77b4',  # blue
        'Medium': '#2ca02c',  # green
        'Low':    '#ff7f0e',  # orange
    }
    bar_colors = [confidence_colors[lvl] for lvl in conf_pivot_avg.columns]

    ax = conf_pivot_avg.plot(kind='bar', figsize=(14, 7), color=bar_colors)

    # Dim the bars belonging to the GOLDEN_DATASET model, if it has a row.
    if GOLDEN_DATASET in conf_pivot_avg.index:
        gpt5_index = conf_pivot_avg.index.get_loc(GOLDEN_DATASET)
        for container in ax.containers:  # one BarContainer per confidence level
            level_bars = container.patches
            if gpt5_index < len(level_bars):
                level_bars[gpt5_index].set_alpha(0.4)

    # The legend uses full-opacity proxy patches built from the same colour mapping.
    from matplotlib.patches import Patch
    legend_handles = []
    for label in conf_pivot_avg.columns:
        legend_handles.append(Patch(facecolor=confidence_colors[label], label=label, alpha=1.0))
    ax.legend(handles=legend_handles, title="Confidence Level")

    ax.set_title("Model Calibration: Average F1-Score vs. Confidence")
    ax.set_ylabel("Average Weighted F1-Score")
    ax.set_xlabel("Model")
    plt.xticks(rotation=45, ha='right')
    ax.set_ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_3.png", dpi=300, bbox_inches='tight')
    plt.show()



# --- 5. Confusion Matrix (Top Model, No Labels, Bigger Squares) ---
print("\n## Figure: Confusion Matrix Pattern (Top Model, Issue Category) ##")

# Candidates are all models except the reference itself; the drop is computed once
# and reused for both the emptiness check and the ranking.
candidate_models_df = accuracy_summary_df.drop(index=GOLDEN_DATASET, errors='ignore')
if not candidate_models_df.empty:
    # Top model = highest Issue Category F1 among the candidates.
    top_model = candidate_models_df.sort_values('Issue Cat. F1', ascending=False).index[0]
    top_model_col = f"{top_model}_Issue_Cat"

    plt.figure(figsize=(14, 12))
    # Only rows where both the golden label and the model's label are present.
    temp_df_issue = master_df[['Golden_Issue_Cat', top_model_col]].dropna()
    # Label set = every distinct value seen in either column, so the matrix is square.
    issue_labels = pd.unique(temp_df_issue[['Golden_Issue_Cat', top_model_col]].values.ravel('K'))
    cm_issue = confusion_matrix(temp_df_issue['Golden_Issue_Cat'], temp_df_issue[top_model_col], labels=issue_labels)
    # Tick labels are hidden; the cell counts are still annotated.
    sns.heatmap(cm_issue, annot=True, fmt='d', cmap='Reds', xticklabels=False, yticklabels=False, annot_kws={"size": 8})
    plt.title(f"Confusion Matrix Pattern for Issue Category Labels Generated by '{top_model}'")
    plt.ylabel("Reference Label")  # fixed typo: was "Refernce Label"
    plt.xlabel("Generated Label")
    plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_4.png", dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("INFO: No accuracy data available to generate confusion matrix.")


# --- 6. Averaged Kappa Heatmap ---
print("\n## Figure: Averaged Inter-Model Agreement Heatmap ##")

# Both pairwise kappa matrices must already exist in the notebook namespace.
kappa_inputs_ready = 'kappa_matrix' in locals() and 'kappa_matrix_issue' in locals()

if not kappa_inputs_ready:
    print("INFO: Kappa matrices not available to generate averaged heatmap.")
else:
    # Element-wise mean of the maintenance-type and issue-category matrices.
    kappa_matrix_avg = (kappa_matrix + kappa_matrix_issue) / 2

    # Arrange rows and columns in PAPER_MODEL_ORDER, then drop any row or column
    # that is entirely empty.
    kappa_matrix_sorted = (
        kappa_matrix_avg
        .reindex(index=PAPER_MODEL_ORDER, columns=PAPER_MODEL_ORDER)
        .dropna(how='all', axis=0)
        .dropna(how='all', axis=1)
    )

    fig_kappa_avg, ax_kappa_avg = plt.subplots(figsize=(12, 10))
    sns.heatmap(kappa_matrix_sorted, annot=True, cmap='coolwarm', fmt='.2f', ax=ax_kappa_avg)
    ax_kappa_avg.set_title("Average Pairwise Inter-Model Agreement (Cohen's Kappa)")
    plt.savefig(ANALYSIS_OUTPUT_DIR / "figure_5.png", dpi=300, bbox_inches='tight')
    plt.show()