In [5]:
import logging
import json
import colorlog
import matplotlib.pyplot as plt
import seaborn as sns

# Console handler whose records are coloured according to their severity.
handler = logging.StreamHandler()
formatter = colorlog.ColoredFormatter(
    "%(log_color)s%(levelname)s: %(message)s",
    log_colors=dict(
        DEBUG="cyan",
        INFO="green",
        WARNING="yellow",
        ERROR="red",
        CRITICAL="bold_red",
    ),
)
handler.setFormatter(formatter)

# Module-level logger: INFO and above, detached from the root logger so that
# records are emitted only by the handler installed below.
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(logging.INFO)

# Re-running this cell must not stack handlers (each extra handler would print
# every message once more), so drop whatever is attached before adding ours.
logger.handlers.clear()
logger.addHandler(handler)

In [6]:
# Default seaborn colour cycle for the plots drawn in this notebook.
sns.set_palette("tab10")

# Metric names shared by the evaluation helpers and the plotting code.
monitored_metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]

# Data-type names compared in the plotting section.
data_types = ["ast_cfg", "ast", "cfg"]

# Root folder of the explanation outputs (later cells rebind this name).
base_folder = "../explanations"

# Column layout for result tables: the data type followed by one column per metric.
columns = ["Data Type", *monitored_metrics]

In [7]:
def get_json_files(directory):
    """Return the paths of the ``.json`` entries located directly in ``directory``.

    A missing directory is reported with a warning and yields an empty list.
    """
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        logger.warning(f"Directory not found: {directory}")
        return []

    json_paths = []
    for entry in entries:
        if entry.endswith(".json"):
            json_paths.append(os.path.join(directory, entry))
    return json_paths


def load_json(file_path):
    """Read ``file_path`` as UTF-8 and return the decoded JSON content.

    Returns ``None`` (after logging an error) when the file is missing or does
    not contain valid JSON.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
        logger.error(f"Error processing file '{file_path}': {e}")
        return None
    return parsed


def compute_category_metrics(directory, gt_category):
    """
    Computes Accuracy, Precision, Recall, and F1-score for a category using sklearn.

    Every JSON file in ``directory`` that loads successfully is one sample: its
    true label is ``gt_category`` and its predicted label is the file's
    ``"classification"`` value. Labels are compared case-insensitively.
    Precision, recall and F1 are computed for ``gt_category`` as the positive
    class.

    :param directory: Path to the directory containing JSON log files.
    :param gt_category: The ground truth category.
    :return: Dictionary with the keys "Accuracy", "Precision", "Recall",
             "F1 Score" and "Total" (number of JSON files found in ``directory``).
             All metrics are 0.0 when no file could be scored.
    """
    json_files = get_json_files(directory)
    total_files = len(json_files)
    zero_metrics = {"Accuracy": 0.0, "Precision": 0.0, "Recall": 0.0, "F1 Score": 0.0}

    if total_files == 0:
        return {**zero_metrics, "Total": 0}

    gt_category = gt_category.lower()
    y_true, y_pred = [], []

    for file in json_files:
        content = load_json(file)
        if not content:
            continue

        # A missing, null or non-string classification must not crash the loop;
        # a missing/empty value keeps the "" placeholder label.
        raw_prediction = content.get("classification") if isinstance(content, dict) else None
        predicted_category = str(raw_prediction).lower() if raw_prediction else ""
        y_true.append(gt_category)
        y_pred.append(predicted_category)

    if not y_true:
        # Files were found but none could be loaded: avoid calling sklearn with empty inputs.
        return {**zero_metrics, "Total": total_files}

    # Compute metrics using sklearn.
    # Restricting `labels` to the ground-truth class with micro averaging gives the
    # same numbers as average="binary" with pos_label=gt_category, but it also works
    # when the predictions contain more than two distinct labels (average="binary"
    # raises ValueError on multiclass targets).
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=[gt_category], average="micro", zero_division=0)
    recall = recall_score(y_true, y_pred, labels=[gt_category], average="micro", zero_division=0)
    f1 = f1_score(y_true, y_pred, labels=[gt_category], average="micro", zero_division=0)

    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1, "Total": total_files}


def compute_overall_metrics(results):
    """
    Combines per-category results into overall metrics.

    Each metric listed in ``monitored_metrics`` is averaged across categories,
    weighting every category by its "Total" entry.

    :param results: Dictionary of category-wise metric results.
    :return: Dictionary containing overall accuracy, precision, recall, and f1-score
             (all 0.0, with a warning, when the totals add up to zero).
    """
    per_category = list(results.values())

    total_all = 0
    for res in per_category:
        total_all += res["Total"]

    if total_all == 0:
        logger.warning("No JSON files found in any category.")
        return {"Accuracy": 0.0, "Precision": 0.0, "Recall": 0.0, "F1 Score": 0.0}

    # Weighted average of all metrics
    weighted_metrics = {}
    for key in monitored_metrics:
        weighted_sum = sum(res[key] * res["Total"] for res in per_category)
        weighted_metrics[key] = weighted_sum / total_all

    return weighted_metrics

# Baseline

In [8]:
import os
import numpy as np
import pandas as pd
import logging
from collections import defaultdict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    # classification_report # Could be useful for verbose printing per fold
)

# --- Logger Setup (Basic) ---
# Configure root logging with a plain "LEVEL: message" layout at INFO level
# (basicConfig is a no-op when the root logger already has handlers), then bind
# `logger` to the logger named after this module.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


# --- Utility Functions (get_json_files, load_json remain the same) ---
def get_json_files(directory):
    """List the ``.json`` files that sit directly inside ``directory``.

    Returns their paths (``directory`` joined with the file name). If the
    directory does not exist, a warning is logged and ``[]`` is returned.
    """
    try:
        names = os.listdir(directory)
    except FileNotFoundError:
        logger.warning(f"Directory not found: {directory}")
        return []

    json_names = filter(lambda name: name.endswith(".json"), names)
    return [os.path.join(directory, name) for name in json_names]


def load_json(file_path):
    """Decode the JSON file at ``file_path`` (opened as UTF-8).

    On a missing file or malformed JSON the error is logged and ``None`` is
    returned instead of raising.
    """
    result = None
    try:
        with open(file_path, "r", encoding="utf-8") as stream:
            result = json.load(stream)
    except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
        logger.error(f"Error processing file '{file_path}': {e}")
    return result


# --- Main Script ---
base_folder = "../explanations/prompting/baseline"  # Make sure this path is correct
num_folds = 3

# --- 1. Identify Models and Categories ---
# One directory per cross-validation fold: <base_folder>/cv_1 ... cv_<num_folds>
fold_dirs = [os.path.join(base_folder, f"cv_{i + 1}") for i in range(num_folds)]
first_fold_dir = fold_dirs[0]

# Abort with SystemExit instead of exit(): exit() is the interactive helper and,
# inside a Jupyter kernel, asks the kernel itself to shut down. Raising SystemExit
# stops this cell (and a script run) without taking the kernel state with it.
if not os.path.isdir(first_fold_dir):
    logger.error(f"First fold directory not found: {first_fold_dir}")
    raise SystemExit(1)

# The model list is taken from the first fold; the other folds are expected to
# contain the same model directories (missing ones are skipped later).
available_models = [
    model_dir for model_dir in os.listdir(first_fold_dir)
    if os.path.isdir(os.path.join(first_fold_dir, model_dir))
]

if not available_models:
    logger.error(f"No model directories found in {first_fold_dir}")
    raise SystemExit(1)

logger.info(f"Found models: {available_models}")

baseline_results = {}  # To store aggregated mean/std dev results

# Define overall metrics to be tracked and averaged
overall_metric_keys = [
    "Accuracy",
    "Precision_weighted",
    "Recall_weighted",
    "F1 Score_weighted"
]

# --- 2. Evaluate every model on every fold ---
# For each model: collect (true, predicted) labels per fold from
# <fold>/<model>/<category>/*.json, compute overall and per-category metrics for
# the fold, then aggregate mean / std across folds into baseline_results.
for model_name in available_models:
    logger.info(f"\n===== PROCESSING MODEL: {model_name} =====")

    # model_fold_metrics[<'overall' | category>][<metric>] -> list with one value per fold
    model_fold_metrics = defaultdict(lambda: defaultdict(list))
    all_true_categories_for_model = set()  # To collect all unique true category names like {'reentrant', 'safe'}

    for i, fold_dir_path in enumerate(fold_dirs):
        fold_num = i + 1
        model_path_in_fold = os.path.join(fold_dir_path, model_name)
        logger.info(f"\n  --- Processing Fold {fold_num} ({model_path_in_fold}) ---")

        # isdir rather than exists: a plain file with the model's name must not reach os.listdir.
        if not os.path.isdir(model_path_in_fold):
            logger.warning(f"  Model directory not found in this fold: {model_path_in_fold}")
            continue

        try:
            categories_in_fold_model_dir = {
                subdir: os.path.join(model_path_in_fold, subdir)
                for subdir in os.listdir(model_path_in_fold)
                if os.path.isdir(os.path.join(model_path_in_fold, subdir))
            }
        except FileNotFoundError:
            logger.error(f"  Error listing categories in: {model_path_in_fold}")
            continue

        if not categories_in_fold_model_dir:
            logger.warning(f"  No category subdirectories found in {model_path_in_fold}")
            continue

        y_true_for_fold = []
        y_pred_for_fold = []
        current_fold_true_categories = set()  # True categories present in this specific fold

        for true_category_name, category_path in categories_in_fold_model_dir.items():
            # The directory name is the ground-truth label of every file inside it.
            true_category_lower = true_category_name.lower()

            # Basic check for expected binary labels from directory names
            if true_category_lower not in ["reentrant", "safe"]:
                logger.warning(
                    f"    Unexpected category directory name found: {true_category_name}. Expected 'reentrant' or 'safe'. Skipping this directory.")
                continue

            current_fold_true_categories.add(true_category_lower)
            all_true_categories_for_model.add(true_category_lower)

            json_files = get_json_files(category_path)
            if not json_files:
                logger.warning(f"    No JSON files found in {category_path} for category '{true_category_name}'")
                continue

            for file_path in json_files:
                content = load_json(file_path)
                if not content:
                    continue

                # A missing, empty, null or otherwise unusable classification is kept as a
                # sample with a placeholder label instead of crashing on `.lower()`.
                raw_prediction = content.get("classification") if isinstance(content, dict) else None
                if raw_prediction:
                    predicted_category = str(raw_prediction).lower()
                else:
                    predicted_category = "unknown_prediction"  # Or some other placeholder

                y_true_for_fold.append(true_category_lower)
                y_pred_for_fold.append(predicted_category)

        if not y_true_for_fold:
            logger.warning(
                f"  No data collected for model {model_name} in fold {fold_num}. Metrics for this fold will be NaN.")
            continue

        # --- Calculate metrics for the current fold ---
        unique_labels_in_fold = sorted(list(set(y_true_for_fold) | set(y_pred_for_fold)))

        # Overall metrics for this fold
        fold_overall_metrics = {}
        fold_overall_metrics["Accuracy"] = accuracy_score(y_true_for_fold, y_pred_for_fold)

        fold_overall_metrics["Precision_weighted"] = precision_score(
            y_true_for_fold, y_pred_for_fold,
            labels=unique_labels_in_fold, average='weighted', zero_division=0
        )
        fold_overall_metrics["Recall_weighted"] = recall_score(
            y_true_for_fold, y_pred_for_fold,
            labels=unique_labels_in_fold, average='weighted', zero_division=0
        )
        fold_overall_metrics["F1 Score_weighted"] = f1_score(
            y_true_for_fold, y_pred_for_fold,
            labels=unique_labels_in_fold, average='weighted', zero_division=0
        )

        logger.info(f"  Fold {fold_num} Overall - "
                    f"Acc: {fold_overall_metrics['Accuracy']:.2%}, "
                    f"Prec (weighted): {fold_overall_metrics['Precision_weighted']:.2%}, "
                    f"Rec (weighted): {fold_overall_metrics['Recall_weighted']:.2%}, "
                    f"F1 (weighted): {fold_overall_metrics['F1 Score_weighted']:.2%}")

        for key, value in fold_overall_metrics.items():
            model_fold_metrics['overall'][key].append(value)

        # Per-category metrics for this fold (for both "reentrant" and "safe")
        fold_per_category_metrics = {}
        sorted_true_categories_in_fold = sorted(list(current_fold_true_categories))

        for cat_name in sorted_true_categories_in_fold:  # Should be 'reentrant' and/or 'safe'
            if cat_name not in unique_labels_in_fold:
                prec, rec, f1, support = np.nan, np.nan, np.nan, 0
                logger.warning(
                    f"    Category '{cat_name}' (true label) not in unique_labels_in_fold for fold {fold_num}. Setting its metrics to NaN/0.")
            else:
                # One-vs-rest score for this category. Restricting `labels` to the category
                # with micro averaging yields the same value as average='binary' with
                # pos_label=cat_name on two-label data, but average='binary' raises
                # ValueError as soon as the fold holds a third label (for example the
                # "unknown_prediction" placeholder).
                prec = precision_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                                       average='micro', zero_division=0)
                rec = recall_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                                   average='micro', zero_division=0)
                f1 = f1_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                              average='micro', zero_division=0)
                support = y_true_for_fold.count(cat_name)

            fold_per_category_metrics[cat_name] = {"Precision": prec, "Recall": rec, "F1 Score": f1, "Support": support}
            logger.info(f"    Cat '{cat_name}': P: {prec:.2%}, R: {rec:.2%}, F1: {f1:.2%}, Sup: {support}")

            model_fold_metrics[cat_name]["Precision"].append(prec)
            model_fold_metrics[cat_name]["Recall"].append(rec)
            model_fold_metrics[cat_name]["F1 Score"].append(f1)
            model_fold_metrics[cat_name]["Support"].append(support)

    # --- 3. Calculate Mean and Std Dev for the current model ---
    expected_categories_for_model = sorted(list(all_true_categories_for_model))

    # Skipped folds appended nothing above; pad every metric list to num_folds entries
    # (NaN for scores, 0 for Support) so each list has one slot per fold.
    for key_to_summarize in expected_categories_for_model + ['overall']:
        metric_names_to_check = []
        if key_to_summarize == 'overall':
            metric_names_to_check = overall_metric_keys
        else:  # per-category
            metric_names_to_check = ["Precision", "Recall", "F1 Score", "Support"]

        for metric_name in metric_names_to_check:
            current_values = model_fold_metrics[key_to_summarize].get(metric_name, [])
            while len(current_values) < num_folds:
                current_values.append(np.nan if metric_name != "Support" else 0)
            model_fold_metrics[key_to_summarize][metric_name] = current_values[:num_folds]

    logger.info(f"\n  --- Aggregated Results for Model: {model_name} (across {num_folds} folds) ---")
    model_summary = {}

    category_keys_to_process = expected_categories_for_model + ['overall']

    for key_to_summarize in category_keys_to_process:
        if not model_fold_metrics[key_to_summarize]:
            logger.info(f"    No data to aggregate for '{key_to_summarize.capitalize()}'.")
            continue

        summary_metrics = {}
        logger.info(f"    {key_to_summarize.capitalize()}:")

        metrics_to_aggregate_names = overall_metric_keys if key_to_summarize == 'overall' else ["Precision", "Recall",
                                                                                                "F1 Score", "Support"]

        for metric_name in metrics_to_aggregate_names:
            values = model_fold_metrics[key_to_summarize].get(metric_name, [np.nan] * num_folds)

            # NaN marks a fold without a usable value; mean/std use the remaining folds only.
            valid_values = [v for v in values if not np.isnan(v)]
            num_processed_folds = len(values)
            num_valid_folds = len(valid_values)

            if num_valid_folds > 0:
                mean_val = np.mean(valid_values)
                # np.std default: population standard deviation (ddof=0).
                std_dev = np.std(valid_values) if num_valid_folds > 1 else 0.0
                summary_metrics[f"{metric_name}_mean"] = mean_val
                summary_metrics[f"{metric_name}_std"] = std_dev
                if metric_name == "Support":
                    logger.info(
                        f"      {metric_name}: {mean_val:.2f} ± {std_dev:.2f} (from {num_valid_folds}/{num_processed_folds} folds)")
                else:
                    logger.info(
                        f"      {metric_name}: {mean_val:.2%} ± {std_dev:.2%} (from {num_valid_folds}/{num_processed_folds} folds)")
            else:
                summary_metrics[f"{metric_name}_mean"] = np.nan
                summary_metrics[f"{metric_name}_std"] = np.nan
                logger.info(f"      {metric_name}: N/A (No valid data across {num_processed_folds} folds)")

        model_summary[key_to_summarize] = summary_metrics
    baseline_results[model_name] = model_summary

# --- 4. Print Final Summary Table & Save CSV ---
# Log the overall mean ± std per model and collect the same values for the CSV export.
logger.info("\n\n===== FINAL CROSS-VALIDATION SUMMARY (Overall Weighted Metrics & Accuracy) =====")
final_summary_for_csv = {}

for model_name, summary_data in baseline_results.items():
    logger.info(f"\n--- Model: {model_name} ---")

    # Models without an (non-empty) 'overall' entry are only mentioned in the log.
    if not summary_data.get('overall'):
        logger.info(f"  No overall metrics summary available for {model_name}.")
        continue

    overall_metrics_summary = summary_data['overall']  # Renamed for clarity
    final_summary_for_csv[model_name] = overall_metrics_summary
    logger.info(f"  Overall Performance (Accuracy & Weighted Averages):")
    logger.info(
        f"    Accuracy:  {overall_metrics_summary.get('Accuracy_mean', np.nan):.2%} ± {overall_metrics_summary.get('Accuracy_std', np.nan):.2%}")
    logger.info(
        f"    Precision (weighted): {overall_metrics_summary.get('Precision_weighted_mean', np.nan):.2%} ± {overall_metrics_summary.get('Precision_weighted_std', np.nan):.2%}")
    logger.info(
        f"    Recall (weighted):    {overall_metrics_summary.get('Recall_weighted_mean', np.nan):.2%} ± {overall_metrics_summary.get('Recall_weighted_std', np.nan):.2%}")
    logger.info(
        f"    F1 Score (weighted):  {overall_metrics_summary.get('F1 Score_weighted_mean', np.nan):.2%} ± {overall_metrics_summary.get('F1 Score_weighted_std', np.nan):.2%}")

if not final_summary_for_csv:
    logger.info("\nNo data to save to CSV.")
else:
    # One row per model; columns ordered as <metric>_mean, <metric>_std for each overall metric.
    df_summary = pd.DataFrame.from_dict(final_summary_for_csv, orient='index')

    csv_columns_ordered = [
        column_name
        for metric_base_name in overall_metric_keys
        for column_name in (f"{metric_base_name}_mean", f"{metric_base_name}_std")
    ]

    df_summary = df_summary.reindex(columns=csv_columns_ordered)

    try:
        output_csv_name = "baseline_classification_weighted_metrics.csv"
        df_summary.to_csv(output_csv_name)
        logger.info(f"\nSaved {output_csv_name}")
    except Exception as e:
        logger.error(f"Error saving CSV: {e}")

logger.info("\nProcessing Complete.")

[32mINFO: Found models: ['gpt-4.1-nano', 'o4-mini', 'gpt-4.1', 'gpt-4o', 'o3-mini', 'gpt-4.1-mini', 'gemini-1.5-flash', 'gemini-2.5-flash-preview-05-20', 'gemini-2.0-flash'][0m
[32mINFO: 
===== PROCESSING MODEL: gpt-4.1-nano =====[0m
[32mINFO: 
  --- Processing Fold 1 (../explanations/prompting/baseline/cv_1/gpt-4.1-nano) ---[0m
[32mINFO:   Fold 1 Overall - Acc: 71.92%, Prec (weighted): 51.72%, Rec (weighted): 71.92%, F1 (weighted): 60.17%[0m
[32mINFO:     Cat 'reentrant': P: 0.00%, R: 0.00%, F1: 0.00%, Sup: 41[0m
[32mINFO:     Cat 'safe': P: 71.92%, R: 100.00%, F1: 83.67%, Sup: 105[0m
[32mINFO: 
  --- Processing Fold 2 (../explanations/prompting/baseline/cv_2/gpt-4.1-nano) ---[0m
[32mINFO:   Fold 2 Overall - Acc: 71.92%, Prec (weighted): 51.72%, Rec (weighted): 71.92%, F1 (weighted): 60.17%[0m
[32mINFO:     Cat 'reentrant': P: 0.00%, R: 0.00%, F1: 0.00%, Sup: 41[0m
[32mINFO:     Cat 'safe': P: 71.92%, R: 100.00%, F1: 83.67%, Sup: 105[0m
[32mINFO: 
  --- Processing

# XRAG Models Evaluation

In [9]:
import os
import json
import numpy as np
import logging
from collections import defaultdict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# --- Logger Setup ---
# Plain "LEVEL: message" root configuration at INFO level (a no-op if the root
# logger already has handlers), followed by the module-named logger used below.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


# --- Utility Functions ---
def get_json_files(category_directory):
    """
    Collects the 'classification.json' files stored one level below a category folder.
    Path structure: .../[CATEGORY]/[CONTRACT ADDRESS]/classification.json

    Only sub-directories of ``category_directory`` are inspected; a sub-directory
    without a 'classification.json' file contributes nothing. Returns an empty
    list when ``category_directory`` is not a directory.
    """
    # A category may legitimately be absent from a fold, so this is not logged.
    if not os.path.isdir(category_directory):
        return []

    # Candidate path for every contract-address sub-directory, in listing order.
    candidate_paths = (
        os.path.join(category_directory, entry, "classification.json")
        for entry in os.listdir(category_directory)
        if os.path.isdir(os.path.join(category_directory, entry))
    )
    return [path for path in candidate_paths if os.path.isfile(path)]


def load_json(file_path):
    """Open ``file_path`` as UTF-8 text and return its parsed JSON content.

    Missing files and invalid JSON are logged as errors; ``None`` is returned
    in that case.
    """
    try:
        stream = open(file_path, "r", encoding="utf-8")
        with stream:
            return json.load(stream)
    except (json.JSONDecodeError, FileNotFoundError, KeyError) as e:
        logger.error(f"Error processing file '{file_path}': {e}")
    return None


# --- Main Script ---
# CONFIGURATION
# ==============================================================================
base_folder = "../explanations/prompting/rag"  # Make sure this path is correct
num_folds = 3
# ==============================================================================

# --- 1. Identify Models and Data Types from the first fold ---
# Abort paths raise SystemExit instead of calling exit(): exit() is the interactive
# helper and, inside a Jupyter kernel, asks the kernel itself to shut down. Raising
# SystemExit stops this cell (and a script run) while keeping the kernel state.
first_fold_dir = os.path.join(base_folder, "cv_1")
if not os.path.isdir(first_fold_dir):
    logger.error(f"First fold directory not found: {first_fold_dir}")
    raise SystemExit(1)

# Get available models from the first fold
available_models = [d for d in os.listdir(first_fold_dir) if os.path.isdir(os.path.join(first_fold_dir, d))]
if not available_models:
    logger.error(f"No model directories found in {first_fold_dir}")
    raise SystemExit(1)
logger.info(f"Found models: {available_models}")

# Get available data types from the first model in the first fold
# (every other model/fold is expected to use the same data-type directories;
# combinations that are missing are skipped with a warning later on).
first_model_path = os.path.join(first_fold_dir, available_models[0])
available_data_types = [d for d in os.listdir(first_model_path) if os.path.isdir(os.path.join(first_model_path, d))]
if not available_data_types:
    logger.error(f"No data type directories found in {first_model_path}")
    raise SystemExit(1)
logger.info(f"Found data types: {available_data_types}")

# --- 2. Process each model and data type across all folds ---
all_results = {}
# One directory per cross-validation fold: <base_folder>/cv_1 ... cv_<num_folds>
fold_dirs = [os.path.join(base_folder, f"cv_{i + 1}") for i in range(num_folds)]

# Define overall metrics to be tracked and averaged
overall_metric_keys = ["Accuracy", "Precision_weighted", "Recall_weighted", "F1 Score_weighted"]
category_metric_keys = ["Precision", "Recall", "F1 Score", "Support"]

# For every (model, data type) pair: collect (true, predicted) labels per fold from
# <fold>/<model>/<data type>/<category>/<contract>/classification.json, compute the
# fold metrics, then aggregate mean / std across folds into all_results.
for model_name in available_models:
    for data_type in available_data_types:
        logger.info(f"\n===== PROCESSING MODEL: {model_name} | DATA TYPE: {data_type} =====")

        # Stores metrics for each fold for the current model/data_type
        # e.g., model_fold_metrics['overall']['Accuracy'] = [0.9, 0.92, 0.88]
        model_fold_metrics = defaultdict(lambda: defaultdict(list))

        all_true_categories_for_model = set()

        for i, fold_dir_path in enumerate(fold_dirs):
            fold_num = i + 1
            # Construct the path for the specific model and data type within the current fold
            data_type_path_in_fold = os.path.join(fold_dir_path, model_name, data_type)

            logger.info(f"\n  --- Processing Fold {fold_num} ({data_type_path_in_fold}) ---")

            if not os.path.isdir(data_type_path_in_fold):
                logger.warning(f"  Directory not found in this fold, skipping: {data_type_path_in_fold}")
                # Append NaN for all metrics for this fold to keep array lengths consistent
                for key in overall_metric_keys:
                    model_fold_metrics['overall'][key].append(np.nan)
                for cat in ["reentrant", "safe"]:
                    for key in category_metric_keys:
                        model_fold_metrics[cat][key].append(np.nan if key != "Support" else 0)
                continue

            y_true_for_fold, y_pred_for_fold = [], []
            current_fold_true_categories = set()

            # The category directories ('reentrant', 'safe') are inside the data_type path
            category_names = [d for d in os.listdir(data_type_path_in_fold) if
                              os.path.isdir(os.path.join(data_type_path_in_fold, d))]

            for true_category_name in category_names:
                # The directory name is the ground-truth label of every contract inside it.
                true_category_lower = true_category_name.lower()

                if true_category_lower not in ["reentrant", "safe"]:
                    logger.warning(f"    Unexpected category directory '{true_category_name}', skipping.")
                    continue

                current_fold_true_categories.add(true_category_lower)
                all_true_categories_for_model.add(true_category_lower)

                category_path = os.path.join(data_type_path_in_fold, true_category_name)
                json_files = get_json_files(category_path)

                for file_path in json_files:
                    content = load_json(file_path)
                    if not content: continue

                    # Missing key -> "unknown"; a null / non-string value or a non-dict
                    # document is treated the same way instead of crashing on `.lower()`.
                    raw_prediction = content.get("classification", "unknown") if isinstance(content, dict) else "unknown"
                    predicted_category = raw_prediction.lower() if isinstance(raw_prediction, str) else "unknown"
                    # An explicitly empty classification is counted as "reentrant".
                    if not predicted_category:
                        predicted_category = "reentrant"

                    y_true_for_fold.append(true_category_lower)
                    y_pred_for_fold.append(predicted_category)

            if not y_true_for_fold:
                logger.warning(f"  No data collected in fold {fold_num}. Metrics for this fold will be NaN.")
                continue

            # --- Calculate metrics for the current fold ---
            unique_labels_in_fold = sorted(list(set(y_true_for_fold) | set(y_pred_for_fold)))

            # Overall metrics
            acc = accuracy_score(y_true_for_fold, y_pred_for_fold)
            prec_w = precision_score(y_true_for_fold, y_pred_for_fold, labels=unique_labels_in_fold, average='weighted',
                                     zero_division=0)
            rec_w = recall_score(y_true_for_fold, y_pred_for_fold, labels=unique_labels_in_fold, average='weighted',
                                 zero_division=0)
            f1_w = f1_score(y_true_for_fold, y_pred_for_fold, labels=unique_labels_in_fold, average='weighted',
                            zero_division=0)

            logger.info(
                f"  Fold {fold_num} Overall - Acc: {acc:.2%}, Prec: {prec_w:.2%}, Rec: {rec_w:.2%}, F1: {f1_w:.2%}")

            model_fold_metrics['overall']["Accuracy"].append(acc)
            model_fold_metrics['overall']["Precision_weighted"].append(prec_w)
            model_fold_metrics['overall']["Recall_weighted"].append(rec_w)
            model_fold_metrics['overall']["F1 Score_weighted"].append(f1_w)

            # Per-category metrics
            # One-vs-rest score per category. Restricting `labels` to the category with
            # micro averaging yields the same value as average='binary' with
            # pos_label=cat_name on two-label data, but average='binary' raises ValueError
            # as soon as the fold holds a third label (for example "unknown").
            for cat_name in sorted(list(current_fold_true_categories)):
                prec = precision_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                                       average='micro', zero_division=0)
                rec = recall_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                                   average='micro', zero_division=0)
                f1 = f1_score(y_true_for_fold, y_pred_for_fold, labels=[cat_name],
                              average='micro', zero_division=0)
                support = y_true_for_fold.count(cat_name)

                logger.info(f"    Cat '{cat_name}': P: {prec:.2%}, R: {rec:.2%}, F1: {f1:.2%}, Sup: {support}")

                model_fold_metrics[cat_name]["Precision"].append(prec)
                model_fold_metrics[cat_name]["Recall"].append(rec)
                model_fold_metrics[cat_name]["F1 Score"].append(f1)
                model_fold_metrics[cat_name]["Support"].append(support)

        # --- 3. Aggregate results for the current model and data type ---
        logger.info(f"\n  --- Aggregated Results for {model_name} on {data_type} (across {num_folds} folds) ---")
        model_summary = {}

        # Ensure all expected categories ('safe', 'reentrant') are processed, even if absent in some folds
        expected_categories_for_model = sorted(list(all_true_categories_for_model | {"safe", "reentrant"}))

        for key_to_summarize in ['overall'] + expected_categories_for_model:
            summary_metrics = {}
            is_overall = (key_to_summarize == 'overall')

            metric_keys = overall_metric_keys if is_overall else category_metric_keys
            logger.info(f"    {key_to_summarize.capitalize()}:")

            for metric_name in metric_keys:
                values = model_fold_metrics[key_to_summarize].get(metric_name, [])

                # Pad with NaN/0 if a fold was skipped or a category was missing
                while len(values) < num_folds:
                    values.append(np.nan if metric_name != "Support" else 0)

                # NaN marks a fold without a usable value; mean/std use the remaining folds only.
                valid_values = [v for v in values if not np.isnan(v)]
                num_valid_folds = len(valid_values)

                if num_valid_folds > 0:
                    mean_val = np.mean(valid_values)
                    # np.std default: population standard deviation (ddof=0).
                    std_dev = np.std(valid_values) if num_valid_folds > 1 else 0.0
                    summary_metrics[f"{metric_name}_mean"] = mean_val
                    summary_metrics[f"{metric_name}_std"] = std_dev

                    log_msg = f"      {metric_name}: {mean_val:.2%}"
                    if metric_name != "Support":
                        log_msg += f" ± {std_dev:.2%}"
                    else:  # Support doesn't need percentage
                        log_msg = f"      {metric_name}: {mean_val:.2f} ± {std_dev:.2f}"

                    logger.info(f"{log_msg} (from {num_valid_folds}/{num_folds} folds)")
                else:
                    summary_metrics[f"{metric_name}_mean"] = np.nan
                    summary_metrics[f"{metric_name}_std"] = np.nan
                    logger.info(f"      {metric_name}: N/A (No valid data across folds)")

            model_summary[key_to_summarize] = summary_metrics

        # Store the complete summary for this model-datatype combo
        all_results[f"{model_name}__{data_type}"] = model_summary

# --- 4. Print Final Summary Table ---
# Log the overall mean ± std for every "<model>__<data type>" entry of all_results.
logger.info("\n\n" + "=" * 80)
logger.info("FINAL CROSS-VALIDATION SUMMARY (Overall Weighted Metrics & Accuracy)")
logger.info("=" * 80)

for result_key, summary_data in all_results.items():
    logger.info(f"\n--- Results for: {result_key} ---")

    # Entries without a (non-empty) 'overall' summary are only mentioned in the log.
    if not summary_data.get('overall'):
        logger.info(f"  No overall metrics summary available for {result_key}.")
        continue

    overall_metrics = summary_data['overall']
    logger.info("  Overall Performance (Accuracy & Weighted Averages):")
    logger.info(
        f"    Accuracy:             {overall_metrics.get('Accuracy_mean', np.nan):.2%} ± {overall_metrics.get('Accuracy_std', np.nan):.2%}")
    logger.info(
        f"    Precision (weighted): {overall_metrics.get('Precision_weighted_mean', np.nan):.2%} ± {overall_metrics.get('Precision_weighted_std', np.nan):.2%}")
    logger.info(
        f"    Recall (weighted):    {overall_metrics.get('Recall_weighted_mean', np.nan):.2%} ± {overall_metrics.get('Recall_weighted_std', np.nan):.2%}")
    logger.info(
        f"    F1 Score (weighted):  {overall_metrics.get('F1 Score_weighted_mean', np.nan):.2%} ± {overall_metrics.get('F1 Score_weighted_std', np.nan):.2%}")

logger.info("\n\nProcessing Complete.")

[32mINFO: Found models: ['gpt-4.1-nano', 'o4-mini', 'gpt-4.1', 'gpt-4o', 'o3-mini', 'gpt-4.1-mini', 'gemini-2.5-flash-preview-05-20'][0m
[32mINFO: Found data types: ['ast_cfg', 'cfg', 'ast'][0m
[32mINFO: 
===== PROCESSING MODEL: gpt-4.1-nano | DATA TYPE: ast_cfg =====[0m
[32mINFO: 
  --- Processing Fold 1 (../explanations/prompting/rag/cv_1/gpt-4.1-nano/ast_cfg) ---[0m
[32mINFO:   Fold 1 Overall - Acc: 73.29%, Prec: 73.73%, Rec: 73.29%, F1: 64.31%[0m
[32mINFO:     Cat 'reentrant': P: 75.00%, R: 7.32%, F1: 13.33%, Sup: 41[0m
[32mINFO:     Cat 'safe': P: 73.24%, R: 99.05%, F1: 84.21%, Sup: 105[0m
[32mINFO: 
  --- Processing Fold 2 (../explanations/prompting/rag/cv_2/gpt-4.1-nano/ast_cfg) ---[0m
[32mINFO:   Fold 2 Overall - Acc: 74.66%, Prec: 81.26%, Rec: 74.66%, F1: 66.14%[0m
[32mINFO:     Cat 'reentrant': P: 100.00%, R: 9.76%, F1: 17.78%, Sup: 41[0m
[32mINFO:     Cat 'safe': P: 73.94%, R: 100.00%, F1: 85.02%, Sup: 105[0m
[32mINFO: 
  --- Processing Fold 3 (../expl

# Effect of Data Type

In [10]:
# Effect of data type: for every monitored metric, draw one grouped bar chart
# with the models on the x-axis and one bar per data type. Bar heights come
# from the "<metric>_mean" column and error bars from the "<metric>_std" column
# of the per-model DataFrames stored in `final_classification_results`.

# --- Validate the notebook state this cell depends on ---
if ('final_classification_results' not in locals()
        or not isinstance(final_classification_results, dict)
        or not final_classification_results):
    logger.error("Error: 'final_classification_results' dictionary not found, is not a dictionary, or is empty.")
elif 'data_types' not in locals() or not data_types:
    logger.error("Error: 'data_types' list not found or is empty.")
elif 'monitored_metrics' not in locals() or not monitored_metrics:
    logger.error("Error: 'monitored_metrics' list not found or is empty.")
else:
    # Keep only result keys that neither contain "_k" nor start with "r"
    # (adjust this filter if needed).
    models = [
        m for m in final_classification_results
        if not ("_k" in m or m.startswith("r"))
    ]

    if not models:
        logger.warning("No models match the filter criteria. No plots will be generated.")
    else:
        logger.info(f"Plotting for models: {models}")
        logger.info(f"Using data types: {data_types}")
        logger.info(f"Plotting metrics: {monitored_metrics}")

        # The data-type column is spelled either "DataType" or "Data Type";
        # detect the spelling from the first selected model's DataFrame.
        first_df = final_classification_results[models[0]]
        data_type_col_name = next(
            (name for name in ("DataType", "Data Type") if name in first_df.columns),
            None,
        )
        if data_type_col_name is None:
            logger.error("Could not find 'DataType' or 'Data Type' column in DataFrames. Cannot proceed with plotting.")
            models = []  # Nothing can be plotted without the data-type column

    # Layout shared by every figure; it does not depend on the metric, so it is
    # computed once. `metrics_to_plot` stays empty when there is nothing to plot.
    metrics_to_plot = []
    if models:
        metrics_to_plot = monitored_metrics
        x = np.arange(len(models))  # One group of bars per model
        num_data_types = len(data_types)
        total_width_per_model = 0.8  # Total width of one model's group of bars
        width_per_bar = total_width_per_model / num_data_types
        colors = sns.color_palette("viridis", num_data_types)  # One color per data type

    # --- One figure per metric ---
    for metric in metrics_to_plot:
        mean_col = f"{metric}_mean"
        std_col = f"{metric}_std"

        # Validate the metric columns BEFORE creating a figure, so that a
        # skipped metric never leaves an empty figure behind.
        first_model_df = final_classification_results[models[0]]
        if mean_col not in first_model_df.columns or std_col not in first_model_df.columns:
            logger.warning(
                f"Metric columns '{mean_col}' or '{std_col}' not found in DataFrame for model '{models[0]}'. Skipping plot for metric '{metric}'.")
            continue

        fig, ax = plt.subplots(figsize=(10, 6))

        for i, (data_type, color) in enumerate(zip(data_types, colors)):
            mean_values = []
            std_devs = []

            # Collect mean / std dev of this data type for every selected model.
            for model in models:
                model_df = final_classification_results[model]
                if data_type_col_name in model_df.columns:
                    matches = model_df.loc[model_df[data_type_col_name] == data_type]
                else:
                    # This model's DataFrame does not use the column name found
                    # on the first model; treat it as "no row" instead of
                    # raising KeyError in the middle of a plot.
                    matches = model_df.iloc[0:0]

                if matches.empty:
                    # NaN height / NaN error: matplotlib draws nothing for this bar.
                    mean_values.append(np.nan)
                    std_devs.append(np.nan)
                    logger.warning(f"DataType '{data_type}' not found for model '{model}'. Plotting NaN.")
                    continue

                record = matches.iloc[0]  # First matching row wins
                mean_val = record.get(mean_col, np.nan)
                std_val = record.get(std_col, np.nan)
                mean_values.append(mean_val)
                # A NaN std dev is drawn as a zero-length error bar.
                std_devs.append(std_val if not np.isnan(std_val) else 0)

            # Left edge of the group + i bars + half a bar = centre of bar i.
            bar_position = x - (total_width_per_model / 2) + (i * width_per_bar) + (width_per_bar / 2)

            ax.bar(bar_position,
                   mean_values,
                   width_per_bar,
                   yerr=std_devs,  # Error bars = collected standard deviations
                   label=data_type,
                   alpha=0.8,
                   color=color,
                   capsize=4)  # Caps make the error bars easier to read

        # --- Formatting the plot ---
        ax.set_ylim([0, 1.05])  # Adjust if metrics aren't in the 0-1 range
        ax.set_xticks(x)  # Ticks centred under each group of bars
        ax.set_xticklabels(models, rotation=45, ha="right")
        ax.set_title(f"Comparison of {metric} Across Models (Mean ± Std Dev)")
        ax.set_xlabel("Model")
        ax.set_ylabel(f"{metric} (Mean)")
        # Legend sits outside the axes so it never covers the bars.
        ax.legend(title="Data Type", loc='upper left', bbox_to_anchor=(1.02, 1))
        ax.grid(axis="y", linestyle="--", alpha=0.6)
        # Reserve the right 15% of the figure for the legend.
        fig.tight_layout(rect=[0, 0, 0.85, 1])  # rect=[left, bottom, right, top]
        plt.show()
        plt.close(fig)  # Release the figure; one is created per metric

[31mERROR: Error: 'final_classification_results' dictionary not found, is not a dictionary, or is empty.[0m


# Models Comparison

In [11]:
# Models comparison: for every monitored metric, draw one grouped bar chart
# with the data types on the x-axis and one bar per model. Each model gets two
# overlaid bars at the same position: a faded, hatched baseline bar (taken from
# `baseline_results[model]['overall']`, repeated for every data type) and a
# solid bar from the model's DataFrame in `final_classification_results`.

# --- Validate the notebook state this cell depends on ---
if ('final_classification_results' not in locals()
        or not isinstance(final_classification_results, dict)
        or not final_classification_results):
    logger.error("Error: 'final_classification_results' dictionary not found, is not a dictionary, or is empty.")
elif 'baseline_results' not in locals() or not isinstance(baseline_results, dict):
    logger.error("Error: 'baseline_results' dictionary not found or is not a dictionary (can be empty).")
elif 'data_types' not in locals() or not data_types:
    logger.error("Error: 'data_types' list not found or is empty.")
elif 'monitored_metrics' not in locals() or not monitored_metrics:
    logger.error("Error: 'monitored_metrics' list not found or is empty.")
else:
    # Keep only result keys that neither contain "_k" nor start with "r"
    # (adjust this filter if needed).
    models = [
        m for m in final_classification_results
        if not ("_k" in m or m.startswith("r"))
    ]

    if not models:
        logger.warning("No models match the filter criteria. No plots will be generated.")
    else:
        logger.info(f"Plotting for models: {models}")
        logger.info(f"Using data types: {data_types}")
        logger.info(f"Plotting metrics: {monitored_metrics}")

        # The data-type column is spelled either "DataType" or "Data Type";
        # detect the spelling from the first selected model's DataFrame.
        first_df = final_classification_results[models[0]]
        data_type_col_name = next(
            (name for name in ("DataType", "Data Type") if name in first_df.columns),
            None,
        )
        if data_type_col_name is None:
            logger.error("Could not find 'DataType' or 'Data Type' column in DataFrames. Cannot proceed.")
            models = []  # Nothing can be plotted without the data-type column

    # Layout shared by every figure; it does not depend on the metric, so it is
    # computed once. `metrics_to_plot` stays empty when there is nothing to plot
    # (which also keeps the bar-width division away from a zero model count).
    metrics_to_plot = []
    if models:
        metrics_to_plot = monitored_metrics
        x = np.arange(len(data_types))  # One group of bars per data type
        num_models = len(models)
        total_width_per_group = 0.8  # Total width of one data type's group of bars
        width_per_bar = total_width_per_group / num_models
        # One color per model, shared by its baseline bar and its main bar.
        model_colors = dict(zip(models, sns.color_palette("viridis", num_models)))
        # Left edge of the group + i bars + half a bar = centre of model i's bar.
        bar_positions = {
            model: x - (total_width_per_group / 2) + (i * width_per_bar) + (width_per_bar / 2)
            for i, model in enumerate(models)
        }

    # --- One figure per metric ---
    for metric in metrics_to_plot:
        mean_col = f"{metric}_mean"
        std_col = f"{metric}_std"

        # Validate the metric columns BEFORE creating a figure, so that a
        # skipped metric never leaves an empty figure behind.
        first_model_df = final_classification_results[models[0]]
        if mean_col not in first_model_df.columns or std_col not in first_model_df.columns:
            logger.warning(
                f"Main result columns '{mean_col}' or '{std_col}' not found in DataFrame for model '{models[0]}'. Skipping plot for metric '{metric}'.")
            continue

        fig, ax = plt.subplots(figsize=(10, 6))

        # --- Baseline bars first (faded, hatched, with error bars, no legend label) ---
        for model in models:
            baseline_overall = baseline_results.get(model, {}).get('overall', {})
            baseline_mean = baseline_overall.get(mean_col, np.nan)
            baseline_std = baseline_overall.get(std_col, np.nan)

            # `None` is treated like NaN so a null entry cannot crash np.isnan.
            if baseline_mean is None or np.isnan(baseline_mean):
                logger.warning(
                    f"Baseline mean for model '{model}', metric '{metric}' not found. Skipping baseline bar.")
                continue
            if baseline_std is None or np.isnan(baseline_std):
                logger.warning(
                    f"Baseline std dev for model '{model}', metric '{metric}' not found. Plotting baseline bar with 0 std dev.")
                baseline_std = 0

            # The same baseline value is repeated under every data type.
            ax.bar(bar_positions[model],
                   [baseline_mean] * len(data_types),
                   width_per_bar,
                   yerr=[baseline_std] * len(data_types),
                   alpha=0.4,  # Faded compared to the main bars
                   color=model_colors[model],
                   error_kw=dict(ecolor='gray'),
                   capsize=3,  # Slightly smaller caps than the main bars
                   hatch='//')  # Hatching distinguishes baseline from main bars

        # --- Main bars second (solid, labelled, drawn on top of the baseline) ---
        for model in models:
            model_df = final_classification_results[model]
            has_data_type_col = data_type_col_name in model_df.columns
            mean_values = []
            std_devs = []

            # Collect mean / std dev of this model for every data type.
            for dt in data_types:
                if has_data_type_col:
                    matches = model_df.loc[model_df[data_type_col_name] == dt]
                else:
                    # This model's DataFrame does not use the column name found
                    # on the first model; treat it as "no row" instead of
                    # raising KeyError in the middle of a plot.
                    matches = model_df.iloc[0:0]

                if matches.empty:
                    # NaN height / NaN error: matplotlib draws nothing for this bar.
                    mean_values.append(np.nan)
                    std_devs.append(np.nan)
                    logger.warning(f"Model '{model}': DataType '{dt}' row not found. Plotting NaN.")
                    continue

                record = matches.iloc[0]  # First matching row wins
                mean_val = record.get(mean_col, np.nan)
                std_val = record.get(std_col, np.nan)
                mean_values.append(mean_val)
                # A NaN std dev is drawn as a zero-length error bar.
                std_devs.append(std_val if not np.isnan(std_val) else 0)

            ax.bar(bar_positions[model],  # Same position as this model's baseline bar
                   mean_values,
                   width_per_bar,
                   yerr=std_devs,
                   label=model,  # Only main bars appear in the legend
                   alpha=0.9,
                   color=model_colors[model],
                   capsize=5)  # Larger caps than the baseline bars

        # --- Formatting the plot ---
        ax.set_ylim([0, 1.05])
        ax.set_xticks(x)  # Ticks centred under each data type's group of bars
        ax.set_xticklabels(data_types)
        ax.set_title(f"Comparison of {metric} (Mean±SD): Model (Solid) vs Baseline (Hatched/Faded)")
        ax.set_xlabel("Data Type")
        ax.set_ylabel(f"{metric} (Mean)")
        # Legend (model names only) sits outside the axes so it never covers the bars.
        ax.legend(title="Model", loc='upper left', bbox_to_anchor=(1.02, 1))
        ax.grid(axis="y", linestyle="--", alpha=0.6)
        # Reserve the right 15% of the figure for the legend.
        fig.tight_layout(rect=[0, 0, 0.85, 1])  # rect=[left, bottom, right, top]
        plt.show()
        plt.close(fig)  # Release the figure; one is created per metric

[31mERROR: Error: 'final_classification_results' dictionary not found, is not a dictionary, or is empty.[0m
