# MuCoCo RQ3 Experiment Results Aggregation for Model Confidence

This notebook aggregates the results of the MuCoCo RQ3 experiments. The results should be stored in MuCoCo_results/MuCoCo_experiment_results/model_output_confidence in the project root folder. The final aggregated results from this notebook are used in Table XI (Impact of varying model confidence on Inconsistency and Accuracy).

In [None]:
import os
import pandas as pd
import numpy as np
from typing import Tuple

In [None]:
# The notebook resolves every path relative to the parent of the directory
# it is launched from.
current_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

In [None]:
def check_confidence(model_folder_dir: str, model_name: str, task: str):
    """Re-label each result CSV of one model under several confidence thresholds.

    Every ``.csv`` file in ``model_folder_dir`` whose name contains one of the
    benchmark names (and does not contain ``ensemble``) is read. For each
    threshold a ``failure_type_<threshold>`` column is added next to the
    ``task_id``, ``geometric`` and ``failure_type`` columns.

    Args:
        model_folder_dir: Directory holding the raw result CSVs of one model.
        model_name: Name of the sub-folder the output is written to.
        task: Task name, used as the parent folder of ``model_name``.

    Returns:
        None. One ``<original name>_confidence.csv`` per processed file is
        written under
        ``<proj_dir>/MuCoCo_experiment_results/model_output_confidence/<task>/<model_name>``.
    """
    thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
    benchmarks = ["HumanEval", "CruxEval", "CodeMMLU"]
    csv_files = [name for name in os.listdir(model_folder_dir) if name.endswith('.csv')]

    def _is_model_answer(failure):
        # A float cell (presumably the NaN pandas yields for an empty
        # failure_type, i.e. a passing run - confirm against the logs) or an
        # AssertionError not raised by the mutation step counts as an answer.
        if isinstance(failure, float):
            return True
        return isinstance(failure, str) and "AssertionError" in failure and "Mutation" not in failure

    for benchmark in benchmarks:
        for file in csv_files:
            if benchmark not in file or 'ensemble' in file:
                continue

            data = pd.read_csv(os.path.join(model_folder_dir, file))

            # Start from the base columns; the threshold columns are appended.
            benchmark_df = data.loc[:, ['task_id', 'geometric', 'failure_type']].copy()

            for threshold in thresholds:
                # Answers whose confidence does not reach the threshold are
                # replaced by a marker; everything else is kept unchanged.
                # ``not (conf >= threshold)`` also catches a NaN confidence.
                benchmark_df[f'failure_type_{threshold}'] = [
                    "Confidence_lower_than_threshold"
                    if _is_model_answer(failure) and not (conf >= threshold)
                    else failure
                    for conf, failure in zip(data['geometric'], data['failure_type'])
                ]

            # Save as new CSV
            out_path = os.path.join(proj_dir, f"MuCoCo_experiment_results/model_output_confidence/{task}/{model_name}")
            os.makedirs(name=out_path, exist_ok=True)
            out_name = f"{file.split('.csv')[0]}_confidence.csv"
            benchmark_df.to_csv(os.path.join(out_path, out_name), index=False)


In [None]:
# Root folder of all experiment results (raw inputs and derived outputs).
main_res_dir = os.path.join(proj_dir, "MuCoCo_experiment_results/")

# Generate the thresholded CSVs for every model folder whose name mentions
# one of the three model families (case-insensitive match).
for task in ("input_prediction", "mcq_inconsistency"):
    target_dir = os.path.join(main_res_dir, task)
    for file in os.listdir(target_dir):
        lowered_name = file.lower()
        if not any(family in lowered_name for family in ("gemma", "qwen", "llama")):
            continue
        model_folder_path = os.path.join(target_dir, file)
        check_confidence(model_folder_path, file, task)




In [None]:
def standardize_two_df(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Align two frames row-by-row on the ``task_id`` values they share.

    Both results keep only rows whose ``task_id`` occurs in both inputs, keep
    the first row of any duplicated ``task_id``, are sorted by ``task_id`` and
    get a fresh 0..n-1 index, so that row ``i`` of each refers to the same task.

    Args:
        df1: First frame; must contain a ``task_id`` column.
        df2: Second frame; must contain a ``task_id`` column.

    Returns:
        The aligned ``(df1, df2)`` pair. When no ``task_id`` is shared a
        warning is printed and two empty frames (original columns) are returned.
    """
    shared_ids = set(df1["task_id"]).intersection(set(df2["task_id"]))
    if len(shared_ids) == 0:
        print("⚠️ No matching task_ids found between the two DataFrames.")
        return df1.iloc[0:0], df2.iloc[0:0]  # return empty aligned frames

    def _align(frame: pd.DataFrame) -> pd.DataFrame:
        # filter -> de-duplicate -> sort -> renumber; each step returns a new frame
        kept = frame[frame["task_id"].isin(shared_ids)].copy()
        kept = kept.drop_duplicates(subset=["task_id"], keep="first")
        return kept.sort_values("task_id").reset_index(drop=True)

    return _align(df1), _align(df2)

In [None]:
# Folder holding the thresholded ``*_confidence.csv`` files.
conf_dir = os.path.join(main_res_dir, "model_output_confidence")

# "" selects the unthresholded ``failure_type`` column; each number selects
# the matching ``failure_type_<threshold>`` column.
thresholds = ["", 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Per-(model, benchmark, task) result tables: one row per threshold.
res_df, acc_df = pd.DataFrame(), pd.DataFrame()

# Per-model running counters, keyed first by model name and then by threshold.
inconsistency_dict, accuracy_dict = {}, {}

def check_valid_failure(failure: str):
    """Tell whether a ``failure_type`` cell counts as an answer from the model.

    Returns True for any float value, and for a string that contains
    ``"AssertionError"`` but not ``"Mutation"``. Every other value (other
    strings, other types) returns False.
    """
    if isinstance(failure, float):
        return True
    if not isinstance(failure, str):
        return False
    return "AssertionError" in failure and "Mutation" not in failure

def _bump(counters: dict, threshold, key: str) -> None:
    """Add one to ``counters[threshold][key]``, creating missing entries."""
    bucket = counters.setdefault(threshold, {})
    bucket[key] = bucket.get(key, 0) + 1


def _format_ratio(numerator: int, denominator: int) -> str:
    """Render ``"<num>/<den> = <percentage>"``; the percentage is 0 when ``denominator`` is 0."""
    if denominator == 0:
        return f"{numerator}/{denominator} = 0"
    return f"{numerator}/{denominator} = {round(numerator*100/denominator, 2)}"


# For every task / model / benchmark, compare the un-mutated baseline log with
# each mutated log at every confidence threshold and record:
#   * inconsistency: pairs where exactly one of (baseline, mutated) is correct,
#     out of pairs where at least one of them is correct;
#   * accuracy: correct answers out of answered ones (baseline rows plus every
#     aligned mutated row).
# A float cell is treated as "correct", following check_valid_failure.
# Per-cell strings go to res_df / acc_df; running per-model counters go to
# inconsistency_dict / accuracy_dict.
for task in ['input_prediction', 'mcq_inconsistency']:
    res_dir = os.path.join(conf_dir, task)
    for models in os.listdir(res_dir):
        model_res_path = os.path.join(res_dir, models)
        # Skip stray files such as ".DS_Store"; only model folders hold logs.
        if not os.path.isdir(model_res_path):
            continue

        # Counters accumulate across tasks and benchmarks for the same model.
        model_dict = inconsistency_dict.get(models, {})
        model_acc_dict = accuracy_dict.get(models, {})

        for benchmark in ["HumanEval", "CodeMMLU", "CruxEval"]:
            csv_logs = [log for log in os.listdir(model_res_path) if benchmark in log and log.endswith('.csv')]
            if not csv_logs:
                continue

            baseline_logs = [log for log in csv_logs if "no_mutation" in log]
            if not baseline_logs:
                raise FileNotFoundError(
                    f"No 'no_mutation' baseline CSV for {benchmark} in {model_res_path}"
                )
            no_mut = baseline_logs[0]
            no_mut_data = pd.read_csv(os.path.join(model_res_path, no_mut))

            # Read and align every mutated log once; the aligned frames are
            # reused for all thresholds instead of being re-read each time.
            aligned_pairs = [
                standardize_two_df(no_mut_data, pd.read_csv(os.path.join(model_res_path, mut_log)))
                for mut_log in sorted(csv_logs)
                if mut_log != no_mut
            ]

            column_label = f"{models}_{benchmark}_{task}"

            for threshold in thresholds:
                total_comparisons = 0
                total_inconsistencies = 0

                total_answered = 0
                total_correct = 0

                col_name = f"failure_type_{threshold}" if threshold != "" else "failure_type"

                # Accuracy contribution of the baseline run.
                for no_mut_failure in no_mut_data.loc[:, col_name]:
                    if not check_valid_failure(no_mut_failure):
                        continue
                    total_answered += 1
                    _bump(model_acc_dict, threshold, 'answered')
                    if isinstance(no_mut_failure, float):
                        total_correct += 1
                        _bump(model_acc_dict, threshold, 'correct')

                for no_mut_temp, mut_temp in aligned_pairs:
                    for no_mut_failure, mut_failure in zip(no_mut_temp.loc[:, col_name], mut_temp.loc[:, col_name]):
                        # Only pairs where both runs produced an answer count.
                        if not (check_valid_failure(no_mut_failure) and check_valid_failure(mut_failure)):
                            continue

                        no_mut_correct = isinstance(no_mut_failure, float)
                        mut_correct = isinstance(mut_failure, float)

                        total_answered += 1
                        _bump(model_acc_dict, threshold, 'answered')
                        if mut_correct:
                            total_correct += 1
                            _bump(model_acc_dict, threshold, 'correct')

                        # Pairs where both runs are wrong are not comparisons.
                        if no_mut_correct or mut_correct:
                            total_comparisons += 1
                            _bump(model_dict, threshold, 'comparisons')

                        # Exactly one of the two runs is correct.
                        if no_mut_correct != mut_correct:
                            total_inconsistencies += 1
                            _bump(model_dict, threshold, 'inconsistencies')

                res_df.loc[f"{threshold}", column_label] = _format_ratio(total_inconsistencies, total_comparisons)
                # The accuracy fallback must land in acc_df, not res_df.
                acc_df.loc[f"{threshold}", column_label] = _format_ratio(total_correct, total_answered)

        inconsistency_dict[models] = model_dict
        accuracy_dict[models] = model_acc_dict


# Column order for the final tables: for each model, the two input-prediction
# benchmarks followed by the MCQ benchmark. The labels must match the
# "<model folder>_<benchmark>_<task>" columns written during aggregation.
_model_names = ("gemma-3-12b-it", "Qwen2.5-Coder-14B-Instruct", "LLama-3.1-8B")
_benchmark_tasks = (
    ("HumanEval", "input_prediction"),
    ("CruxEval", "input_prediction"),
    ("CodeMMLU", "mcq_inconsistency"),
)
new_df_index = [
    f"{_model}_{_benchmark}_{_task}"
    for _model in _model_names
    for _benchmark, _task in _benchmark_tasks
]

# Reorder the columns of both tables to the list above.
res_df = res_df.reindex(columns=new_df_index)
acc_df = acc_df.reindex(columns=new_df_index)

In [None]:
from collections import defaultdict

def merge_by_confidence(data):
    """Sum inconsistency counters over all models, grouped by threshold.

    Args:
        data: Mapping ``model -> {threshold -> {"inconsistencies": int,
            "comparisons": int}}``. A counter missing from an entry counts as 0.

    Returns:
        A plain dict ``threshold -> {"inconsistencies": int, "comparisons": int}``.
    """
    totals = defaultdict(lambda: {"inconsistencies": 0, "comparisons": 0})

    for per_threshold in data.values():
        for conf, stats in per_threshold.items():
            bucket = totals[conf]
            for counter in ("inconsistencies", "comparisons"):
                bucket[counter] += stats.get(counter, 0)

    return dict(totals)

# Pool the per-model inconsistency/comparison counters across all models,
# keyed by confidence threshold.
merged_data1 = merge_by_confidence(inconsistency_dict)

## Inconsistency of models (gemma, qwen, llama) at each confidence threshold

In [None]:
# Add an "Aggregated" column holding, for each threshold, the inconsistency
# rate pooled over all models.
for threshold, res_dict in merged_data1.items():
    inconsistencies = res_dict['inconsistencies']
    comparisons = res_dict['comparisons']

    # A threshold with no comparable pair would otherwise raise
    # ZeroDivisionError; report 0 as the per-model columns do.
    rate = round(inconsistencies*100/comparisons, 2) if comparisons else 0
    res_df.loc[f"{threshold}", "Aggregated"] = f"{inconsistencies}/{comparisons} = {rate}"

print(res_df.to_string())

## Accuracy of models (gemma, qwen, llama) at each confidence threshold

In [None]:
def merge_by_confidence(data):
    """Sum accuracy counters over all models, grouped by threshold.

    NOTE: this rebinds the name ``merge_by_confidence`` that an earlier cell
    defines for the inconsistency counters, so the cells must be run in order.

    Args:
        data: Mapping ``model -> {threshold -> {"answered": int,
            "correct": int}}``. A counter missing from an entry counts as 0.

    Returns:
        A plain dict ``threshold -> {"answered": int, "correct": int}``.
    """
    merged = {}

    for conf_dict in data.values():
        for conf, stats in conf_dict.items():
            entry = merged.setdefault(conf, {"answered": 0, "correct": 0})
            entry["answered"] += stats.get("answered", 0)
            entry["correct"] += stats.get("correct", 0)

    return merged


# Pool the per-model answered/correct counters across all models, keyed by
# confidence threshold.
merged_data2 = merge_by_confidence(accuracy_dict)


# Add an "Aggregated" column holding, for each threshold, the accuracy pooled
# over all models.
for threshold, res_dict in merged_data2.items():
    answered = res_dict['answered']
    correct = res_dict['correct']

    # A threshold at which nothing was answered would otherwise raise
    # ZeroDivisionError; report 0 as the per-model columns do.
    accuracy = round(correct*100/answered, 2) if answered else 0
    acc_df.loc[f"{threshold}", "Aggregated"] = f"{correct}/{answered} = {accuracy}"

print(acc_df.to_string())