# MuCoCo RQ1 Experiment Results Aggregation

This notebook aggregates the results of the MuCoCo RQ1 experiments. The results are stored in `MuCoCo_results/MuCoCo_experiment_results/` under the project root folder. The final aggregated results produced by this notebook are used in Tables VI (aggregated across models), VII (aggregated across tasks) and VIII (aggregated across benchmarks).

In [None]:
import os
import sys
import pandas as pd
from typing import Tuple, Dict

In [None]:
# Put the directory two levels above the current working directory on
# sys.path so that the `utility` package imported in the next cell resolves.
# NOTE(review): this assumes the notebook is started from its own folder,
# two levels below the project root — confirm before running elsewhere.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(parent_dir)
sys.path.append(proj_dir)

In [None]:
from utility.data_log_functions import DataLogHelper

In [None]:
def standardize_two_df(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict two frames to their shared ``task_id`` values and align them.

    Both results keep only rows whose ``task_id`` occurs in both inputs, keep
    the first row per ``task_id``, are sorted by ``task_id`` and get a fresh
    0..n-1 index. When no ``task_id`` is shared, a warning is printed and two
    empty frames (columns preserved) are returned.
    """
    shared_ids = set(df1["task_id"]).intersection(set(df2["task_id"]))
    if len(shared_ids) == 0:
        print("⚠️ No matching task_ids found between the two DataFrames.")
        return df1.iloc[0:0], df2.iloc[0:0]  # return empty aligned frames

    def _align(frame: pd.DataFrame) -> pd.DataFrame:
        # filter -> de-duplicate -> sort -> reindex, in that order
        kept = frame[frame["task_id"].isin(shared_ids)].copy()
        kept = kept.drop_duplicates(subset=["task_id"], keep="first")
        return kept.sort_values("task_id").reset_index(drop=True)

    return _align(df1), _align(df2)

In [None]:
def compare_multiple_code_generation_logs(res_dir: str, filter: Tuple[str, ...] = (), anti_filter: Tuple[str, ...] = ()):
    """Build a pairwise inconsistency matrix for the CSV logs in ``res_dir``.

    Args:
        res_dir: Directory that holds the ``.csv`` result logs.
        filter: Substrings that must ALL occur in a file name for the log to
            be included. ``None`` or an empty tuple disables this check.
            (The name shadows the builtin but is kept for existing callers.)
        anti_filter: Substrings of which NONE may occur in a file name.
            ``None`` or an empty tuple disables this check.

    Returns:
        A square DataFrame whose rows and columns are the log names without
        the ``.csv`` suffix. Cell ``[a, b]`` holds the inconsistency value
        reported for log ``a`` when it is compared with log ``b``; cells that
        are never filled (including the diagonal) stay NaN.
    """
    required = () if filter is None else tuple(filter)
    excluded = () if anti_filter is None else tuple(anti_filter)

    # anti_filter is applied even when filter is None; os.listdir returns
    # entries in arbitrary order, so sort for a reproducible matrix layout.
    csv_logs = sorted(
        f for f in os.listdir(res_dir)
        if f.endswith(".csv")
        and os.path.isfile(os.path.join(res_dir, f))
        and all(sub in f for sub in required)
        and all(sub not in f for sub in excluded)
    )

    # Strip only the trailing extension, not every ".csv" inside the name.
    log_file_names = [csv_file_name.removesuffix(".csv") for csv_file_name in csv_logs]

    # A frame built from index/columns only is already NaN-filled, so the
    # diagonal needs no explicit initialisation.
    results_df = pd.DataFrame(columns=log_file_names, index=log_file_names)

    pending = list(csv_logs)
    while pending:
        log1_file_name = pending.pop()
        if not pending:
            break
        log1_name = log1_file_name.removesuffix(".csv")
        # Read the first log once per outer iteration instead of once per pair.
        log1_raw = pd.read_csv(os.path.join(res_dir, log1_file_name))

        for log2_file_name in pending:
            log2_name = log2_file_name.removesuffix(".csv")
            log2_raw = pd.read_csv(os.path.join(res_dir, log2_file_name))

            log1, log2 = standardize_two_df(log1_raw, log2_raw)
            comparison = DataLogHelper.compare_code_generation_dataframe_results(log1=log1, log2=log2)

            # compare_logs_against_no_mutation in this notebook reads the
            # helper's result as a mapping with 'log1_inconsistencies' /
            # 'log2_inconsistencies'; accept that form as well as the
            # two-value form this function originally unpacked.
            if isinstance(comparison, dict):
                log1_inconsistencies = comparison["log1_inconsistencies"]
                log2_inconsistencies = comparison["log2_inconsistencies"]
            else:
                log1_inconsistencies, log2_inconsistencies = comparison

            results_df.loc[log1_name, log2_name] = log1_inconsistencies
            results_df.loc[log2_name, log1_name] = log2_inconsistencies

    return results_df

In [None]:
def clean_up_csv_name(file_name: str)-> str:
    """Turn a log file stem into a display label for its mutation.

    Only the text after the last ``"shot_"`` is used (the whole name when
    ``"shot_"`` is absent). Names containing underscores become title-cased
    words separated by spaces; single-word names are capitalized.
    """
    label = file_name.rpartition("shot_")[2]
    if "_" not in label:
        return label.capitalize()
    return label.replace("_", " ").title()

In [None]:
def obtain_category(log_name:str) -> str | None:
    mutation_categories = {
        "Lexical": [
            "literal_format",
            "random",
            "sequential"
        ],
        "Syntactic": [
            "for2while",
            "for2enumerate"
        ],
        "Logical": [
            "boolean_literal",
            "constant_unfold",
            "constant_unfold_add",
            "constant_unfold_mult",
            "demorgan",
            "commutative_reorder"
        ]
    }

    for cat, mut in mutation_categories.items():
        for m in mut:
            if m in log_name:
                return cat

    return None

In [None]:
def compare_logs_against_no_mutation(res_dir: str, filter: Tuple[str] = (), anti_filter: Tuple[str] = ()):
    
    if filter is None:
        csv_logs = [f for f in os.listdir(res_dir) if (os.path.isfile(os.path.join(res_dir, f)) and f.endswith(".csv"))]
    else:
        csv_logs = [f for f in os.listdir(res_dir) if (
            os.path.isfile(os.path.join(res_dir, f)) and 
            f.endswith(".csv") and 
            all(sub in f for sub in filter)) and
            all(sub not in f for sub in anti_filter)
            ]

    csv_logs.sort()
    target_log_name = [l for l in csv_logs if "no_mutation" in l][-1]
    csv_logs.pop(csv_logs.index(target_log_name))
    target_log_path = os.path.join(res_dir, target_log_name)
    target_log = pd.read_csv(target_log_path)

    results_df = pd.DataFrame()

    total_inconsistencies = 0
    total_questions = 0
    total_success = 0
    total_answered = 0

    category_dict = {}
    mutation_dict = {}

    for log_name in csv_logs:
        # print(log_name)

        log_category = obtain_category(log_name)
                
        log2_file_path = os.path.join(res_dir, log_name)
        log2 = pd.read_csv(log2_file_path) 

        target_log, log2 = standardize_two_df(target_log, log2)

        inconsistency_dict = DataLogHelper.compare_code_generation_dataframe_results(log1=target_log, log2=log2)

        # Adding results into the dataframe
        cleaned_mutation_name = clean_up_csv_name(log_name.replace('.csv', ''))
        results_df.loc[cleaned_mutation_name, "Inconsistency Score"] = f"{inconsistency_dict['log1_inconsistencies'] + inconsistency_dict['log2_inconsistencies']}/{inconsistency_dict['total_inconsistency_questions']} ({round((inconsistency_dict['log1_inconsistencies'] + inconsistency_dict['log2_inconsistencies'])*100/inconsistency_dict['total_inconsistency_questions'], 2)}%)"
        results_df.loc['No Mutation', "Inconsistency Score"] = "N/A"
        results_df.loc['No Mutation', "Model Accuracy"] = f"{(inconsistency_dict['log1_success'])}/{inconsistency_dict['log1_total_answered']} ({round((inconsistency_dict['log1_success'])*100/inconsistency_dict['log1_total_answered'], 2)}%)"
        results_df.loc[cleaned_mutation_name, "Model Accuracy"] = f"{(inconsistency_dict['log2_success'])}/{inconsistency_dict['log2_total_answered']} ({round((inconsistency_dict['log2_success'])*100/inconsistency_dict['log2_total_answered'], 2)}%)"

        if total_success == 0:
            total_success += inconsistency_dict['log1_success']
        
        if total_answered == 0:
            total_answered += inconsistency_dict['log1_total_answered']

        if 'model_ensemble' in log_name.lower() or "ensemble" not in log_name.lower() :
            total_inconsistencies += inconsistency_dict['log1_inconsistencies'] + inconsistency_dict['log2_inconsistencies']
            total_questions += inconsistency_dict['total_inconsistency_questions']
            total_success += inconsistency_dict['log2_success']
            total_answered += inconsistency_dict['log2_total_answered']

        
        if log_category:
            d: Dict = category_dict.get(log_category, {})
            d['total_inconsistencies'] = d.get('total_inconsistencies', 0) + inconsistency_dict['log1_inconsistencies'] + inconsistency_dict['log2_inconsistencies']
            d['total_questions'] = d.get('total_questions', 0) + inconsistency_dict['total_inconsistency_questions']
            d['total_success'] = d.get('total_success', 0) + inconsistency_dict['log2_success']
            d['total_answered'] = d.get('total_answered', 0) + inconsistency_dict['log2_total_answered']
            category_dict[log_category] = d

        # adding results in mutation_dict, with the mutation name as key
        mutation_dict[cleaned_mutation_name] = {
            'total_inconsistencies': inconsistency_dict['log1_inconsistencies']+ inconsistency_dict['log2_inconsistencies'],
            'total_questions': inconsistency_dict['total_inconsistency_questions'],
            'total_success': inconsistency_dict['log2_success'],
            'total_answered': inconsistency_dict['log2_total_answered']
        }
    
    results_df = pd.concat([
        results_df[results_df.index.str.lower().str.contains("no mutation")],

        results_df[
            ~results_df.index.str.lower().str.contains("ensemble") &
            ~results_df.index.str.lower().str.contains("no mutation")
        ],

        results_df[results_df.index.str.lower().str.contains("ensemble")]
    ])

    ## Adding aggregated second order results and atomic results
    for key, mut_dict in category_dict.items():
        mut_inconsistencies = mut_dict['total_inconsistencies']
        mut_questions = mut_dict['total_questions']
        mut_success = mut_dict['total_success']
        mut_answered = mut_dict['total_answered']
        results_df.loc[f"{key} Results", "Inconsistency Score"] = f"{mut_inconsistencies}/{mut_questions} ({round(mut_inconsistencies*100/mut_questions, 2)})"
        results_df.loc[f"{key} Results", "Model Accuracy"] = f"{mut_success}/{mut_answered} ({round(mut_success*100/mut_answered, 2)}%)"

        mutation_dict[f"{key} Results"] = mut_dict


    results_df.loc["Aggregated Results", "Inconsistency Score"] = f"{total_inconsistencies}/{total_questions} ({round(total_inconsistencies*100/total_questions, 2)})"
    results_df.loc["Aggregated Results", "Model Accuracy"] = f"{total_success}/{total_answered} ({round(total_success*100/total_answered, 2)}%)"


    return [
        results_df, 
        category_dict, 
        ]

In [None]:
# Maps each model's results sub-folder name (key) to the display name (value)
# that obtain_benchmark_task_csv uses as the column prefix for that model.
# NOTE(review): "deepseek-reasoner" is labelled "(Non-thinking Mode)" — confirm
# that the label matches the endpoint that produced these logs.
model_dict = {
    "Qwen2.5-Coder-14B-Instruct" : "Qwen2.5-Coder-14B-Instruct",
    "gemma-3-12b-it": "Gemma-3-12b-it",
    "deepseek-reasoner": "DeepSeek-V3.2-Exp (Non-thinking Mode)",
    "LLama-3.1-8B": "LLama-3.1-8B",
    "gpt-5" : "GPT-5",
    "gpt-4o": "GPT-4o",
    "codestral-latest": "codestral-2508",
}

# NOTE: this rebinds `proj_dir` to the directory ONE level above the current
# working directory (an earlier cell set it two levels up for sys.path). The
# result folders are looked up relative to this value.
current_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(current_dir, ".."))

def obtain_benchmark_task_csv(benchmark: str, task: str) -> Tuple[pd.DataFrame, Dict[str, dict]]:
    """Collect the per-model comparison tables for one benchmark of one task.

    For every model in ``model_dict`` the logs under
    ``<proj_dir>/MuCoCo_experiment_results/<task>/<model folder>`` whose file
    names contain ``benchmark`` are compared against the no-mutation baseline.
    Models whose folder is missing are reported and skipped.

    Args:
        benchmark: Substring that selects the benchmark's log files.
        task: Task sub-folder name inside ``MuCoCo_experiment_results``.

    Returns:
        ``(final_df, final_dict)``: the per-model tables joined side by side
        (columns prefixed with the model's display name; empty frame when no
        model had results), and a mapping from display name to that model's
        per-category totals.
    """
    model_frames = []
    final_dict = {}

    # Iterating through each model in model_dict
    for folder_name, display_name in model_dict.items():
        res_dir = os.path.join(proj_dir, f"MuCoCo_experiment_results/{task}/{folder_name}")
        try:
            res_df, category_dict = compare_logs_against_no_mutation(res_dir=res_dir, filter=(benchmark, ))
        except FileNotFoundError:
            print(f"{res_dir} does not exist.")
            continue

        # Prefix the columns so the models stay distinguishable after joining.
        model_frames.append(res_df.add_prefix(f"{display_name} "))
        final_dict[display_name] = category_dict

    # One concat at the end instead of re-concatenating inside the loop.
    final_df = pd.concat(model_frames, axis=1) if model_frames else pd.DataFrame()

    return final_df, final_dict


In [None]:
from tqdm import tqdm
import copy


# Task folder name -> benchmarks evaluated for that task. Each benchmark name
# is used as the file-name filter when the task's logs are loaded.
tasks = {
    'mcq_inconsistency': ['CodeMMLU'],
    'input_prediction': ['HumanEval', "CruxEval"],
    'output_prediction': ['HumanEval', "CruxEval"],
    'code_generation': ['BigCodeBench', "HumanEval"],
}

# Accumulators filled by the aggregation loop below in this cell:
task_dict = {}           # task -> category -> summed totals
overall_dict = {}        # model -> category -> summed totals
all_benchmark_dict = {}  # benchmark -> category -> summed totals
dfs = []                 # not used by the code visible in this notebook

def combine_two_dictionaries(d1: dict, d2: dict) -> dict:
    """Merge two dict-of-dict counters into a new dictionary.

    Outer keys found only in one input are deep-copied into the result. For
    outer keys present in both, the inner values of ``d2`` are added to those
    of ``d1`` (a missing inner key counts as 0). Neither input is modified.
    """
    merged = copy.deepcopy(d1)
    for outer_key, counters in d2.items():
        if outer_key in merged:
            bucket = merged[outer_key]
            for name, amount in counters.items():
                bucket[name] = bucket.get(name, 0) + amount
            continue
        merged[outer_key] = copy.deepcopy(counters)
    return merged


# Walk every (task, benchmark) pair and fold the per-model category totals
# into the three accumulators: task_dict, all_benchmark_dict and overall_dict.
for task, benchmarks in tqdm(tasks.items()):
    # Dictionary for storing results to aggregate by task
    task_d = {}

    print(f"Aggregating for {task} logs")
    for benchmark in benchmarks:

        print(f"Working on {benchmark} now...")
        final_df, aggregated_dict = obtain_benchmark_task_csv(benchmark, task)

        # Category totals of this benchmark, summed over non-ensemble models.
        benchmark_dict = {}

        for model, mut_cat_dict in aggregated_dict.items():
            if "ensemble" in model:
                continue

            # combine_two_dictionaries returns a fresh dict, so nothing here
            # aliases the per-model dictionaries.
            benchmark_dict = combine_two_dictionaries(benchmark_dict, mut_cat_dict)
            task_d[task] = combine_two_dictionaries(task_d.get(task, {}), mut_cat_dict)

        # building benchmark dict for aggregating results by benchmark
        # (the same benchmark can appear under several tasks)
        all_benchmark_dict[benchmark] = combine_two_dictionaries(
            all_benchmark_dict.get(benchmark, {}), benchmark_dict
        )

        # building overall dictionary for aggregating results by models.
        # Iterate the models of THIS benchmark: a model missing here no longer
        # raises KeyError, and a model first seen later is no longer dropped.
        for model_name, model_totals in aggregated_dict.items():
            overall_dict[model_name] = combine_two_dictionaries(
                overall_dict.get(model_name, {}), model_totals
            )

    # A task without any model results yields an empty entry instead of a KeyError.
    task_dict[task] = task_d.get(task, {})



# MuCoCo Results Aggregated Across Benchmarks (Table VIII)

In [None]:
import copy

# Table VIII: one row per mutation category, one column pair per benchmark,
# plus a per-benchmark "Aggregated Results" row and per-category totals
# summed over all benchmarks.
benchmark_df = pd.DataFrame()
all_cat_dict = {}


def _benchmark_ratio(numerator, denominator, suffix=""):
    """Format ``numerator/denominator (pct)``; report ``N/A`` instead of dividing by zero."""
    if not denominator:
        return f"{numerator}/{denominator} (N/A)"
    return f"{numerator}/{denominator} ({round(numerator*100/denominator, 2)}{suffix})"


for benchmark, mut_cat_dict in all_benchmark_dict.items():
    benchmark_inconsistencies = 0
    benchmark_questions = 0
    benchmark_success = 0
    benchmark_answered = 0

    for mut_cat, res_dir in mut_cat_dict.items():
        if not res_dir:
            continue

        # Make a defensive copy so we don't mutate shared references
        res_dir = copy.deepcopy(res_dir)

        mut_inconsistencies = res_dir['total_inconsistencies']
        mut_questions = res_dir['total_questions']
        mut_success = res_dir['total_success']
        mut_answered = res_dir['total_answered']

        benchmark_df.loc[mut_cat, f"{benchmark} Inconsistencies"] = _benchmark_ratio(mut_inconsistencies, mut_questions)
        benchmark_df.loc[mut_cat, f"{benchmark} Accuracy"] = _benchmark_ratio(mut_success, mut_answered, "%")

        benchmark_inconsistencies += mut_inconsistencies
        benchmark_questions += mut_questions
        benchmark_success += mut_success
        benchmark_answered += mut_answered

        # Accumulate the per-category totals across benchmarks.
        d = all_cat_dict.get(mut_cat, {})
        if not d:
            all_cat_dict[mut_cat] = copy.deepcopy(res_dir)
        else:
            for key in d:
                d[key] += res_dir[key]
            all_cat_dict[mut_cat] = d

    # A benchmark without any category results has zero denominators here;
    # _benchmark_ratio reports "N/A" rather than raising ZeroDivisionError.
    benchmark_df.loc["Aggregated Results", f"{benchmark} Inconsistencies"] = _benchmark_ratio(benchmark_inconsistencies, benchmark_questions)
    benchmark_df.loc["Aggregated Results", f"{benchmark} Accuracy"] = _benchmark_ratio(benchmark_success, benchmark_answered, "%")

for mut_cat, res_dict in all_cat_dict.items():
    mut_inconsistencies = res_dict['total_inconsistencies']
    mut_questions = res_dict['total_questions']
    mut_success = res_dict['total_success']
    mut_answered = res_dict['total_answered']
    benchmark_df.loc[mut_cat, "Aggregated Mutation Inc."] = _benchmark_ratio(mut_inconsistencies, mut_questions)
    benchmark_df.loc[mut_cat, "Aggregated Mutation Acc."] = _benchmark_ratio(mut_success, mut_answered)


print(benchmark_df.to_string())

# MuCoCo Results Aggregated Across Tasks (Table VII)

In [None]:
import copy

# Table VII: one row per mutation category, one column pair per task, plus a
# per-task "Aggregated Results" row and per-category totals summed over all
# tasks.
task_df = pd.DataFrame()
all_cat_dict = {}


def _task_ratio(numerator, denominator, suffix=""):
    """Format ``numerator/denominator (pct)``; report ``N/A`` instead of dividing by zero."""
    if not denominator:
        return f"{numerator}/{denominator} (N/A)"
    return f"{numerator}/{denominator} ({round(numerator*100/denominator, 2)}{suffix})"


for task, mut_cat_dict in task_dict.items():

    benchmark_inconsistencies = 0
    benchmark_questions = 0
    benchmark_success = 0
    benchmark_answered = 0

    for mut_cat, res_dir in mut_cat_dict.items():
        if not res_dir:
            continue

        # Make a defensive copy so we don't mutate shared references
        res_dir = copy.deepcopy(res_dir)

        mut_inconsistencies = res_dir['total_inconsistencies']
        mut_questions = res_dir['total_questions']
        mut_success = res_dir['total_success']
        mut_answered = res_dir['total_answered']

        task_df.loc[mut_cat, f"{task} Inconsistencies"] = _task_ratio(mut_inconsistencies, mut_questions)
        task_df.loc[mut_cat, f"{task} Accuracy"] = _task_ratio(mut_success, mut_answered, "%")

        benchmark_inconsistencies += mut_inconsistencies
        benchmark_questions += mut_questions
        benchmark_success += mut_success
        benchmark_answered += mut_answered

        # Accumulate the per-category totals across tasks.
        d = all_cat_dict.get(mut_cat, {})
        if not d:
            all_cat_dict[mut_cat] = copy.deepcopy(res_dir)
        else:
            for key in d:
                d[key] += res_dir[key]
            all_cat_dict[mut_cat] = d

    # A task without any category results has zero denominators here;
    # _task_ratio reports "N/A" rather than raising ZeroDivisionError.
    task_df.loc["Aggregated Results", f"{task} Inconsistencies"] = _task_ratio(benchmark_inconsistencies, benchmark_questions)
    task_df.loc["Aggregated Results", f"{task} Accuracy"] = _task_ratio(benchmark_success, benchmark_answered, "%")


for mut_cat, res_dict in all_cat_dict.items():
    mut_inconsistencies = res_dict['total_inconsistencies']
    mut_questions = res_dict['total_questions']
    mut_success = res_dict['total_success']
    mut_answered = res_dict['total_answered']
    task_df.loc[mut_cat, "Aggregated Mutation Inc."] = _task_ratio(mut_inconsistencies, mut_questions)
    task_df.loc[mut_cat, "Aggregated Mutation Acc."] = _task_ratio(mut_success, mut_answered)


print(task_df.to_string())

# MuCoCo Results Aggregated Across Models (Table VI)

In [None]:
import pandas as pd

# Table VI: one row per mutation category with an inconsistency and an
# accuracy column per model, per-category averages over non-ensemble models,
# and a final "All" row that sums every column over the categories.
data = overall_dict
categories = ["Logical", "Syntactic", "Lexical"]
models = list(data.keys())

rows = []

# keep track of total inconsistencies per model
model_inconsistency_totals = {model: 0 for model in models}

# Numeric running totals per output column for the "All" row. They are kept
# as numbers instead of being parsed back out of the formatted cell strings,
# so no value can be silently dropped by a failed parse.
sums = {}

# Stand-in totals for a model that has no entry for a category.
_NO_CATEGORY_DATA = {
    "total_inconsistencies": 0,
    "total_questions": 0,
    "total_success": 0,
    "total_answered": 0,
}


def _add_to_sums(col, num, den):
    """Add one numerator/denominator pair to the running totals of ``col``."""
    bucket = sums.setdefault(col, {"num": 0, "den": 0})
    bucket["num"] += num
    bucket["den"] += den


for cat in categories:
    row = {"Category": cat}
    cat_inconsistency = 0
    cat_total_questions = 0
    cat_total_success = 0
    cat_total_answered = 0

    for model in models:
        vals = data[model].get(cat) or _NO_CATEGORY_DATA

        inc_num, inc_den = vals["total_inconsistencies"], vals["total_questions"]
        acc_num, acc_den = vals["total_success"], vals["total_answered"]

        # string representation for reporting (0.0 when there is nothing to divide by)
        inc_pct = round(inc_num / inc_den * 100, 2) if inc_den else 0.0
        acc_pct = round(acc_num / acc_den * 100, 2) if acc_den else 0.0

        row[f"{model} Inconsistency"] = f'{inc_num}/{inc_den} = {inc_pct}'
        row[f"{model} Accuracy"] = f'{acc_num}/{acc_den} = {acc_pct}'
        _add_to_sums(f"{model} Inconsistency", inc_num, inc_den)
        _add_to_sums(f"{model} Accuracy", acc_num, acc_den)

        # accumulate for averages
        if "ensemble" not in model:
            cat_inconsistency += inc_num
            cat_total_questions += inc_den
            cat_total_success += acc_num
            cat_total_answered += acc_den

        # accumulate for global weightage
        model_inconsistency_totals[model] += inc_num

    # per-category average
    avg_inc_pct = round(cat_inconsistency*100 / cat_total_questions, 2) if cat_total_questions else 0.0
    avg_acc_pct = round(cat_total_success*100 / cat_total_answered, 2) if cat_total_answered else 0.0
    row["Average Inconsistency"] = f"{cat_inconsistency}/{cat_total_questions} = {avg_inc_pct}"
    row["Average Accuracy"] = f"{cat_total_success}/{cat_total_answered} = {avg_acc_pct}"
    _add_to_sums("Average Inconsistency", cat_inconsistency, cat_total_questions)
    _add_to_sums("Average Accuracy", cat_total_success, cat_total_answered)
    rows.append(row)

avg_row = {"Category": "All"}

for col, vals in sums.items():
    num, den = vals["num"], vals["den"]
    if den > 0:
        avg_row[col] = f"{num}/{den} = {round(num * 100 / den, 2)}"
    else:
        avg_row[col] = "0/0 = 0.0"

rows.append(avg_row)

df = pd.DataFrame(rows)
print(df.to_string())


# Formulation of the Model Weights Used for the Weighted Model Ensemble

In [None]:

import numpy as np
# Ensemble entries get no weight of their own.
valid_models = [name for name in models if "ensemble" not in name]

# Total inconsistency count of each remaining model, in `valid_models` order.
raw = np.array(
    [model_inconsistency_totals[name] for name in valid_models],
    dtype=float,
)

# Inverse of each total, then normalised by the sum of the inverses, so a
# model with fewer inconsistencies receives a larger weight.
inv = np.divide(1, raw)
weights = inv / inv.sum()

weight_table = pd.DataFrame({
    "Model": valid_models,
    "Inverse Weight": np.round(weights, 6)
})
df_weights = weight_table.sort_values(by="Inverse Weight", ascending=False).reset_index(drop=True)

print("Sum of weights:", np.sum(df_weights["Inverse Weight"]))
print(df_weights.to_string())