# MuCoCo RQ4 Second Order Results Aggregation

This notebook aggregates the results of the MuCoCo second-order experiments. The experiment results are stored in `MuCoCo_results/MuCoCo_experiment_results/second_order`, relative to the project directory. The aggregated results are reported in Table XII (MUCOCO’s Scalability to multiple (2) mutations vs. atomic mutations).

In [None]:
import os
import sys
import pandas as pd
from typing import Tuple, Dict

In [None]:
# Put the directory two levels above the current working directory on the
# import path so that the `utility` package import below can be resolved.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(parent_dir)
# Re-running this cell must not keep stacking duplicate entries on sys.path.
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

In [None]:
from utility.data_log_functions import DataLogHelper

In [None]:
def standardize_two_df(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict two frames to their shared ``task_id`` values and align them.

    Each returned frame keeps only rows whose ``task_id`` occurs in both
    inputs, keeps the first row per ``task_id``, is sorted by ``task_id`` and
    has a fresh 0..n-1 index. When the inputs share no ``task_id`` a warning
    is printed and two empty frames (original columns preserved) are returned.
    """
    shared_ids = set(df1["task_id"]).intersection(set(df2["task_id"]))
    if len(shared_ids) == 0:
        print("⚠️ No matching task_ids found between the two DataFrames.")
        return df1.iloc[0:0], df2.iloc[0:0]  # return empty aligned frames

    aligned = []
    for frame in (df1, df2):
        # Same pipeline for both sides: filter -> de-duplicate -> sort -> reindex.
        kept = frame[frame["task_id"].isin(shared_ids)].copy()
        kept = kept.drop_duplicates(subset=["task_id"], keep="first")
        kept = kept.sort_values("task_id").reset_index(drop=True)
        aligned.append(kept)

    return aligned[0], aligned[1]

In [None]:
def compare_multiple_code_generation_logs(res_dir: str, filter: Tuple[str] = (), anti_filter: Tuple[str] = ()):
    """Build a pairwise inconsistency matrix over the CSV logs in ``res_dir``.

    Args:
        res_dir: Directory containing the ``.csv`` log files.
        filter: Substrings that must all occur in a file name for the file to
            be used. ``None`` or an empty tuple means no constraint.
        anti_filter: Substrings that must not occur in a used file name. It is
            honoured even when ``filter`` is ``None``.

    Returns:
        A square DataFrame indexed (rows and columns) by file name without
        ``.csv``. Cell ``[a, b]`` holds the inconsistency count reported for
        log ``a`` when compared with log ``b``; the diagonal is NaN.

    Raises:
        FileNotFoundError: If ``res_dir`` does not exist (from ``os.listdir``).
    """
    required = filter if filter is not None else ()
    excluded = anti_filter if anti_filter is not None else ()

    csv_logs = [
        f for f in os.listdir(res_dir)
        if os.path.isfile(os.path.join(res_dir, f))
        and f.endswith(".csv")
        and all(sub in f for sub in required)
        and all(sub not in f for sub in excluded)
    ]

    log_file_names = [csv_file_name.replace('.csv', '') for csv_file_name in csv_logs]

    results_df = pd.DataFrame(columns=log_file_names, index=log_file_names)
    for file_name in log_file_names:
        results_df.loc[file_name, file_name] = float('nan')

    # Pop one log at a time and compare it with every log still in the list,
    # so each unordered pair is visited exactly once.
    while len(csv_logs) > 0:
        log1_file_name = csv_logs.pop()
        if not csv_logs:
            break  # nothing left to pair the last log with

        # Read the left-hand log once per outer iteration, not once per pair.
        log1_raw = pd.read_csv(os.path.join(res_dir, log1_file_name))
        log1_label = log1_file_name.replace('.csv', '')

        for log2_file_name in csv_logs:
            log2_raw = pd.read_csv(os.path.join(res_dir, log2_file_name))
            print(log1_file_name, log2_file_name)
            log1, log2 = standardize_two_df(log1_raw, log2_raw)
            outcome = DataLogHelper.compare_code_generation_dataframe_results(log1=log1, log2=log2)

            # The same helper's result is indexed by key elsewhere in this
            # notebook ('log1_inconsistencies', ...), so a mapping is read by
            # key here; a plain 2-tuple result is still accepted.
            if isinstance(outcome, dict):
                log1_inconsistencies = outcome['log1_inconsistencies']
                log2_inconsistencies = outcome['log2_inconsistencies']
            else:
                log1_inconsistencies, log2_inconsistencies = outcome

            log2_label = log2_file_name.replace('.csv', '')
            results_df.loc[log1_label, log2_label] = log1_inconsistencies
            results_df.loc[log2_label, log1_label] = log2_inconsistencies

    return results_df

In [None]:
def clean_up_csv_name(file_name: str)-> str:
    """Turn a log file stem into a display label for its mutation.

    Only the text after the last ``"shot_"`` is used (the whole name when the
    marker is absent). A name containing underscores has them replaced by
    spaces and is passed through ``str.title``; a name without underscores is
    passed through ``str.capitalize``.
    """
    _, _, mutation_type = file_name.rpartition("shot_")
    if "_" not in mutation_type:
        return mutation_type.capitalize()
    spaced = " ".join(mutation_type.split("_"))
    return spaced.title()

In [None]:
def obtain_category(log_name:str) -> str | None:

    second_order_mutation = {
        "Second Order": [
            "for2while_random",
            "for2while_constant_unfold",
            "constant_unfold_random"
        ]    
    }

    for cat, mut in second_order_mutation.items():
        for m in mut:
            if m in log_name:
                return cat
        
    else:
        return "Atomic"


In [None]:
def compare_logs_against_no_mutation(res_dir: str, filter: Tuple[str] = (), anti_filter: Tuple[str] = ()):
    """Compare every mutated log in ``res_dir`` with the ``no_mutation`` baseline.

    The baseline is the last file (in sorted order) whose name contains
    ``"no_mutation"``; every other selected ``.csv`` file is compared with it
    through ``DataLogHelper.compare_code_generation_dataframe_results``.

    Args:
        res_dir: Directory containing the ``.csv`` log files.
        filter: Substrings that must all occur in a file name for the file to
            be used. ``None`` or an empty tuple means no constraint.
        anti_filter: Substrings that must not occur in a used file name. It is
            honoured even when ``filter`` is ``None``.

    Returns:
        A two-element list ``[results_df, category_dict]``.
        ``results_df`` has the string columns "Inconsistency Score" and
        "Model Accuracy", with one row per mutated log, a "No Mutation" row,
        one "<category> Results" row per category and an "Aggregated Results"
        row. ``category_dict`` maps each category name from
        ``obtain_category`` to its summed ``total_inconsistencies``,
        ``total_questions``, ``total_success`` and ``total_answered``.

    Raises:
        FileNotFoundError: If ``res_dir`` does not exist (from ``os.listdir``).
        IndexError: If no selected file name contains ``"no_mutation"``.
    """

    def _format_ratio(numerator, denominator) -> str:
        # Single format for every table cell: "n/d (p%)". A zero denominator
        # yields "(N/A)" instead of raising ZeroDivisionError.
        if denominator == 0:
            return f"{numerator}/{denominator} (N/A)"
        return f"{numerator}/{denominator} ({round(numerator*100/denominator, 2)}%)"

    required = filter if filter is not None else ()
    excluded = anti_filter if anti_filter is not None else ()

    csv_logs = sorted(
        f for f in os.listdir(res_dir)
        if os.path.isfile(os.path.join(res_dir, f))
        and f.endswith(".csv")
        and all(sub in f for sub in required)
        and all(sub not in f for sub in excluded)
    )

    baseline_candidates = [l for l in csv_logs if "no_mutation" in l]
    if not baseline_candidates:
        raise IndexError(f"No 'no_mutation' baseline log found in {res_dir}")
    target_log_name = baseline_candidates[-1]
    csv_logs.remove(target_log_name)
    target_log = pd.read_csv(os.path.join(res_dir, target_log_name))

    results_df = pd.DataFrame()

    total_inconsistencies = 0
    total_questions = 0
    total_success = 0
    total_answered = 0

    category_dict = {}

    for log_name in csv_logs:
        log_category = obtain_category(log_name)

        log2 = pd.read_csv(os.path.join(res_dir, log_name))
        inconsistency_dict = DataLogHelper.compare_code_generation_dataframe_results(log1=target_log, log2=log2)

        # Pull the counts out once; log1 is the baseline, log2 the mutated run.
        inconsistencies = inconsistency_dict['log1_inconsistencies'] + inconsistency_dict['log2_inconsistencies']
        questions = inconsistency_dict['total_inconsistency_questions']
        baseline_success = inconsistency_dict['log1_success']
        baseline_answered = inconsistency_dict['log1_total_answered']
        mutated_success = inconsistency_dict['log2_success']
        mutated_answered = inconsistency_dict['log2_total_answered']

        # Adding results into the dataframe
        cleaned_mutation_name = clean_up_csv_name(log_name.replace('.csv', ''))
        results_df.loc[cleaned_mutation_name, "Inconsistency Score"] = _format_ratio(inconsistencies, questions)

        # The baseline row is written once, from the first comparison.
        if 'No Mutation' not in results_df.index:
            results_df.loc['No Mutation', "Inconsistency Score"] = "N/A"
            results_df.loc['No Mutation', "Model Accuracy"] = _format_ratio(baseline_success, baseline_answered)
        results_df.loc[cleaned_mutation_name, "Model Accuracy"] = _format_ratio(mutated_success, mutated_answered)

        # Seed the overall accuracy totals with the baseline's own counts
        # while the running totals are still zero.
        if total_success == 0:
            total_success += baseline_success

        if total_answered == 0:
            total_answered += baseline_answered

        # A log whose name contains "ensemble" only feeds the overall totals
        # when the name also contains "model_ensemble".
        if 'model_ensemble' in log_name.lower() or "ensemble" not in log_name.lower():
            total_inconsistencies += inconsistencies
            total_questions += questions
            total_success += mutated_success
            total_answered += mutated_answered

        if log_category:
            d: Dict = category_dict.setdefault(log_category, {})
            d['total_inconsistencies'] = d.get('total_inconsistencies', 0) + inconsistencies
            d['total_questions'] = d.get('total_questions', 0) + questions
            d['total_success'] = d.get('total_success', 0) + mutated_success
            d['total_answered'] = d.get('total_answered', 0) + mutated_answered

    # Row order: baseline first, then ordinary mutations, then ensemble rows.
    results_df = pd.concat([
        results_df[results_df.index.str.lower().str.contains("no mutation")],

        results_df[
            ~results_df.index.str.lower().str.contains("ensemble") &
            ~results_df.index.str.lower().str.contains("no mutation")
        ],

        results_df[results_df.index.str.lower().str.contains("ensemble")]
    ])

    ## Adding aggregated second order results and atomic results
    for key, mut_dict in category_dict.items():
        results_df.loc[f"{key} Results", "Inconsistency Score"] = _format_ratio(
            mut_dict['total_inconsistencies'], mut_dict['total_questions']
        )
        results_df.loc[f"{key} Results", "Model Accuracy"] = _format_ratio(
            mut_dict['total_success'], mut_dict['total_answered']
        )

    results_df.loc["Aggregated Results", "Inconsistency Score"] = _format_ratio(total_inconsistencies, total_questions)
    results_df.loc["Aggregated Results", "Model Accuracy"] = _format_ratio(total_success, total_answered)

    return [
        results_df,
        category_dict,
    ]

In [None]:
current_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(current_dir, "..",))

def obtain_benchmark_task_csv(benchmark: str, task: str) -> Tuple[pd.DataFrame, dict]:
    """Load the second-order result table of one benchmark for one task.

    Logs are read from ``<proj_dir>/MuCoCo_experiment_results/second_order/<task>/``
    and restricted to file names containing ``benchmark``.

    Args:
        benchmark: Substring a log file name must contain to be used.
        task: Name of the task sub-directory.

    Returns:
        ``(results_df, category_dict)`` as produced by
        ``compare_logs_against_no_mutation``. When the result directory is
        missing, a message is printed and ``(empty DataFrame, {})`` is
        returned so the caller can carry on with the remaining benchmarks.
    """
    res_dir = os.path.join(proj_dir, f"MuCoCo_experiment_results/second_order/{task}/")

    comparison = None
    if os.path.isdir(res_dir):
        try:
            comparison = compare_logs_against_no_mutation(res_dir=res_dir, filter=(benchmark, ))
        except FileNotFoundError:
            # The directory (or a log in it) disappeared between the check and the read.
            comparison = None

    if comparison is None:
        print(f"{res_dir} does not exist.")
        return pd.DataFrame(), {}

    res, category_dict = comparison
    # final_df.to_csv("combined_results.csv", index=True, header=True)
    return pd.DataFrame(res), category_dict


In [None]:
from tqdm import tqdm
import copy


# Experiment layout: task directory name -> benchmarks evaluated for that task.
tasks = {
    "input_prediction": ["HumanEval", "CruxEval"],
    "output_prediction": ["HumanEval", "CruxEval"],
    "mcq_inconsistency": ["CodeMMLU"],
}

# Module-level accumulators; each starts out empty and independent.
task_dict, overall_dict, all_benchmark_dict = {}, {}, {}
dfs = list()

def combine_two_dictionaries(d1: dict, d2: dict) -> dict:
    """Return a new dict holding the key-wise sum of ``d1`` and ``d2``.

    Keys present in only one input are deep-copied into the result. For keys
    present in both, dict values are merged recursively with the same rule and
    any other values are combined with ``+``. Neither input is modified.

    Raises:
        TypeError: If the two values under a shared key cannot be added
            (for example a dict on one side and a number on the other).
    """
    merged = copy.deepcopy(d1)
    for key, incoming in d2.items():
        if key not in merged:
            merged[key] = copy.deepcopy(incoming)
        elif isinstance(merged[key], dict) and isinstance(incoming, dict):
            # Nested counters (e.g. category -> totals) cannot be added with
            # "+", so merge them one level down.
            merged[key] = combine_two_dictionaries(merged[key], incoming)
        else:
            merged[key] = merged[key] + incoming
    return merged

# Wide result table: one inconsistency/accuracy column pair per (task, benchmark).
task_df = pd.DataFrame()

for task, benchmarks in tqdm(tasks.items()):
    # Dictionary for storing results to aggregate by task:
    # task -> mutation category -> summed counts over all of its benchmarks.
    task_d = {task: {}}

    print(f"Aggregating for {task} logs")
    for benchmark in benchmarks:

        print(f"Working on {benchmark} now...")
        final_df, aggregated_dict = obtain_benchmark_task_csv(benchmark, task)

        final_df = final_df.rename(columns={
            "Inconsistency Score": f"{task}_{benchmark}_inconsistency",
            "Model Accuracy": f"{task}_{benchmark}_accuracy"
        })

        if task_df.empty:
            task_df = final_df
        else:
            task_df = pd.concat([task_df, final_df], axis=1)

        # Sum this benchmark's counts into the task totals category by
        # category. Reading the inner loop variable after the loop would only
        # see the last category and fail when aggregated_dict is empty.
        for mut_cat, mut_cat_dict in aggregated_dict.items():
            category_totals = task_d[task].setdefault(mut_cat, {})
            for key, val in mut_cat_dict.items():
                category_totals[key] = category_totals.get(key, 0) + val

    task_dict[task] = task_d[task]



# MuCoCo Results Aggregated Across Tasks

In [None]:

# Row order of the final table: baseline, atomic mutations, second-order
# mutations, then the summary rows.
order = (
    ["No Mutation"]
    + ["Random", "Constant Unfold", "For2while"]
    + ["Constant Unfold Random", "For2While Random", "For2While Constant Unfold"]
    + ["Atomic Results", "Second Order Results", "Aggregated Results"]
)

# Select and order rows by label.
task_df = task_df.reindex(index=order)

print(task_df.to_string())