# MuCoCo RQ2 Experiment Results Aggregation

This notebook aggregates the results of the MuCoCo RQ2 experiments. The raw results are stored in MuCoCo_results/MuCoCo_experiment_results/ under the project root folder. The final aggregated results produced by this notebook are reported in Table X (Effectiveness of MuCoCo vs. Turbulence on the Turbulence dataset).

In [None]:
import os
import sys
import pandas as pd
from typing import Dict, List

In [None]:
# Put the directory two levels above the current working directory on
# sys.path (presumably so project-level packages can be imported from this
# notebook).
curr_dir = os.getcwd()
parent_dir = os.path.split(curr_dir)[0]
proj_dir = os.path.split(parent_dir)[0]
sys.path.append(proj_dir)

In [None]:
from baseline.turbulence_benchmark.utility.turbulence_log_functions import TurbulenceLogHelper

In [None]:
def obtain_aggregated_results(d: List[Dict]) -> Dict[str, int]:
    """Sum the per-log statistics in *d* into a single totals dictionary.

    Each entry of *d* is read with ``dict.get`` so a missing key counts as 0.
    The keys read are ``correct_instances``, ``incorrect_instances``,
    ``total_questions``, ``inconsistency_count``, ``total_comparisons`` and
    ``inconsistent_qn_count``.

    The totals are printed before being returned.

    Returns:
        A dict with the keys ``correct``, ``num_tasks`` (correct + incorrect
        instances), ``num_questions``, ``inconsistencies``,
        ``total_comparisons`` and ``question_inconsistencies``.
    """
    totals = dict.fromkeys(
        (
            "correct",
            "num_tasks",
            "num_questions",
            "inconsistencies",
            "total_comparisons",
            "question_inconsistencies",
        ),
        0,
    )

    for entry in d:
        n_correct = entry.get("correct_instances", 0)
        n_incorrect = entry.get("incorrect_instances", 0)

        totals["correct"] += n_correct
        # A task is any instance that was judged, whether right or wrong.
        totals["num_tasks"] += n_correct + n_incorrect
        totals["num_questions"] += entry.get("total_questions", 0)
        totals["inconsistencies"] += entry.get("inconsistency_count", 0)
        totals["total_comparisons"] += entry.get("total_comparisons", 0)
        totals["question_inconsistencies"] += entry.get("inconsistent_qn_count", 0)

    print(totals)

    return totals


In [None]:
def _select_log(csv_logs, keyword, exclude=()):
    """Return the last name in *csv_logs* containing *keyword* but none of *exclude*."""
    matches = [
        name for name in csv_logs
        if keyword in name and not any(bad in name for bad in exclude)
    ]
    if not matches:
        raise FileNotFoundError(
            f"No Turbulence CSV log containing '{keyword}' was found."
        )
    return matches[-1]


def _format_inconsistency_score(inconsistencies, total_comparisons):
    """Format ``errs/tests (pct%)``, reporting 0.0% when there are no comparisons."""
    if total_comparisons:
        pct = round(inconsistencies * 100 / total_comparisons, 2)
    else:
        # Avoid dividing by zero when a log contains no comparisons.
        pct = 0.0
    return f"{inconsistencies}/{total_comparisons} ({pct}%)"


def generate_results_table(res_dir: str) -> pd.DataFrame:
    """Build the Turbulence-vs-MuCoCo inconsistency table for one results folder.

    The function looks in *res_dir* for ``.csv`` files whose name contains
    "turbulence" (case-insensitive) and picks one log per mutation keyword.
    When several files match a keyword, the last one in sorted filename order
    is used, so the choice is deterministic.

    The ``no_mutation`` log is reported as its own row (labelled with the
    capitalised file name without ``.csv``). All other logs are summed with
    ``obtain_aggregated_results`` and reported in a single "MuCoCo" row.

    If *res_dir* contains "code_generation", only the ``no_mutation``,
    ``random`` and ``sequential`` logs are used; otherwise the individual
    mutation-operator logs are required as well.

    This method assumes that all logs are filled.

    Args:
        res_dir: Directory holding the Turbulence CSV logs.

    Returns:
        A DataFrame with the columns ``#errs``, ``#tests`` and
        ``Code Inconsistency Score`` (all stored as strings).

    Raises:
        FileNotFoundError: If an expected log is not present in *res_dir*.
    """
    # os.listdir returns names in arbitrary order; sort so that "last match"
    # is well defined.
    csv_logs = sorted(
        f for f in os.listdir(res_dir)
        if os.path.isfile(os.path.join(res_dir, f))
        and f.endswith(".csv")
        and "turbulence" in f.lower()
    )

    log_names = [
        _select_log(csv_logs, keyword)
        for keyword in ("no_mutation", "random", "sequential")
    ]

    if "code_generation" not in res_dir:
        log_names += [
            _select_log(csv_logs, keyword)
            for keyword in (
                "for2while",
                "for2enumerate",
                "literal_format",
                "boolean_literal",
                "commutative_reorder",
                "demorgan",
            )
        ]
        # "constant_unfold" is a prefix of the add/mult variants, so those
        # must be excluded when selecting the plain constant-unfold log.
        log_names.append(
            _select_log(
                csv_logs,
                "constant_unfold",
                exclude=("constant_unfold_add", "constant_unfold_mult"),
            )
        )
        log_names.append(_select_log(csv_logs, "constant_unfold_add"))
        log_names.append(_select_log(csv_logs, "constant_unfold_mult"))

    helper = TurbulenceLogHelper()
    res_df = pd.DataFrame()
    total_dict = []

    for log_name in log_names:
        log = pd.read_csv(os.path.join(res_dir, log_name))

        log_dict: Dict = helper.obtain_turbulence_code_inconsistency_score(log)
        log_dict.update(helper.obtain_question_inconsistency_count(log=log))

        if "no_mutation" not in log_name:
            # Mutation logs are pooled into the aggregated "MuCoCo" row.
            total_dict.append(log_dict)
            continue

        # The unmutated baseline gets its own row.
        row = log_name.capitalize().split(".csv")[0]
        errs = log_dict["inconsistency_count"]
        tests = log_dict["total_comparisons"]
        res_df.loc[row, "#errs"] = f"{errs}"
        res_df.loc[row, "#tests"] = f"{tests}"
        res_df.loc[row, "Code Inconsistency Score"] = _format_inconsistency_score(errs, tests)

    title = "MuCoCo"

    dict_df = obtain_aggregated_results(total_dict)
    inconsistencies = dict_df.get("inconsistencies", 0)
    total_comparisons = dict_df.get("total_comparisons", 0)

    # Write to DataFrame
    res_df.loc[title, "#errs"] = f"{inconsistencies}"
    res_df.loc[title, "#tests"] = f"{total_comparisons}"
    res_df.loc[title, "Code Inconsistency Score"] = _format_inconsistency_score(
        inconsistencies, total_comparisons
    )

    return res_df





## Code Generation Turbulence VS MuCoCo

In [None]:
# The results folder is looked up under the parent of the current working
# directory.
current_dir = os.getcwd()
proj_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
res_dir = os.path.join(
    proj_dir,
    "MuCoCO_experiment_results/code_generation/gpt-4o",
)

# Build and print the table for the code-generation logs.
res_df = generate_results_table(res_dir)
print(res_df.to_string())


## Input Prediction Turbulence VS MuCoCo

In [None]:
# Build and print the table for the input-prediction logs; proj_dir must
# already be defined by an earlier cell.
res_dir = os.path.join(
    proj_dir,
    "MuCoCO_experiment_results/input_prediction/gpt-4o",
)
res_df = generate_results_table(res_dir)
print(res_df.to_string())


## Output Prediction Turbulence VS MuCoCo

In [None]:
# Build and print the table for the output-prediction logs; proj_dir must
# already be defined by an earlier cell.
res_dir = os.path.join(
    proj_dir,
    "MuCoCO_experiment_results/output_prediction/gpt-4o",
)
res_df = generate_results_table(res_dir)
print(res_df.to_string())
