## MongoDB MCQ Inconsistency Database Builder

This notebook builds the MCQ Inconsistency database using the CodeMMLU benchmark.

In [None]:
import os
import sys
from tqdm import tqdm
import pandas as pd
import multiprocessing
import string
from dotenv import load_dotenv

In [None]:
# Derive the project root by walking two directory levels up from the current
# working directory, so that project packages can be imported further down.
curr_dir = os.getcwd()
par_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(par_dir)
# Must run before the project-local imports in the following cell.
sys.path.append(proj_dir)
# Load environment variables from a .env file; the MONGODB_* names read via
# os.getenv later in this notebook are presumably defined there.
load_dotenv()

In [None]:
from database import MongoDBHelper
from mcq_inconsistency.utility.codemmlu_helper import CodeGenerationCodeMMLUHelper
from utility.constants import CodeMMLU

In [None]:
# Create the MongoDB helper (configured with max_retries=5) and report
# whether the connectivity check succeeded.
db = MongoDBHelper(max_retries=5)
connection_ok = db.check_database_connectivity()
if connection_ok:
    print("MongoDB connected")

In [None]:
# Database handle that holds the benchmark question collections.
base_qns_db = db.client[os.environ.get('MONGODB_BENCHMARK_DATABASE')]

# Source CSV files, both located under the project root.
codemmlu_csv_path = os.path.join(proj_dir, "datasets/open_ended_format/codemmlu_test.csv")
humaneval_csv_path = os.path.join(proj_dir, "datasets/open_ended_format/humaneval_test_modified_open.csv")

# CodeMMLU questions; the first row of the file is the header.
codemmlu_database = pd.read_csv(codemmlu_csv_path, encoding="utf-8", header=0)

# HumanEval rows that supply the test suites and entry points.
# quoting=1 is the numeric value of csv.QUOTE_ALL.
humaneval_database = pd.read_csv(humaneval_csv_path, encoding="utf-8", header=0, quoting=1)

In [None]:
# Task type this notebook targets.
codemmlu_task = CodeMMLU.Tasks.CODE_COMPLETION

# Collection that receives the validated MCQ entries; its name comes from the environment.
codemmlu_collection_name = os.environ.get("MONGODB_CODEMMLU_COLLECTION")
mcq_question_database = base_qns_db[codemmlu_collection_name]

In [None]:
## Validate every CodeMMLU question against its HumanEval test suite and
## upload the cleaned entry to MongoDB.
##
## For each row:
##   1. The labelled answer must pass the test suite, otherwise the row is
##      recorded in `failed_to_upload`.
##   2. Every other choice is executed in a separate process; choices that
##      also pass the tests are dropped so that only one choice is correct.
##   3. The remaining choices are re-lettered (A, B, ...) and the entry is
##      upserted, read back, and validated once more.

# Maps an answer letter to its index in the `choices` list.
ans_dict = {
    "A" : 0,
    "B" : 1,
    "C" : 2,
    "D" : 3,
}

# Original task ids whose entries failed validation.
failed_to_upload = []
# Rows skipped for having too few choices left; keeps the generated ids contiguous for those skips.
num_failed = 0


def _reletter_choices(choice_dict, keys_to_rem, expected_ans):
    """Drop ``keys_to_rem`` from ``choice_dict`` and re-letter the survivors.

    The surviving choices keep their relative order and are assigned the
    letters A, B, C, ... The new letter of the correct answer is found while
    re-lettering, so it is always consistent with the returned mapping
    (e.g. answer "C" with "A" and "B" removed becomes "A").

    Args:
        choice_dict: Mapping of original letter -> choice text.
        keys_to_rem: Letters of the choices to discard.
        expected_ans: Original letter of the correct choice.

    Returns:
        A ``(new_dict, new_answer)`` tuple. ``new_answer`` is ``None`` if
        ``expected_ans`` is not among the surviving choices.
    """
    removed = set(keys_to_rem)
    new_dict = {}
    new_answer = None
    for key, choice in choice_dict.items():
        if key in removed:
            continue
        new_key = string.ascii_uppercase[len(new_dict)]
        new_dict[new_key] = choice
        if key == expected_ans:
            new_answer = new_key
    return new_dict, new_answer


for i in tqdm(range(len(codemmlu_database))):
    # Fallback label so the error handler below never reports the id of a
    # previous row (or fails on an unbound name) when this row breaks early.
    original_id = f"<row {i}>"

    try:
        input_task_id = f"CodeMMLUMCQ{i - num_failed}"

        codemmlu_qn = codemmlu_database.iloc[i]
        original_id = codemmlu_qn['task_id']
        question = codemmlu_qn['question']
        choices = codemmlu_qn['choices']
        expected_ans = codemmlu_qn['answer']

        # The HumanEval rows are matched to the CodeMMLU rows by position.
        humaneval_qn = humaneval_database.iloc[i]
        test_suite = humaneval_qn['test']
        func_name = humaneval_qn['entry_point']
        humaneval_id = humaneval_qn['task_id']

        if isinstance(choices, str):
            # SECURITY NOTE(review): eval() executes arbitrary code found in the CSV.
            # Only run this on trusted dataset files; consider ast.literal_eval
            # if the column always holds a plain list literal.
            choices = eval(choices)

        question, qn_desc = CodeGenerationCodeMMLUHelper.seperate_original_desciptions(question)
        qn_desc, examples = CodeGenerationCodeMMLUHelper.extract_examples(qn_desc)
        choices = [
            CodeGenerationCodeMMLUHelper._standardize_leading_whitespaces(choice)
            for choice in choices
        ]

        correct_choice = choices[ans_dict[expected_ans]]
        full_sol = question + "\n" + correct_choice

        # Sanity check: the labelled answer must pass the test suite.
        test_suite = CodeGenerationCodeMMLUHelper.process_original_tests(test_suite)

        validate_full_sol = CodeGenerationCodeMMLUHelper.check_test_case(
            test_case=test_suite,
            code_snippet=full_sol,
            func_name=func_name,
        )

        if not validate_full_sol:
            failed_to_upload.append(original_id)
            raise ValueError("Full Solution failed the test suite")

        ## Check the other choices to ensure that they do NOT pass the test suite.
        choice_dict = {
            string.ascii_uppercase[idx]: choice for idx, choice in enumerate(choices)
        }

        # Letters of distractors that unexpectedly pass the tests.
        keys_to_rem = []

        for key, choice in choice_dict.items():
            if key == expected_ans:
                continue

            test_sol = question + "\n" + choice

            try:
                multiprocessing_queue = multiprocessing.Queue()

                # Run the candidate in its own process so that a hanging
                # solution can be killed.
                verify_answer_process = multiprocessing.Process(
                    target=CodeGenerationCodeMMLUHelper.run_llm_answer,
                    args=(test_sol, test_suite, func_name, multiprocessing_queue),
                )

                verify_answer_process.start()
                verify_answer_process.join(timeout=5)

                if verify_answer_process.is_alive():
                    # Still running after the timeout (possibly an infinite
                    # loop): kill it and keep the choice as a distractor.
                    verify_answer_process.kill()
                    verify_answer_process.join()
                    print(f"{input_task_id}: Option {key} ran for too long")
                    continue

                if not multiprocessing_queue.empty():
                    # The worker put something on the queue (presumably the
                    # error it hit), so this choice does not pass the tests
                    # and stays in as a valid distractor.
                    continue

                # Nothing was reported: the distractor passes the tests too,
                # so it has to be removed from the question.
                keys_to_rem.append(key)

            except Exception as e:
                # The check itself could not be run; keep the choice, but do
                # not hide the reason.
                print(f"{input_task_id}: Option {key} could not be checked: {e}")
                continue

        # Remove the passing distractors and re-letter what is left; the
        # answer letter is recomputed from the re-lettered mapping.
        new_dict, expected_ans = _reletter_choices(choice_dict, keys_to_rem, expected_ans)

        if len(new_dict) <= 1:
            num_failed += 1
            print(f"{original_id} was not uploaded as it only had 1 valid choice left.")
            continue

        database_entry = {
            "_id": input_task_id,
            "question": question,
            "qn_desc": qn_desc,
            "choices": new_dict,
            "check": test_suite,
            "answer": expected_ans,
            "examples": examples,
            "func_name": func_name,
            "original_id": original_id,
            "corresponding_humaneval_id": humaneval_id
        }

        # Replace an existing entry with this id, or insert it if it is new.
        mcq_question_database.find_one_and_replace(
            {"_id": input_task_id},
            database_entry,
            upsert=True,
        )

        ## Next, ensure that the entry as stored in the DB still passes its own
        ## tests, using only the values read back from the database.
        db_entry = mcq_question_database.find_one(filter={"_id": input_task_id})

        stored_solution = db_entry['question'] + '\n' + db_entry['choices'][db_entry['answer']]

        validate_full_sol = CodeGenerationCodeMMLUHelper.check_test_case(
            test_case=db_entry['check'],
            code_snippet=stored_solution,
            func_name=db_entry['func_name'],
        )

        if validate_full_sol is not True:
            # Do not leave a broken entry behind.
            mcq_question_database.find_one_and_delete({"_id": input_task_id})
            failed_to_upload.append(original_id)

    except Exception as e:
        print(original_id, f"failed to upload into the database due to following error: {e}")
    

In [None]:
# Summarise the run: list the task ids that failed, if there were any.
if failed_to_upload:
    print(f"The following tasks failed: {failed_to_upload}")
else:
    print('All tasks uploaded successfully!')

## Notes: 
* Modified rt00012 answer from B to D. Original answer (B) was incorrect and D is correct.
* Modified rt00052 examples by removing the `>>> remove_vowels("abcdef\nghijklm")` example, because the `\n` in it breaks the processing.
* Modified the examples of rt00067 through rt00164 to use doctest format.