# Turbulence Benchmark Database Builder

The Turbulence benchmark is used to evaluate the robustness of LLMs in code generation tasks. This notebook takes the benchmark's source code and adapts it for testing with MuCoCo. Building the database is very computationally expensive when run locally, so consider running it on a virtual machine to speed up the build.

In [None]:
import os
import random
from typing import Iterable
from tqdm import tqdm
import sys


In [None]:
# Resolve the directory two levels above the current working directory and put
# it on the import path (presumably so the project's `utility` and `database`
# packages, imported in the next cell, can be found).
curr_dir = os.getcwd()
par_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(par_dir)

# Notebook cells are often re-run; only append once so sys.path does not
# accumulate duplicate entries.
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

In [None]:
from utility.helper_functions import TurbulenceBenchmarkHelper
from database import MongoDBHelper
from utility.constants import Seed

# Connecting to MongoDB

In [None]:
# Open the MongoDB connection through the project's helper and resolve the
# collection that stores the Turbulence baseline questions.
db = MongoDBHelper(max_retries=5)
if db.check_database_connectivity():
    print("MongoDB connected")
else:
    # Surface the problem here rather than as a confusing error in a later cell.
    print("WARNING: MongoDB connectivity check failed; cells that use the database will likely fail.")

# The database and collection names come from the environment. Fail early with
# a clear message instead of indexing the client with None.
turbulence_db_name = os.getenv('MONGODB_TURBULENCE_DATABASE')
turbulence_collection_name = os.getenv('MONGODB_TURBULENCE_COLLECTION')
missing_env_vars = [
    env_name
    for env_name, env_value in (
        ('MONGODB_TURBULENCE_DATABASE', turbulence_db_name),
        ('MONGODB_TURBULENCE_COLLECTION', turbulence_collection_name),
    )
    if not env_value
]
if missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_env_vars)}"
    )

# NOTE(review): `db.client` is presumably a pymongo MongoClient - confirm against MongoDBHelper.
base_qns_db = db.client[turbulence_db_name]
baseline_db = base_qns_db[turbulence_collection_name]


In [None]:
# Locate the benchmark sources next to this notebook and collect the names of
# the sub-directories found there (files are ignored).
curr_dir = os.getcwd()
source_code_dir = os.path.join(curr_dir, "Source_Code (from Turbulence GitHub)")
qn_folders = list(
    filter(
        lambda entry_name: os.path.isdir(os.path.join(source_code_dir, entry_name)),
        os.listdir(source_code_dir),
    )
)

## Processing Turbulence Dataset and storing into MongoDB

In [None]:

# %%script false --no-raise-error

# Builds one MongoDB document per Turbulence question folder:
#   1. generate the question's parameter sets and the matching function inputs,
#   2. fill the solution/test templates with each parameter set,
#   3. keep only the samples whose filled-in solution passes the filled-in tests,
#   4. record the canonical solution's output for every kept sample,
#   5. insert (or replace) the question's document in `baseline_db`.


def _fill_placeholders(template, values):
    """Replace every `$<i>` placeholder in `template` with `str(values[i])`.

    Substitution runs from the highest index down so that replacing `$1` can
    never clobber the prefix of `$10`, `$11`, ...
    """
    for position, value in reversed(list(enumerate(values))):
        template = template.replace(f"${position}", str(value))
    return template


def _discard_sample(sample_idx, params_list, inputs_list, failures, question_no):
    """Remove sample `sample_idx` from both lists and count one failure for `question_no`."""
    params_list.pop(sample_idx)
    inputs_list.pop(sample_idx)
    failures[question_no] = failures.get(question_no, 0) + 1


seed = Seed.value
random.seed(seed)

# While a question is being processed: q_no -> number of rejected samples.
# Once the question is finished the value is rewritten to a "passed/total" string.
failed_params = {}

for qn_idx, qn_folder_name in enumerate(tqdm(qn_folders)):
    # The question number is whatever follows the last "Q" in the folder name.
    q_no = qn_folder_name.split("Q")[-1]
    if q_no == "":
        continue

    print(q_no)

    func_name = None            # stores the task function name

    # In this benchmark, each question is kept in an individual folder.
    qn_folder_dir = os.path.join(source_code_dir, qn_folder_name)

    # initializing a TurbulenceBenchmarkHelper object with the question number and seed
    helper = TurbulenceBenchmarkHelper(q_no=q_no, seed=seed)

    # 1. Generating the params to substitute into the solutions etc.
    gen_params_res = helper.run_gen_params(
        qn_folder_dir=qn_folder_dir,
    )

    # Number of parameter sets before any are rejected; used for the pass summary.
    og_params_res = len(gen_params_res)

    # Ensuring that each parameter generated is iterable, else it is converted to a Tuple.
    # This step is necessary as some test functions require an iterable input.
    # NOTE(review): a bare string is also an Iterable, so it is left unwrapped and
    # would be substituted character by character - confirm no question generates one.
    gen_params_res = [(param, ) if not isinstance(param, Iterable) else param for param in gen_params_res]

    # 2. Using the generated params to generate the function inputs.
    input_generator_res = helper.run_input_generator(
        qn_folder_dir=qn_folder_dir,
        gen_params=gen_params_res
    )

    # 3. Obtaining the solution, natural language and test templates
    sol_template = TurbulenceBenchmarkHelper.return_template_contents(
        dir=os.path.join(qn_folder_dir, "solution.py.template")
    )

    prompt_template = TurbulenceBenchmarkHelper.return_template_contents(
        dir=os.path.join(qn_folder_dir, "question.txt.template")
    )

    tests_template = TurbulenceBenchmarkHelper.return_template_contents(
        dir=os.path.join(qn_folder_dir, "tests.py.template")
    )

    # Removing the unnecessary import statements in test template
    tests_template = helper.process_test_cases(test_template=tests_template)

    # Iterating through each sample generated for the task. A `while` loop is used
    # because rejected samples are popped in place; `idx` only advances when the
    # sample at `idx` is kept, so every `continue` below must follow a discard.
    idx = 0
    while idx < len(input_generator_res):
        # Q29 is capped at its first 9 kept samples. Refer to the Turbulence failure
        # documentation for the reason why an exception has to be made for q_no 29.
        if q_no == "29" and idx >= 9:
            failed_params[q_no] = og_params_res - idx
            gen_params_res = gen_params_res[:idx]
            input_generator_res = input_generator_res[:idx]
            break

        params = gen_params_res[idx]
        func_input = input_generator_res[idx]

        solution = _fill_placeholders(sol_template, params)
        tests = _fill_placeholders(tests_template, params)

        if func_name is None:
            try:
                func_name = helper.obtain_func_name(
                    sol_template=solution,
                    qn_txt_template=prompt_template
                )
            except Exception as e:
                # Reject this sample so the loop makes progress; retrying the same
                # index would fail the same way forever.
                print(f"Failed to obtain the function name for {q_no}: {e}")
                _discard_sample(idx, gen_params_res, input_generator_res, failed_params, q_no)
                continue

        tests = helper.replace_func_name(tests_template=tests, func_name=func_name)

        try:
            helper.run_test_suite(tests=tests, solution=solution)
        except Exception:
            # The canonical solution does not pass its own tests for these params.
            _discard_sample(idx, gen_params_res, input_generator_res, failed_params, q_no)
            continue

        try:
            ans = helper.obtain_canon_sol_output(
                solution=solution,
                test_input=func_input,
                func_name=func_name
            )
        except Exception:
            # Without an expected output the sample cannot be stored as an
            # (input, output) pair, so it is rejected instead of being kept malformed.
            print(f"Failed to obtain the canon solution output for {q_no}")
            _discard_sample(idx, gen_params_res, input_generator_res, failed_params, q_no)
            continue

        input_generator_res[idx] = (func_input, ans)
        idx += 1

    # Turn the failure count (if any) into a "passed/total" summary string.
    if failed_params.get(q_no, 0) != 0:
        failed_params[q_no] = f"{og_params_res - failed_params[q_no]}/{og_params_res}"

    # Ensuring that Q48, which has no parameters passing, does not get uploaded into the Turbulence database
    if not input_generator_res or not gen_params_res:
        continue

    # Parameter and test inputs dictionary, keyed by the sample's position as a string.
    param_dict = {}
    for sample_idx, (params_entry, io_pair) in enumerate(zip(gen_params_res, input_generator_res)):
        input_entry, output_entry = io_pair[0], io_pair[1]
        param_dict[str(sample_idx)] = {
            "params": helper.process_for_mongo_db_storage(params_entry),
            "func_input": helper.process_for_mongo_db_storage(input_entry),
            "func_output": helper.process_for_mongo_db_storage(output_entry)
        }

    original_id = f"Q{q_no}"

    # Reuse the stored document's _id when this question is already in the
    # collection, so the replace below always targets that document even if the
    # folder order (and therefore qn_idx) differs from the run that created it.
    existing_entry = baseline_db.find_one({"original_id": original_id})
    if existing_entry is not None:
        task_id = existing_entry["_id"]
    else:
        task_id = f"TurbulenceQ{qn_idx + 1}"

    # Creating the database entry. Note that "question_template" holds the
    # processed tests template, not the natural-language prompt.
    database_entry = {
        "_id": task_id,
        "question_template": tests_template,
        "prompt_template": prompt_template,
        "solution_template": sol_template,
        "func_name": func_name,
        "params": param_dict,
        "original_id": original_id
    }

    # Storing entry into the baseline database
    try:
        if existing_entry is not None:
            baseline_db.find_one_and_replace({"_id": task_id}, database_entry)
        else:
            baseline_db.insert_one(database_entry)
    except Exception as e:
        print(e)
        continue
    

Not all Turbulence cases pass the check function. The questions listed below, taken from the original Turbulence database, did not fully pass the test suite. The reasons for each failure are documented in a PDF located in ...

Number of test cases passed for Q8: 93

Number of test cases passed for Q13: 4

Number of test cases passed for Q41: 99

Number of test cases passed for Q48: 0

Number of test cases passed for Q21: 97

Number of test cases passed for Q29: 9

Number of test cases passed for Q42: 6

### Printing all tasks that failed

In [None]:
%%script false --no-raise-error

# Report, for every question recorded in `failed_params`, the stored pass summary.
print("The following tasks failed the following parameters:")
for failed_q_no in failed_params:
    pass_summary = failed_params[failed_q_no]
    print(f"    Q{failed_q_no} - Canon solution pass count {pass_summary}.")

## Retrieving documents stored in Turbulence datasets and checking test oracle

This step is necessary because it ensures that the experiment remains replicable and valid after the data has been stored in MongoDB. Any test that fails this step should be removed from the MongoDB database, as it is deemed an invalid test set.

In [None]:
# Re-validates what was stored in MongoDB: for every stored sample, the solution
# and test templates are filled in again from the stored params, the test suite
# is re-run, and the canonical solution's output is checked against the stored one.
num_tasks = baseline_db.count_documents({})
seed = Seed.value
tot_tasks = 0

# Task ids TurbulenceQ1 .. TurbulenceQ60 are probed; ids that are not in the
# collection are reported and skipped.
for idx in range(60):

    q_no = idx + 1
    task_id = f"TurbulenceQ{q_no}"

    doc = baseline_db.find_one({"_id": task_id})

    if doc is None:
        print(f"Could not retrieve {task_id} from the dataset.")
        continue

    # "question_template" is read back as the tests template below.
    qn_template = doc["question_template"]
    solution_template = doc["solution_template"]
    func_name = doc["func_name"]
    param_dict: dict = doc["params"]
    original_id: str = doc['original_id']

    # The helper is keyed by the benchmark's own question number, not the task id.
    original_q_no = original_id.split("Q")[-1]

    for task in param_dict.values():
        params = task['params']
        func_input = task['func_input']
        func_output = task['func_output']
        solution = solution_template
        tests = qn_template

        helper = TurbulenceBenchmarkHelper(q_no=original_q_no, seed=seed)

        # Stored values are {"data", "metadata"} pairs; convert them back before use.
        processed_params = helper.convert_data_to_metadata(data=params["data"], metadata=params['metadata'])

        # Substitute from the highest placeholder index down so that `$1` cannot
        # clobber the prefix of `$10`, `$11`, ...
        for param_idx, processed_param in reversed(list(enumerate(processed_params))):
            solution = solution.replace(f"${param_idx}", str(processed_param))
            tests = tests.replace(f"${param_idx}", str(processed_param))

        tests = helper.replace_func_name(tests_template=tests, func_name=func_name)

        # Check 1: the filled-in solution must still pass the filled-in test suite.
        try:
            helper.run_test_suite(tests=tests, solution=solution)
        except Exception as e:
            print(tests)
            print(solution)
            print(type(e), e)
            print(f'{task_id} failed the tests when retrieved')
            # The failing entry is only reported here; it is not deleted automatically.
            # baseline_db.find_one_and_delete({"_id": task_id})

        # Check 2: the canonical solution must reproduce the stored output.
        # NOTE(review): this relies on verify_prog_answer raising on a mismatch;
        # its return value is not inspected - confirm against the helper.
        try:
            processed_func_input = helper.convert_data_to_metadata(func_input['data'], func_input['metadata'])
            processed_func_output = helper.convert_data_to_metadata(func_output['data'], func_output['metadata'])
            helper.verify_prog_answer(
                canonical_sol=solution,
                func_input=processed_func_input,
                func_name=func_name,
                func_output=processed_func_output,
            )

        except Exception as e:
            print(solution)
            print(type(e), e)
            print(func_input, "---", func_output)
            print(f'{task_id} canon solution returned a different output')
    print(original_id, len(param_dict))
    # Counts every stored sample of the task, including any reported as failing above.
    tot_tasks += len(param_dict)

print(f"A total of {tot_tasks} is available for testing.")
    