## MongoDB HumanEval Code Generation Database Builder

This notebook initializes a new code generation database in MongoDB. A custom HumanEval dataset in CSV format is processed into a fixed document format and stored in your MongoDB cluster.

In [None]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv

In [None]:
# The project root is taken to be two directory levels above the current
# working directory. This assumes the notebook is run from its own folder;
# otherwise change the path below.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(parent_dir)

# Load the modified open-ended HumanEval dataset; the first CSV row is the header.
dataset_path = os.path.join(
    proj_dir,
    "datasets/open_ended_format/humaneval_test_modified_open.csv",
)
humaneval = pd.read_csv(dataset_path, header=0, encoding='unicode_escape')

# Put the project root on the import path so its packages can be imported.
sys.path.append(proj_dir)

In [None]:
from code_generation.utility.humaneval_helper import CodeGenerationHumanEvalHelper
from database import MongoDBHelper

## Connecting to MongoDB

In [None]:
# Create the project's MongoDB helper (imported from `database`) and run its
# connectivity check.
# NOTE(review): presumably this raises or reports an error when the cluster
# cannot be reached -- confirm against MongoDBHelper.
mongodbHelper = MongoDBHelper()
mongodbHelper.check_database_connectivity()

In [None]:
# Load variables from a .env file into the environment, then select the
# database and collection whose names are stored in those variables.
load_dotenv()

benchmark_db_name = os.getenv('MONGODB_BENCHMARK_DATABASE')
humaneval_collection_name = os.getenv('MONGODB_HUMANEVAL_CG_COLLECTION')

db = mongodbHelper.client[benchmark_db_name]
open_ended_db = db[humaneval_collection_name]

## Running the Database Storing Process

In [None]:
# Turn every row of the HumanEval dataframe into a document and store it in the
# MongoDB collection. A row is skipped when no examples are extracted from its
# prompt, or when its canonical solution does not pass its own check function.
# The original task ids of skipped rows are collected and printed at the end
# so they can be reviewed manually.
to_check_example = []   # original task ids rejected for having no examples
to_check_test = []      # original task ids whose canonical solution failed the check

# Number of questions stored so far; also the numeric suffix of the next "_id".
# An explicit counter keeps the stored ids contiguous. Deriving the suffix from
# the sizes of the rejection collections leaves gaps when a set silently drops
# a repeated entry (two consecutive failures share the same candidate id).
accepted = 0

for _, qn in humaneval.iterrows():

    qn_id = f"HumanEvalo{accepted}"

    prompt = qn['prompt']
    canonical_solution = qn['canonical_solution']
    tests = qn['test']
    original_test_id = qn['task_id']
    func_name = qn['entry_point']

    new_qn, qn_desc = CodeGenerationHumanEvalHelper.seperate_original_desciptions(prompt)

    desc, examples = CodeGenerationHumanEvalHelper.extract_examples(qn_desc)

    ## Rejecting any questions where there are no examples
    if not examples:
        # Record the original task id: the candidate qn_id is reused by the
        # next accepted question and would not identify this row.
        to_check_example.append(original_test_id)
        continue

    ## Cleaning the check function
    check_function = CodeGenerationHumanEvalHelper.process_original_tests(tests)

    ## Obtaining the full solution
    full_solution = new_qn + "\n" + canonical_solution

    ## Ensuring that the full solution passes the check function
    code_validation = CodeGenerationHumanEvalHelper.check_test_case(
        test_case=check_function,
        code_snippet=full_solution,
        func_name=func_name,
    )

    # Only a literal True counts as a pass; any other return value is a failure.
    if code_validation is not True:
        to_check_test.append(original_test_id)
        continue

    entry_dict = {
        "_id": qn_id,
        "qn": new_qn,
        "canon_solution": canonical_solution,
        "qn_desc": desc,
        "examples": examples,
        "check": check_function,
        "original_id": original_test_id,
        "func_name": func_name,
    }

    # Query the collection only for rows that will actually be stored, then
    # insert a new document or overwrite the fields of the existing one.
    if open_ended_db.find_one({"_id": qn_id}) is None:
        open_ended_db.insert_one(entry_dict)
        print(f'Added entry to database: {qn_id}')
    else:
        open_ended_db.update_one({"_id": qn_id}, update={"$set": entry_dict})
        print(f'Updated existing entry in database: {qn_id}')

    accepted += 1

print(to_check_example if len(to_check_example) > 0 else "All cases contains examples. Nothing to check!")
print(to_check_test if len(to_check_test) > 0 else "All test cases passed. Nothing to check!")

### Notes:
- Modified question HumanEval/10 to contain nested functions for uniformity.
- HumanEval/41, HumanEval/38, and HumanEval/50 did not have any examples in their questions and were excluded during our experimentation. They can be added back in for zero-shot prompting.
- HumanEval/66 to HumanEval/163 were modified to fit the standard doctest format.