### Imports

In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.getcwd())

import constants
from human_eval_utils import parse_python_code, construct_test_program, code_runs_without_errors, save_results
from generic_agents.CodeInterpreterAgent import CodeInterpreterAgent

### Loading the HumanEval Dataset

In [2]:
from human_eval_utils import load_human_eval

dataset = load_human_eval()

print(len(dataset))
task = dataset[-131]
# print a random example
print(task.keys())
print(task['prompt'])
print(task['entry_point'])
# print(task['test'])

Using the latest cached version of the dataset since openai_humaneval couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'openai_humaneval' at /home/pedro/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/0.0.0/7dce6050a7d6d172f3cc5c32aa97f52fa1a2e544 (last modified on Tue Jul 16 13:00:03 2024).


164
dict_keys(['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'])


def sort_third(l: list):
    """This function takes a list l and returns a list l' such that
    l' is identical to l in the indicies that are not divisible by three, while its values at the indicies that are divisible by three are equal
    to the values of the corresponding indicies of l, but sorted.
    >>> sort_third([1, 2, 3])
    [1, 2, 3]
    >>> sort_third([5, 6, 3, 4, 8, 9, 2])
    [2, 6, 3, 4, 8, 9, 5]
    """

sort_third


### Single Agent Coder

#### CodeInterpreterAgent

In [43]:
system_prompt = """ 
    You are an expert software engineer. You are asked to write code to solve a problem 
    which involves creating a Python method to solve a problem indicated as the comments of the method. 
    The code should be efficient and correct. The code should be written in Python. 
    Store the code solution in a file, and provide the file path as the answer.
    Do not add any assertions to the code, just complete the method. 
    Any library imports should be inside the new method, not at the top of the file.
"""

n_problems = 3
single_agent_output_dir = 'CodeGenerations/single_agent_coder'
single_agent_coder = CodeInterpreterAgent(system_prompt=system_prompt, agent_name="single_agent_coder")
print(f'Number of messages: {single_agent_coder.get_number_messages()}')

n_correct = 0
for n_problem in range(n_problems):
    task = dataset[-n_problem]
    
    solution_file_path = os.path.join(single_agent_output_dir, f'problem_{n_problem}.py')
    test_file_save_path = os.path.join(single_agent_output_dir, f'test_{n_problem}.py')

    response = single_agent_coder.prompt_with_output_file(task['prompt'], file_path=solution_file_path)
    code_runs = single_agent_coder.test_human_eval_solutions(solution_file_path=solution_file_path, test_code=task['test'], 
                                        method_name=task['entry_point'], test_file_save_path=test_file_save_path)
    
    print(f'Number of messages: {single_agent_coder.get_number_messages()}')
    print(f'Problem {n_problem} - Tests Passed: {code_runs}')

    if code_runs:
        n_correct += 1
    single_agent_coder.delete_all_messages()

print(f'Correct: {n_correct}/{n_problems}')

Number of messages: 0
Number of messages: 2
Problem 0 - Tests Passed: True
AssertionError in check_code_execution: Test 1
Number of messages: 2
Problem 1 - Tests Passed: False
Number of messages: 2
Problem 2 - Tests Passed: True
Correct: 2/3


#### MultiTurnLLMAgent

In [None]:
from generic_agents.MultiTurnLLMAgent import MultiTurnLLMAgent

system_prompt = """ 
    You are an expert software engineer. You are asked to write code to solve a problem 
    which involves creating a Python method to solve a problem indicated as the comments of the method. 
    You should only output the completition of the method, not the entire file.
"""

agent = MultiTurnLLMAgent(system_prompt=system_prompt)
n_tasks = len(dataset)
results = {
    "score": 0, 
    "test_counts": {"NoError": 0, "Error": 0, "AssertionError": 0, "IncorrectInput": 0}, 
    "tests_results": {}
}

n_correct = 0
for task_number in range(n_tasks):
    task = dataset[task_number]
    
    response = agent.user_prompt(task['prompt'])
    completion = parse_python_code(response)

    test_path = constants.HUMAN_EVAL_SINGLE_AGENT_DIR + 'test_files/' + f'problem_{task_number}.py'
    test_program = construct_test_program(task['prompt'], completion, task['test'], task['entry_point'], save_path=test_path)
    (code_works, reason) = code_runs_without_errors(file_path=test_path)
    
    print(f'Problem {task_number} - Tests Passed: {code_works}')
    if code_works:
        n_correct += 1
    if reason in results["test_counts"]:
        results["test_counts"][reason] += 1
    else:
        results["test_counts"][reason] = 1
    results["tests_results"][task_number] = (code_works, reason)

    agent.reset_messages()

results["score"] = round(n_correct/n_tasks * 100, 4)
save_results(results, constants.HUMAN_EVAL_SINGLE_AGENT_DIR)
print(f'Correct: {n_correct}/{n_tasks}')
print(results)

### LangChain AgentCoder

In [3]:
from multi_agent_graph import get_multi_agent_summarizer_graph
from IPython.display import Image, display

graph = get_multi_agent_summarizer_graph()
app = graph.compile()
# display(Image(app.get_graph(xray=True).draw_mermaid_png())) 

def agent_coder(task_prompt, max_iterations=3, passed_tests_threshold=0.7):
    config = {"recursion_limit": 50}
    inputs = {
        "incomplete_method": task_prompt,
        "max_iterations": max_iterations,
        "passed_tests_threshold": passed_tests_threshold,
    }

    for event in app.stream(inputs, config=config):
        print("Event:", event)

    
    last_event_name = list(event.keys())[0]
    return event[last_event_name]['completed_method']




Initializing CodeInterpreterAgent with thread id:  Thread(id='thread_2zEQgxZ3YWD0wgsNxeMw4v4t', created_at=1722098890, metadata={}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=None)) 





In [7]:
n_tasks = len(dataset)
passed_tests_threshold = 0.7
max_iterations = 3

results = {
    "score": 0, 
    "test_counts": {"NoError": 0, "Error": 0, "AssertionError": 0, "IncorrectInput": 0}, 
    "tests_results": {}
}

n_correct = 0
for task_number in range(n_tasks):
    task = dataset[task_number]
    
    completion = agent_coder(task['prompt'], max_iterations=max_iterations, passed_tests_threshold=passed_tests_threshold)

    test_path = constants.HUMAN_EVAL_AGENT_CODER_DIR + 'test_files/' + f'problem_{task_number}.py'
    test_program = construct_test_program(task['prompt'], completion, task['test'], task['entry_point'], save_path=test_path)
    (code_works, reason) = code_runs_without_errors(file_path=test_path)
    
    print(f'Problem {task_number} - Tests Passed: {code_works}' + f' - Reason: {reason if not code_works else ""}')
    if code_works:
        n_correct += 1
    if reason in results["test_counts"]:
        results["test_counts"][reason] += 1
    else:
        results["test_counts"][reason] = 1
    results["tests_results"][task_number] = (code_works, reason)

results["score"] = round(n_correct/n_tasks * 100, 4)
save_results(results, constants.HUMAN_EVAL_AGENT_CODER_DIR)
print(f'Correct: {n_correct}/{n_tasks}')
print(results)

In AgentTestGenerator
Event: {'Test Generator': {'incomplete_method': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n', 'generated_tests': '\nassert has_close_elements([1.0, 2.0, 3.0], 0.5) == False, "Test Case 1: No two elements are within 0.5 units of each other."\nassert has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True, "Test Case 2: 2.8 and 2.0 are within 0.3 units."\nassert has_close_elements([1.0, 1.4, 1.3, 2.0], 0.5) == True, "Test Case 3: 1.4 and 1.3 are within 0.5 units."\nassert has_close_elements([10.0, 10.3, 9.5], 0.4) == True, "Test Case 4: 10.0 and 10.3 are within 0.4 units."\nassert has_close_elements([], 0.5) == False, "Test Case 5: Empty list shou

  improvement from the last ten iterations.
  root = fsolve(polynomial_function, initial_guess)


Event: {'Test Generator': {'incomplete_method': '\n\ndef sort_third(l: list):\n    """This function takes a list l and returns a list l\' such that\n    l\' is identical to l in the indicies that are not divisible by three, while its values at the indicies that are divisible by three are equal\n    to the values of the corresponding indicies of l, but sorted.\n    >>> sort_third([1, 2, 3])\n    [1, 2, 3]\n    >>> sort_third([5, 6, 3, 4, 8, 9, 2])\n    [2, 6, 3, 4, 8, 9, 5]\n    """\n', 'generated_tests': '\nassert sort_third([1, 2, 3]) == [1, 2, 3], "Test Case 1: No elements at indices divisible by 3"\nassert sort_third([5, 6, 3, 4, 8, 9, 2]) == [2, 6, 3, 4, 8, 9, 5], "Test Case 2: Sort elements at indices divisible by 3"\nassert sort_third([10, 20, 30, 40, 50, 60, 70, 80, 90]) == [70, 20, 30, 40, 50, 60, 10, 80, 90], "Test Case 3: Multiple elements with sorting at index 0 and 6"\nassert sort_third([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) == [7, 2, 3, 4, 5, 6, 1, 8, 9, 10], "Test Case 4: Sorti

In [6]:
from llm_agents.AgentTestExecutor import AgentTestExecutor
from data_classes.AgentCoderState import AgentCoderState
state = AgentCoderState(completed_method='def sum_x_y():\n    return x + y\n', generated_tests='assert sum_x_y(1, 2) == -3', passed_tests_threshold=0.7, max_iterations=3, current_iterations=0)


for i in range(5):
    test_executor = AgentTestExecutor()
    macm_state = test_executor(state)
    test_executor.delete_all_messages()

In AgentTestExecutor, with thread id:  Thread(id='thread_B9l9Ksth8fkj0yttxelVHWrq', created_at=1722098902, metadata={}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=None))
Current iterations: 1
Tests passed: False. Proportion tests passed: 0.0
Feedback: 
The code fails the test. The assertion `assert sum_x_y(1, 2) == -3` is not satisfied, as the actual output of `sum_x_y(1, 2)` is `3`.
In AgentTestExecutor, with thread id:  Thread(id='thread_B9l9Ksth8fkj0yttxelVHWrq', created_at=1722098902, metadata={}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=None))
Current iterations: 2
Tests passed: False. Proportion tests passed: 0.0
Feedback: 
The assertion `assert sum_x_y(1, 2) == -3` is not satisfied, as the actual output of `sum_x_y(1, 2)` is `3`.
In AgentTestExecutor, with thread id:  Thread(id='thread_B9l9Ksth8fkj0yttxelVHWrq', created_at=1722098902, metadata={}, object='thread', tool_resources=ToolResources(code_inte