### Imports

In [1]:
!pip3 install pandas
!pip3 install pyarrow
!pip3 install fsspec
!pip3 install huggingface_hub



In [2]:
import pandas as pd
import ast
import traceback
from dotenv import load_dotenv
import openai
import os
import random
from collections import defaultdict


### Differential Testing

#### Note: only generate inputs once for each test case
output probability distributions for each program: e.g. probability dif of gen prog to ref prog for each one

In [3]:
def string_to_function(func_string):
    """
    Convert a function string to an executable function
    """
    try:
        func_string = func_string.strip().removeprefix("```python").removeprefix("```").strip("`").strip()
        local_scope = {}
        exec(func_string, {}, local_scope)
        func_name = next(iter(local_scope))  # Assuming there's only one function in the string
        return local_scope[func_name]
    except Exception as e:
        print(f"Error creating function from string: {e}")
        return None

In [4]:
def parse_test_case(test_case_str):
    """
    Parse test cases from assertion strings
    """
    try:
        # Remove 'assert' and split function call from expected result
        test_case_str = test_case_str.replace('assert ', '').strip()
        
        # Find the function call and expected result
        call_end = test_case_str.rfind('==')
        func_call = test_case_str[:call_end].strip()
        expected_result = ast.literal_eval(test_case_str[call_end+2:].strip())

        
        # Parse the function call to extract function name and arguments
        func_call_parts = func_call.split('(', 1)
        func_name = func_call_parts[0].strip()
        # Parse arguments
        args_str = func_call_parts[1].rstrip(')').strip()
        # Use ast to safely evaluate arguments
        args = ast.literal_eval(f'[{args_str}]')
        
        return {
            'function_name': func_name,
            'args': args,
            'expected_result': expected_result
        }
    except Exception as e:
        print(f"Error parsing test case {test_case_str}: {e}")
        return None

In [5]:
def differential_tester(reference_program, generated_programs, test_cases):
    """
    Differential tester with specific test cases
    
    Parameters:
    - reference_program: Reference implementation as a string
    - generated_programs: List of candidate implementations as strings
    - test_cases: List of test case assertion strings
    """
    # Convert reference function
    reference_func = string_to_function(reference_program)
    
    if reference_func is None:
        print("Invalid reference function")
        return None
    
    # Store test results
    test_results = {}
    
    # Parse test cases
    parsed_test_cases = []
    for test_case_str in test_cases:
        parsed_case = parse_test_case(test_case_str)
        if parsed_case:
            parsed_test_cases.append(parsed_case)
    
    # Test each generated program against the reference
    for i, program_str in enumerate(generated_programs, 1):
        candidate_func = string_to_function(program_str)
        
        if candidate_func is None:
            print(f"Program {i} is invalid. Skipping.")
            test_results[f"Program {i}"] = "Invalid Implementation"
            continue
        
        # Track if this program passes all tests
        program_passes = True
        failed_tests = []
        
        # Run tests
        for test_case in parsed_test_cases:
            try:
                # Get function arguments from parsed test case
                args = test_case['args']
                
                # Run reference and candidate functions
                ref_output = reference_func(*args)
                cand_output = candidate_func(*args)
                
                # Check if outputs match
                if ref_output != cand_output:
                    print(f"Mismatch for inputs {args}:")
                    print(f"Reference output: {ref_output}")
                    print(f"Candidate output: {cand_output}")
                    program_passes = False
                    failed_tests.append({
                        'inputs': args,
                        'reference_output': ref_output,
                        'candidate_output': cand_output
                    })
            
            except Exception as e:
                print(f"Error testing Program {i} with inputs {args}:")
                print(traceback.format_exc())
                program_passes = False
                failed_tests.append({
                    'inputs': args,
                    'error': str(e)
                })
        
        # Record test result
        test_results[f"Program {i}"] = {
            'passes': program_passes,
            'failed_tests': failed_tests
        }
    
    return test_results

In [6]:
def run_differential_testing(reference_program, generated_programs, test_cases):
    # Run differential testing
    results = differential_tester(reference_program, generated_programs, test_cases)
    print("\nTest Results:")
    for program, result in results.items():
        print(f"{program}: {'Passes' if result['passes'] else 'Fails'}")
        if not result['passes']:
            print("Failed Tests:")
            for failed_test in result['failed_tests']:
                print(f"  Inputs: {failed_test.get('inputs', 'N/A')}")
                if 'error' in failed_test:
                    print(f"  Error: {failed_test['error']}")
                else:
                    print(f"  Reference Output: {failed_test['reference_output']}")
                    print(f"  Candidate Output: {failed_test['candidate_output']}")

##### Testing

In [7]:
# Example usage
reference_program = """
def zip_list(lists1, lists2):
    return [list1 + list2 for list1, list2 in zip(lists1, lists2)]
"""

generated_programs = [
    """
def zip_list(lists1, lists2):
    return [l1 + l2 for l1, l2 in zip(lists1, lists2)]
    """,
    """
def zip_list(lists1, lists2):
    result = []
    for list1, list2 in zip(lists1, lists2):
        result.append(list1 + list2)
    return result
    """
]

# Sample test cases from the dataset
test_cases = [
    "assert zip_list([[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]) == [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]"
]

run_differential_testing(reference_program, generated_programs, test_cases)


Test Results:
Program 1: Passes
Program 2: Passes


In [8]:
# Example usage
reference_program = """
def zip_list(lists1, lists2):
    return [list1 + list2 for list1, list2 in zip(lists1, lists2)]
"""

generated_programs = [
    """
def zip_list(lists1, lists2):
    return [l1 + l1 + l2 for l1, l2 in zip(lists1, lists2)]
    """,
    """
def zip_list(lists1, lists2):
    result = []
    for list1, list2 in zip(lists1, lists2):
        result.append(list1 + list2)
    return result
    """
]

# Sample test cases from the dataset
test_cases = [
    "assert zip_list([[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]) == [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]"
]

run_differential_testing(reference_program, generated_programs, test_cases)

Mismatch for inputs [[[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]]:
Reference output: [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]
Candidate output: [[1, 3, 1, 3, 2, 4], [5, 7, 5, 7, 6, 8], [9, 11, 9, 11, 10, 12, 14]]

Test Results:
Program 1: Fails
Failed Tests:
  Inputs: [[[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]]
  Reference Output: [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]
  Candidate Output: [[1, 3, 1, 3, 2, 4], [5, 7, 5, 7, 6, 8], [9, 11, 9, 11, 10, 12, 14]]
Program 2: Passes


### MUS Loop

In [18]:
from prompting import prompt_code_generation, prompt_code_generation_artificial_entropy, prompt_requirement_repair


class MUSAccuracyEvaluator:
    def __init__(self, openai_api_key, differential_tester, model="gpt-4"):
        self.openai_api_key = openai_api_key
        self.differential_tester = differential_tester
        self.model = model
        
        # Initialize result tracking
        self.total_runs = 0
        self.successful_runs = 0
        self.run_details = []

    def generate_programs(self, requirements):
        try:
            print("GENERATE PROGRAMS NO ENTROPY")
            response = openai.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an assistant that generates Python code based on specifications."},
                    {"role": "user", "content": prompt_code_generation(requirements)}
                ],
                temperature=0.7,
                max_tokens=300
            )
            generated_program = response.choices[0].message.content
            return generated_program.strip()
        except openai.OpenAIError as e:
            print(f"Error interacting with OpenAI API: {e}")
            return []
        
    def generate_programs_artificial_entropy(self, requirements, previously_generated_programs):
        try:
            print("GENERATE PROGRAMS WITH ENTROPY")
            response = openai.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an assistant that generates Python code based on specifications."},
                    {"role": "user", "content": prompt_code_generation_artificial_entropy(requirements, previously_generated_programs)}
                ],
                temperature=0.7,
                max_tokens=300
            )
            generated_program = response.choices[0].message.content
            return generated_program.strip()
        except openai.OpenAIError as e:
            print(f"Error interacting with OpenAI API: {e}")
            return []

    def refine_requirements(self, requirements, counterexample):
        try:
            response = openai.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an assistant who reads code specifications and repairs them."},
                    {"role": "user", "content": prompt_requirement_repair(requirements, counterexample)}
                ],
                temperature=0.7,
                max_tokens=300
            )
            refined_requirements = response.choices[0].message.content
            return [refined_requirements.strip()]
        except openai.OpenAIError as e:
            print(f"Error interacting with OpenAI API: {e}")
            return []

    def compute_mus(self, program, initial_requirements, tests, task_id, N, max_iterations=10, entropy=False):
        self.total_runs += 1
        requirements = initial_requirements
        try:
            for iteration in range(max_iterations):
                print("REQUIREMENTS:")
                print(requirements)
                # Generate programs (currently set to N=1 for testing speed)
                print(f"GENERATED PROGRAMS FOR ITERATION {iteration}:")
                generated_programs = []
                for i in range(N):
                    if (entropy):
                        generated_programs.append(self.generate_programs_artificial_entropy(requirements, generated_programs))
                    else:
                        generated_programs.append(self.generate_programs(requirements))
                    print(generated_programs[i])
                
                
                # Check for inconsistencies
                inconsistencies = self.differential_tester(program, generated_programs, tests)
                print("INCONSISTENCIES: ", inconsistencies)
                print("CONDITION: ", not any(info['failed_tests'] for info in inconsistencies.values()))
                if not (any(info['failed_tests'] for info in inconsistencies.values())):
                    self.successful_runs += 1
                    self.run_details.append({
                        'task_id': task_id,
                        'initial_requirements': initial_requirements,
                        'iterations_to_success': iteration + 1,
                        'success': True
                    })
                    print("NO FAILED TESTS: SUCCESS")
                    return requirements
                else:
                    # Identify a counterexample
                    # counterexample = random.choice(inconsistencies.values())
                    counterexample = random.choice(inconsistencies['Program 1']['failed_tests']) if not inconsistencies['Program 1']['passes'] else None
                    print("COUNTEREXAMPLE:", counterexample)
                    requirements = self.refine_requirements(requirements, counterexample)


            # If max iterations reached
            self.run_details.append({
                'task_id': task_id,
                'initial_requirements': initial_requirements,
                'iterations_to_success': max_iterations,
                'success': False
            })
            return requirements
        except Exception as e:
            print('EXCEPTION THROWN: ', e)
            # If max iterations reached
            self.run_details.append({
                'task_id': task_id,
                'initial_requirements': initial_requirements,
                'iterations_to_success': max_iterations,
                'success': False
            })
            return requirements

    def calculate_accuracy(self):
        """Calculate and print accuracy metrics"""
        accuracy = self.successful_runs / self.total_runs if self.total_runs > 0 else 0
        
        print("\n--- MUS Computation Accuracy ---")
        print(f"Total Runs: {self.total_runs}")
        print(f"Successful Runs: {self.successful_runs}")
        print(f"Accuracy: {accuracy:.2%}")
        
        # Convert run details to DataFrame for further analysis
        df = pd.DataFrame(self.run_details)
        
        # Additional insights
        if not df.empty:
            print("\nAdditional Insights:")
            print("Success Rate by Iterations:")
            iterations_success = df.groupby('iterations_to_success')['success'].mean()
            print(iterations_success)
        
        return accuracy, df
    
    def get_probability(self):
        return self.successful_runs / self.total_runs if self.total_runs > 0 else 0
    
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

mus_accuracy_evaluator = MUSAccuracyEvaluator(
    openai_api_key=openai_api_key,
    differential_tester=differential_tester,
    model="gpt-4"
)

### Dataset config

In [31]:
splits = {'train': 'full/train-00000-of-00001.parquet', 'test': 'full/test-00000-of-00001.parquet', 'validation': 'full/validation-00000-of-00001.parquet', 'prompt': 'full/prompt-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/google-research-datasets/mbpp/" + splits["train"])

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
print(df.head())

   task_id                                               text  \
0      601  Write a function to find the longest chain whi...   
1      602  Write a python function to find the first repe...   
2      603  Write a function to get a lucid number smaller...   
3      604  Write a function to reverse words in a given s...   
4      605  Write a function to check if the given integer...   

                                                code  \
0  class Pair(object): \r\n\tdef __init__(self, a...   
1  def first_repeated_char(str1):\r\n  for index,...   
2  def get_ludic(n):\r\n\tludics = []\r\n\tfor i ...   
3  def reverse_words(s):\r\n        return ' '.jo...   
4  def prime_num(num):\r\n  if num >=1:\r\n   for...   

                                           test_list test_setup_code  \
0  [assert max_chain_length([Pair(5, 24), Pair(15...                   
1  [assert first_repeated_char("abcabc") == "a", ...                   
2  [assert get_ludic(10) == [1, 2, 3, 5, 7], asse...    

In [40]:
!pip3 install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet
# os.kill(os.getpid(), 9)

Trivial case: The LLM solves it in the first iteration and we break.

In [None]:
requirements = "Write a Python function to add two numbers together."

reference_program = """
def add(a: int, b: int) -> int:
    return a + b
"""

test_cases = [
    "assert add(1, 2) == 3",
    "assert add(3, 10) == 13",
    "assert add(-5, 5) == 0"
]

print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 3, 3))


REQUIREMENTS:
Write a Python function to add two numbers together.
GENERATED PROGRAMS FOR ITERATION 0:
def add_two_numbers(num1, num2):
    return num1 + num2
def add_two_numbers(num1, num2):
    return num1 + num2
def add_two_numbers(num1, num2):
    return num1 + num2
INCONSISTENCIES:  {'Program 1': {'passes': True, 'failed_tests': []}, 'Program 2': {'passes': True, 'failed_tests': []}, 'Program 3': {'passes': True, 'failed_tests': []}}
CONDITION:  True
NO FAILED TESTS: SUCCESS
Write a Python function to add two numbers together.


In [57]:
requirements = "Write a function to reverse words in a given string."

reference_program = """
def reverse_words(s): 
    return ' '.join(reversed(s.split()))
"""

test_cases = [ 
    "assert reverse_words(\"python program\")==(\"program python\")", 
    "assert reverse_words(\"java language\")==(\"language java\")", 
    "assert reverse_words(\"indian man\")==(\"man indian\")" 
]

print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3))

REQUIREMENTS:
Write a function to reverse words in a given string.
GENERATED PROGRAMS FOR ITERATION 0:
def reverse_words(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
def reverse_words(s):
    return ' '.join(s.split()[::-1])
INCONSISTENCIES:  {'Program 1': {'passes': True, 'failed_tests': []}, 'Program 2': {'passes': True, 'failed_tests': []}, 'Program 3': {'passes': True, 'failed_tests': []}, 'Program 4': {'passes': True, 'failed_tests': []}, 'Program 5

We see above that despite being an ambiguous problem, our LLM has 0 entropy despite N=10.

```
def reverse_words(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
def reverse_words(s):
    return ' '.join(s.split()[::-1])
```

This is an issue. Generating 10 different answers for an ambiuguous problem is not useful if they are all the same, so I have tried passing the previously generated code into the prompt and asking it to generate using something different.

In [15]:
requirements = "Write a function to reverse words in a given string."

reference_program = """
def reverse_words(s): 
    return ' '.join(reversed(s.split()))
"""

test_cases = [ 
    "assert reverse_words(\"python program\")==(\"program python\")", 
    "assert reverse_words(\"java language\")==(\"language java\")", 
    "assert reverse_words(\"indian man\")==(\"man indian\")" 
]

print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3, True))

REQUIREMENTS:
Write a function to reverse words in a given string.
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGEAMS WITH ENTROPY
```python
def reverse_words(input_string):
    return ' '.join(input_string.split()[::-1])
```
GENERATE PROGEAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = []
    for word in words:
        reversed_words.insert(0, word)
    return ' '.join(reversed_words)
```
GENERATE PROGEAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = [''] * len(words)
    for i in range(len(words)):
        reversed_words[-(i+1)] = words[i]
    return ' '.join(reversed_words)
```
GENERATE PROGEAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    return ' '.join(words[i] for i in range(len(words)-1, -1, -1))
```
GENERATE PROGEAMS WITH ENTROPY
```python
def reverse_words(input_string):
    stack = []
  

steps:
1. Summarize program precisely
2. check difference between program summary and initial requirements


This method clearly works better – adding the previous generate programs in to the prompt. The prompt I tried was:
```
Here are some requirements:\n{requirements}
Generate a program that adheres to these requirements. 
Please do not generate the exact same code as any of the programs here: {previously_generated_programs}.
Do not base your logic on anything besides the requirements given, including previously generated programs.
Do not output any text at all besides code. I do NOT want an explanation nor a preamble. Just code.
The first line of your response should be the function signature."""
```

The result, with N = 10:

GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    return ' '.join(input_string.split()[::-1])
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = []
    for word in words:
        reversed_words.insert(0, word)
    return ' '.join(reversed_words)
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = [''] * len(words)
    for i in range(len(words)):
        reversed_words[-(i+1)] = words[i]
    return ' '.join(reversed_words)
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    return ' '.join(words[i] for i in range(len(words)-1, -1, -1))
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    stack = []
    word = ''
    for char in input_string:
        if char == ' ':
            stack.append(word)
            word = ''
        else:
            word += char
    if word:
        stack.append(word)
    reversed_string = ''
    while stack:
        reversed_string += stack.pop() + ' '
    return reversed_string.rstrip()
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = list(map(lambda x: words[-(x+1)], range(len(words))))
    return ' '.join(reversed_words)
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    reversed_words = input_string.split(' ')
    start, end = 0, len(reversed_words) - 1
    while start < end:
        reversed_words[start], reversed_words[end] = reversed_words[end], reversed_words[start]
        start, end = start + 1, end - 1
    return ' '.join(reversed_words)
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    word_list = input_string.split(' ')
    reversed_word_list = [word_list.pop() for _ in range(len(word_list))]
    return ' '.join(reversed_word_list)
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = ' '.join(words[len(words) - i - 1] for i in range(len(words)))
    return reversed_words
```
GENERATE PROGRAMS WITH ENTROPY
```python
def reverse_words(input_string):
    words = input_string.split(' ')
    reversed_words = ' '.join(list(reversed(words)))
    return reversed_words
```
INCONSISTENCIES:  {'Program 1': {'passes': True, 'failed_tests': []}, 'Program 2': {'passes': True, 'failed_tests': []}, 'Program 3': {'passes': True, 'failed_tests': []}, 'Program 4': {'passes': True, 'failed_tests': []}, 'Program 5': {'passes': True, 'failed_tests': []}, 'Program 6': {'passes': True, 'failed_tests': []}, 'Program 7': {'passes': True, 'failed_tests': []}, 'Program 8': {'passes': True, 'failed_tests': []}, 'Program 9': {'passes': True, 'failed_tests': []}, 'Program 10': {'passes': True, 'failed_tests': []}}
CONDITION:  True
NO FAILED TESTS: SUCCESS
Write a function to reverse words in a given string.
```

Only reason we may not want to do this is that we give the LLM some extra information about previous ways the code was written, instead of just the requirements. This could skew how ambiguous the requirements seem, but I've tried to account for this in the prompt engineering

In [None]:
requirements = "Write a function to reverse words in a given string."

reference_program = """
def reverse_words(s): 
    return ' '.join(reversed(s.split()))
"""

test_cases = [ 
    "assert reverse_words(\"python program\")==(\"program python\")", 
    "assert reverse_words(\"java language\")==(\"language java\")", 
    "assert reverse_words(\"indian man\")==(\"man indian\")" 
]

mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3)
mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3, True)

REQUIREMENTS:
Write a function to reverse words in a given string.
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s: str) -> str:
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(input_string):
    return ' '.join(input_string.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    words = s.split(' ')
    reversed_words = ' '.join(words[::-1])
    return reversed_words
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(input_string: str) -> str:
    return ' '.join(input_string.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words(s):
    return ' '.join(s.split()[::-

As we can see adding entropy makes more mistakes, but this is the desired outcome if our prompt is ambiguous.

In comparison to no added entropy where we luckily have the correct result first, but calculating N times is useless and essentially N = 1.

In [21]:
requirements = "Write a function to reverse words in a given string."

reference_program = """
def reverse_words(s): 
    return ' '.join(reversed(s.split()))
"""

test_cases = [ 
    "assert reverse_words(\"python program\")==(\"program python\")", 
    "assert reverse_words(\"java language\")==(\"language java\")", 
    "assert reverse_words(\"indian man\")==(\"man indian\")" 
]

mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3)
pwe = mus_accuracy_evaluator.get_probability()
mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 10, 3, True)

print("probability without entropy: ", pwe)
print("probability with entropy: ", mus_accuracy_evaluator.get_probability());

REQUIREMENTS:
Write a function to reverse words in a given string.
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s: str) -> str:
    words = s.split()
    reversed_words = ' '.join(reversed(words))
    return reversed_words
GENERATE PROGRAMS NO ENTROPY
def reverse_words(s: str) -> str:
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words(s):
    return ' '.join(s.split()[::-1])
GENERATE PROGRAMS NO ENTROPY
def reverse_words_in_string(s: s

Now look into "Implement a function that sorts a given array while removing consecutive duplicates"

This is a particularly difficult case for GPT to figure out – it often forgets the consecutive part

In [28]:

requirements = "Implement a function that sorts a given array while removing consecutive duplicates"

reference_program = """
def sort_remove_duplicates(arr):
    if len(arr) == 0:
        return arr
    else:
        new_arr = [arr[0]]
        for i in range(1, len(arr)):
            if arr[i] != new_arr[-1]:
                new_arr.append(arr[i])
                
        new_arr.sort()
        return new_arr
"""

test_cases = [
    "assert sort_remove_duplicates([5, 2, 3, 3]) == [2, 3, 5]",
    "assert sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]"
    "assert sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]"
]

# no added entropy
print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 3, 5))


REQUIREMENTS:
Implement a function that sorts a given array while removing consecutive duplicates
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS NO ENTROPY
def sort_remove_consecutive_duplicates(arr):
    return [key for key, _ in groupby(sorted(arr))]
GENERATE PROGRAMS NO ENTROPY
def sort_remove_duplicates(arr):
    return sorted(set(arr), key=arr.index)
GENERATE PROGRAMS NO ENTROPY
def sort_remove_consecutive_duplicates(arr):
    return [v for i, v in enumerate(sorted(arr)) if i == 0 or v != arr[i-1]]
Error parsing test case sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]: closing parenthesis ')' does not match opening parenthesis '[' (<unknown>, line 1)
Error testing Program 1 with inputs [[5, 2, 3, 3]]:
Traceback (most recent call last):
  File "/var/folders/3s/r6p4g2012bb3t1dp2c_gm3rm0000gn/T/ipykernel_11610/3104096123.py", line 48, in differential_tester
    cand_output = candidate_func(*args)
                  ^^^^^

Solved after 2 iterations with no added entropy as seen above

```
REQUIREMENTS:
Implement a function that sorts a given array while removing consecutive duplicates
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS NO ENTROPY
def sort_remove_consecutive_duplicates(arr):
    return [key for key, _ in groupby(sorted(arr))]
GENERATE PROGRAMS NO ENTROPY
def sort_remove_duplicates(arr):
    return sorted(set(arr), key=arr.index)
GENERATE PROGRAMS NO ENTROPY
def sort_remove_consecutive_duplicates(arr):
    return [v for i, v in enumerate(sorted(arr)) if i == 0 or v != arr[i-1]]
Error parsing test case sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]: closing parenthesis ')' does not match opening parenthesis '[' (<unknown>, line 1)
Error testing Program 1 with inputs [[5, 2, 3, 3]]:
Traceback (most recent call last):
  File "/var/folders/3s/r6p4g2012bb3t1dp2c_gm3rm0000gn/T/ipykernel_11610/3104096123.py", line 48, in differential_tester
    cand_output = candidate_func(*args)
                  ^^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 2, in sort_remove_consecutive_duplicates
NameError: name 'groupby' is not defined

Mismatch for inputs [[5, 2, 3, 3]]:
Reference output: [2, 3, 5]
Candidate output: [5, 2, 3]
Mismatch for inputs [[5, 2, 3, 3]]:
Reference output: [2, 3, 5]
Candidate output: [2, 3, 3, 5]
INCONSISTENCIES:  {'Program 1': {'passes': False, 'failed_tests': [{'inputs': [[5, 2, 3, 3]], 'error': "name 'groupby' is not defined"}]}, 'Program 2': {'passes': False, 'failed_tests': [{'inputs': [[5, 2, 3, 3]], 'reference_output': [2, 3, 5], 'candidate_output': [5, 2, 3]}]}, 'Program 3': {'passes': False, 'failed_tests': [{'inputs': [[5, 2, 3, 3]], 'reference_output': [2, 3, 5], 'candidate_output': [2, 3, 3, 5]}]}}
CONDITION:  False
COUNTEREXAMPLE: {'inputs': [[5, 2, 3, 3]], 'error': "name 'groupby' is not defined"}
REQUIREMENTS:
["Implement a function that sorts a given array and removes consecutive duplicate values. The function takes an array as an input, sorts it in ascending order, and in the process, eliminates any repetitive values that are sequentially identical. It is important to note that the function should be developed in a way that it does not rely on the 'groupby' function, as it may not be defined in all contexts. The function should be capable of handling an array of integers, as demonstrated by the example input: [5, 2, 3, 3]. The expected output for this input would be a sorted array with duplicate consecutive numbers removed: [2, 3, 5]."]
GENERATED PROGRAMS FOR ITERATION 1:
GENERATE PROGRAMS NO ENTROPY
def sort_and_remove_duplicates(array):
    array.sort()
    return [array[i] for i in range(len(array)) if i == 0 or array[i] != array[i-1]]
GENERATE PROGRAMS NO ENTROPY
def sort_and_remove_duplicates(arr):
    arr = sorted(set(arr))
    return arr
GENERATE PROGRAMS NO ENTROPY
def sort_and_remove_duplicates(arr):
    arr = sorted(set(arr))
    return [arr[i] for i in range(len(arr)) if i == 0 or arr[i] != arr[i - 1]]
Error parsing test case sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]: closing parenthesis ')' does not match opening parenthesis '[' (<unknown>, line 1)
INCONSISTENCIES:  {'Program 1': {'passes': True, 'failed_tests': []}, 'Program 2': {'passes': True, 'failed_tests': []}, 'Program 3': {'passes': True, 'failed_tests': []}}
CONDITION:  True
NO FAILED TESTS: SUCCESS
["Implement a function that sorts a given array and removes consecutive duplicate values. The function takes an array as an input, sorts it in ascending order, and in the process, eliminates any repetitive values that are sequentially identical. It is important to note that the function should be developed in a way that it does not rely on the 'groupby' function, as it may not be defined in all contexts. The function should be capable of handling an array of integers, as demonstrated by the example input: [5, 2, 3, 3]. The expected output for this input would be a sorted array with duplicate consecutive numbers removed: [2, 3, 5]."]
```

In [29]:

requirements = "Implement a function that sorts a given array while removing consecutive duplicates"

reference_program = """
def sort_remove_duplicates(arr):
    if len(arr) == 0:
        return arr
    else:
        new_arr = [arr[0]]
        for i in range(1, len(arr)):
            if arr[i] != new_arr[-1]:
                new_arr.append(arr[i])
                
        new_arr.sort()
        return new_arr
"""

test_cases = [
    "assert sort_remove_duplicates([5, 2, 3, 3]) == [2, 3, 5]",
    "assert sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]"
    "assert sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]"
]

#added entropy
print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 0, 3, 5, True))


REQUIREMENTS:
Implement a function that sorts a given array while removing consecutive duplicates
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_and_remove_consecutives(arr):
    return [key for key, group in groupby(sorted(arr))]

from itertools import groupby
```
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_and_remove_consecutives(arr):
    sorted_arr = sorted(arr)
    result = [sorted_arr[i] for i in range(len(sorted_arr)) if i == 0 or sorted_arr[i] != sorted_arr[i-1]]
    return result
```
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_and_remove_consecutives(arr):
    result = []
    arr.sort()
    for i in arr:
        if len(result) == 0 or result[-1] != i:
            result.append(i)
    return result
```
Error parsing test case sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]: closing parenthesis ')' does not match opening parenthesis '[' (<unknown>, line 1)
Error testing Pr

With added entropy our algorithm also figures out the problem (with some variance, sometimes 1 iteration sometimes 0):

REQUIREMENTS:
Implement a function that sorts a given array while removing consecutive duplicates
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_remove_duplicates(arr):
    return [x for i, x in enumerate(sorted(set(arr))) if i == 0 or x != sorted(set(arr))[i-1]]
```
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_remove_consecutive_duplicates(arr):
    if not arr:
        return []

    sorted_arr = sorted(arr)
    result = [sorted_arr[0]]

    for i in range(1, len(sorted_arr)):
        if sorted_arr[i] != sorted_arr[i-1]:
            result.append(sorted_arr[i])

    return result
```
GENERATE PROGRAMS WITH ENTROPY
```python
def sort_and_remove_duplicates(arr):
    result = []
    arr.sort()
    for i in arr:
        if not result or result[-1] != i:
            result.append(i)
    return result
```
Error parsing test case sort_remove_duplicates([5, 2, 3, 2, 3]) == [2, 2, 3, 3, 5]sort_remove_duplicates([1, 2, 1]) == [1, 1, 2]: closing parenthesis ')' does not match opening parenthesis '[' (<unknown>, line 1)
INCONSISTENCIES:  {'Program 1': {'passes': True, 'failed_tests': []}, 'Program 2': {'passes': True, 'failed_tests': []}, 'Program 3': {'passes': True, 'failed_tests': []}}
CONDITION:  True
NO FAILED TESTS: SUCCESS
Implement a function that sorts a given array while removing consecutive duplicates


In [None]:
# Example usage
requirements = """Write a function to zip two given lists of lists."""

reference_program = """
def zip_list(lists1, lists2):
    return [list1 + list2 for list1, list2 in zip(lists1, lists2)]
"""

generated_programs = [
    """
def zip_list(lists1, lists2):
    return [l1 + l2 for l1, l2 in zip(lists1, lists2)]
    """,
    """
def zip_list(lists1, lists2):
    result = []
    for list1, list2 in zip(lists1, lists2):
        result.append(list1 + list2)
    return result
    """
]

# Sample test cases from the dataset
test_cases = [
    "assert zip_list([[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]) == [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]"
]

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

mus_accuracy_evaluator = MUSAccuracyEvaluator(
    openai_api_key=openai_api_key,
    differential_tester=differential_tester,
    model="gpt-4"
)

print(mus_accuracy_evaluator.compute_mus(reference_program, requirements, test_cases, 3, 2))

REQUIREMENTS:
Write a function to zip two given lists of lists.
GENERATED PROGRAMS FOR ITERATION 0:
['def zip_lists(list1, list2):\n    return [list(a) for a in zip(list1, list2)]']
Mismatch for inputs [[[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]]:
Reference output: [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]]
Candidate output: [[[1, 3], [2, 4]], [[5, 7], [6, 8]], [[9, 11], [10, 12, 14]]]
INCONSISTENCIES:  {'Program 1': {'passes': False, 'failed_tests': [{'inputs': [[[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]], 'reference_output': [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]], 'candidate_output': [[[1, 3], [2, 4]], [[5, 7], [6, 8]], [[9, 11], [10, 12, 14]]]}]}}
CONDITION:  False
COUNTEREXAMPLE: {'inputs': [[[1, 3], [5, 7], [9, 11]], [[2, 4], [6, 8], [10, 12, 14]]], 'reference_output': [[1, 3, 2, 4], [5, 7, 6, 8], [9, 11, 10, 12, 14]], 'candidate_output': [[[1, 3], [2, 4]], [[5, 7], [6, 8]], [[9, 11], [10, 12, 14]]]}
REQUIREMENTS:
['Write a function 

### Dataset testing

Note that I've found some pretty terrible attributes of this dataset: for example, the second row has a typo in the requirements which makes it impossible for anyone including humans to solve the problem (ludic number -> lucid number, which doesnt exist).

In [None]:

# Iterate through dataset and run MUS computation
for index, row in df.iterrows():
    task_id = row['task_id']
    initial_requirements = row['text']
    program = row['code']
    tests = row['test_list']

    print(f"\nProcessing Task ID: {task_id}")
    mus_accuracy_evaluator.compute_mus(
        program, 
        initial_requirements, 
        tests,
        task_id,
        3,
        5
    )

# Calculate and display accuracy
accuracy, results_df = mus_accuracy_evaluator.calculate_accuracy()
print(accuracy)
print(results_df)


Processing Task ID: 601
REQUIREMENTS:
Write a function to find the longest chain which can be formed from the given set of pairs.
GENERATED PROGRAMS FOR ITERATION 0:
GENERATE PROGRAMS NO ENTROPY
def longest_chain(pairs):
    pairs.sort(key=lambda x: x[1])
    current, length = float('-inf'), 0
    for pair in pairs:
        if current < pair[0]:
            current = pair[1]
            length += 1
    return length
GENERATE PROGRAMS NO ENTROPY
def longest_chain(pairs):
    pairs.sort(key=lambda x: x[1])
    current, length = float('-inf'), 0
    for pair in pairs:
        if current < pair[0]:
            current = pair[1]
            length += 1
    return length
GENERATE PROGRAMS NO ENTROPY
def longest_chain(pairs):
    pairs.sort(key=lambda x: x[1])
    current, length = float('-inf'), 0
    for pair in pairs:
        if current < pair[0]:
            current = pair[1]
            length += 1
    return length
Error parsing test case max_chain_length([Pair(5, 24), Pair(15, 25),Pai