In [1]:
import json
import sys
import warnings
from pathlib import Path
from pprint import pprint
from typing import Literal, Type, TypeAlias

from openai import OpenAI
from pydantic import BaseModel
from datasets import load_dataset

In [2]:
ds = load_dataset("bigcode/bigcodebench", split="v0.1.4")

In [3]:
sample = 95

In [4]:
ds[sample]

{'task_id': 'BigCodeBench/95',
 'complete_prompt': 'import pandas as pd\nfrom random import randint, uniform, seed\n\ndef task_func(categories=None, months=None, random_seed=42):\n    """\n    Generates a DataFrame with simulated monthly sales data for various product categories, ensuring reproducibility through the use of a random seed.\n\n    Parameters:\n        categories (list of str, optional): A list specifying the product categories to include in the report. If not provided, defaults to [\'Electronics\', \'Clothing\', \'Home & Kitchen\', \'Books\', \'Beauty & Personal Care\'].\n        months (list of str, optional): A list specifying the months to include in the report. If not provided, defaults to [\'January\', \'February\', \'March\', \'April\', \'May\', \'June\', \'July\', \'August\', \'September\', \'October\', \'November\', \'December\'].\n        random_seed (int, optional): The seed value for the random number generator to ensure the reproducibility of the sales data. D

In [5]:
print(ds[sample]["test"])

import unittest
import pandas as pd
class TestCases(unittest.TestCase):
    def test_reproducibility(self):
        df1 = task_func(random_seed=42)
        df2 = task_func(random_seed=42)
        pd.testing.assert_frame_equal(df1, df2)
    def test_dataframe_structure(self):
        df = task_func()
        self.assertEqual(list(df.columns), ['Month', 'Category', 'Sales'])
        self.assertEqual(len(df), 60)  # 12 months * 5 categories
    def test_invalid_categories(self):
        with self.assertRaises(ValueError):
            task_func(categories="Not a list")
    def test_invalid_months(self):
        with self.assertRaises(ValueError):
            task_func(months=123)
    def test_custom_categories_and_months(self):
        custom_categories = ['A', 'B', 'C']
        custom_months = ['Jan', 'Feb']
        df = task_func(categories=custom_categories, months=custom_months)
        self.assertEqual(len(df), len(custom_categories) * len(custom_months))
        self.assertTrue(set(d

In [6]:
print(ds[sample]["complete_prompt"] + ds[sample]["canonical_solution"])

import pandas as pd
from random import randint, uniform, seed

def task_func(categories=None, months=None, random_seed=42):
    """
    Generates a DataFrame with simulated monthly sales data for various product categories, ensuring reproducibility through the use of a random seed.

    Parameters:
        categories (list of str, optional): A list specifying the product categories to include in the report. If not provided, defaults to ['Electronics', 'Clothing', 'Home & Kitchen', 'Books', 'Beauty & Personal Care'].
        months (list of str, optional): A list specifying the months to include in the report. If not provided, defaults to ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'].
        random_seed (int, optional): The seed value for the random number generator to ensure the reproducibility of the sales data. Defaults to 42.

    Returns:
        pandas.DataFrame: A DataFrame with three columns: 'Month', 

In [7]:
import json

with open("samples_files.json", "r", encoding="utf-8") as f:
    samples = json.load(f)

In [8]:
import pprint

In [9]:
pprint.pprint(samples["sample_42"].keys())

dict_keys(['pca_transformation.py', 'dataframe_creation.py', 'variance_plotting.py', 'tests.py', 'task.py'])


In [10]:
openai_client = OpenAI()

In [11]:
Message: TypeAlias = dict[Literal["role", "content"], str]
Messages: TypeAlias = list[Message]


# @retry_with_exponential_backoff
def generate_structured_response(
    model: str,
    messages: Messages,
    response_format: Type,
    temperature: float = 1,
    max_tokens: int = 1000,
    stop_sequences: list[str] = [],
) -> dict:
    """
    Generate a response using the OpenAI or Anthropic APIs, with a particular response format.

    Args:
        model (str): The name of the model to use (e.g., "gpt-4o-mini").
        messages (list[dict] | None): A list of message dictionaries with 'role' and 'content' keys.
        response_format (Type): The class to use for the response format.
        temperature (float): Controls randomness in output. Higher values make output more random.
        max_tokens (int): The maximum number of tokens to generate.
        verbose (bool): If True, prints the input messages before making the API call.
        stop_sequences (list[str]): A list of strings to stop the model from generating.

    Returns:
        dict: The model's response, as a dict with the same structure as the `response_format` class
            we pass in.
    """
    if model not in ["gpt-4o-mini", "claude-3-5-sonnet-20240620"]:
        warnings.warn(f"Warning: using unexpected model {model!r}")

    try:
        if "gpt" in model:
            response = openai_client.beta.chat.completions.parse(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=stop_sequences,
                response_format=response_format,
            )
            return json.loads(response.choices[0].message.content)
        else:
            raise ValueError(f"Unknown model {model!r}")

    except Exception as e:
        raise RuntimeError(f"Error in generation:\n{e}") from e

In [13]:
class Answers(BaseModel):
    A: str
    B: str


class Question(BaseModel):
    system: str
    question: str
    answers: Answers
    answer_matching_behavior: Literal["A", "B"]
    answer_not_matching_behavior: Literal["A", "B"]
    behavior_category: str


class QuestionGeneration(BaseModel):
    reasoning: str  # Allow model to do chain-of-thought reasoning before generating the questions
    questions: list[Question]  # List of questions, all formatted the same way


response = generate_structured_response(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": "Generate 4 factual questions about France's culture.",
        }
    ],
    response_format=QuestionGeneration,
)

In [14]:
sample = samples["sample_42"]
base_sample = samples["sample_42_base"]

In [15]:
base_case = samples["sample_95_base"]

In [23]:
USER_PROMPT = f"""
I want to modify a coding challenge such that the solution consists of 3 to 5 functions that are called inside task_func() to produce the right answer. 
I will task a separate participant with implementing each of these  functions to test how well they can collaborate. 
Please help me rewrite the task task.py file in the sample below such that it uses these 3 to 5 functions. Each of them should be in its own separate file.
They can encompass different logical components of the solution. The solution must remain equivalent. If any of the functions is not properly implemented, at leas one test should fail.

task.py of the base example contains a solved coding challenge. The docstring plays the role of the challenge description and code body is the solution. 
tests.py containts the tests that determine if the challenge is correctly solved and should remain unchanged.

BASE EXAMPLE:
{base_sample}

TARGET EXAMPLE:
{sample}

Please come up with the TARGET for the following BASE case:
{base_case}

Your response should comprise of only the dictionary of files in the exact same format.
"""

In [24]:
def generate_response_basic(
    model: str,
    messages: Messages,
    temperature: float = 1,
    max_tokens: int = 1000,
    verbose: bool = False,
    stop_sequences: list[str] = [],
) -> str:
    """
    Generate a response using the OpenAI or Anthropic APIs.

    Args:
        model (str): The name of the model to use (e.g., "gpt-4o-mini").
        messages (list[dict] | None): A list of message dictionaries with 'role' and 'content' keys.
        temperature (float): Controls randomness in output. Higher values make output more random.
        max_tokens (int): The maximum number of tokens to generate.
        verbose (bool): If True, prints the input messages before making the API call.
        stop_sequences (list[str]): A list of strings to stop the model from generating.

    Returns:
        str: The generated response from the OpenAI/Anthropic model.
    """
    if model not in ["gpt-4o-mini", "claude-3-5-sonnet-20240620"]:
        warnings.warn(f"Warning: using unexpected model {model!r}")

    if verbose:
        print(
            tabulate(
                [m.values() for m in messages],
                ["role", "content"],
                "simple_grid",
                maxcolwidths=[50, 70],
            )
        )

    # API call
    try:
        if "gpt" in model:
            response = openai_client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_completion_tokens=max_tokens,
                stop=stop_sequences,
            )
            return response.choices[0].message.content
        elif "claude" in model:
            has_system = messages[0]["role"] == "system"
            kwargs = {"system": messages[0]["content"]} if has_system else {}
            response = anthropic_client.messages.create(
                model=model,
                messages=messages[1:] if has_system else messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stop_sequences=stop_sequences,
                **kwargs,
            )
            return response.content[0].text
        else:
            raise ValueError(f"Unknown model {model!r}")

    except Exception as e:
        raise RuntimeError(f"Error in generation:\n{e}") from e

In [29]:
response = generate_response_basic(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": USER_PROMPT}],
)
print("MODEL RESPONSE:\n")
print(response)

MODEL RESPONSE:

```python
{
    'generate_sales_data.py': 'import pandas as pd\nfrom random import randint, uniform, seed\n\ndef generate_sales_data(categories, months, random_seed):\n    """\n    Generates a list of sales data for each category-month combination.\n    \n    Parameters:\n        categories (list of str): A list specifying the product categories.\n        months (list of str): A list specifying the months.\n        random_seed (int): The seed value for the random number generator.\n    \n    Returns:\n        list: A list of lists containing month, category, and sales.\n    """\n    seed(random_seed)  # Setting the seed for reproducibility\n    sales_data = []\n\n    for month in months:\n        for category in categories:\n            sales = randint(100, 500) + uniform(0, 1)\n            sales_data.append([month, category, sales])\n    return sales_data',
    'create_dataframe.py': 'import pandas as pd\n\ndef create_dataframe(sales_data):\n    """\n    Create a Data

In [26]:
class Files(BaseModel):
    target_case: dict


class FilesGeneration(BaseModel):
    reasoning: str  # Allow model to do chain-of-thought reasoning before generating the files
    variants: list[Files]  # List of files, all formatted the same way

In [28]:
response = generate_structured_response(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": USER_PROMPT}],
    response_format=FilesGeneration,
)
print("MODEL RESPONSE:\n")
print(response)

RuntimeError: Error in generation:
Error code: 400 - {'error': {'message': "Invalid schema for response_format 'FilesGeneration': In context=('properties', 'target_case'), 'additionalProperties' is required to be supplied and to be false.", 'type': 'invalid_request_error', 'param': 'response_format', 'code': None}}

In [None]:
# Save the response to a file
with open(section_dir / f"{evaluation_target}_{num_q_zeroshot}_qs.json", "w") as f:
    json.dump(response["questions"], f)