In [1]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Load environment variables before the client is constructed.
# Presumably this is where the API credentials come from - confirm against the local .env setup.
load_dotenv()

# Notebook-wide client and model name; chat() reads both as globals.
client = Anthropic()
# NOTE(review): confirm this identifier is a model name the API accepts.
model = "claude-4-5-sonnet"

In [4]:
def add_user_message(messages, text):
    """Append a user turn carrying `text` to the `messages` list (mutated in place)."""
    messages.append({"role": "user", "content": text})

def add_assistant_message(messages, text):
    """Append an assistant turn carrying `text` to the `messages` list (mutated in place)."""
    messages.append({"role": "assistant", "content": text})

def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of its reply.

    Uses the notebook-level `client` and `model` globals.

    Args:
        messages: List of {"role": ..., "content": ...} dicts, such as the
            ones built by add_user_message / add_assistant_message.
        system: Optional system prompt. Left out of the request when falsy.
        temperature: Sampling temperature forwarded with the request.
        stop_sequences: Optional list of stop strings. Left out of the
            request when None or empty.

    Returns:
        The `text` of the first content block in the response, or an empty
        string when the response carries no content blocks.
    """
    # `stop_sequences` defaults to None rather than [] so no mutable object
    # is shared between calls.
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
    }
    if system:
        params["system"] = system
    if stop_sequences:
        params["stop_sequences"] = stop_sequences

    response = client.messages.create(**params)
    # Guard against an empty content list so callers get "" instead of an
    # unexplained IndexError.
    if not response.content:
        return ""
    return response.content[0].text

In [6]:
import json


def generate_dataset(num_cases=3):
    """Ask the model to invent evaluation tasks and return them as parsed JSON.

    The prompt requests an array of objects shaped like {"task": "..."}.
    The reply is parsed and returned as-is, with no validation of its shape.

    Args:
        num_cases: How many task objects to request (default 3).

    Returns:
        Whatever `json.loads` produces from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Literal braces in the JSON example are doubled because this is an
    # f-string. The example itself is valid JSON (no trailing comma) so the
    # model is not nudged toward output that json.loads would reject.
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task"
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""

    messages = []
    add_user_message(messages, prompt)
    # Prefill the assistant turn with an opening fence and use the closing
    # fence as the stop sequence, so the returned text is only the JSON body.
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

In [7]:
# Ask the model for a fresh set of test cases and show the parsed result.
dataset = generate_dataset()
print(dataset)

[{'task': 'Write a Python function that parses an AWS ARN string and returns a dictionary containing the partition, service, region, account-id, and resource components.'}, {'task': 'Create a JSON object for an AWS S3 bucket policy that allows read-only access to all objects in the bucket for a specific IAM role ARN.'}, {'task': 'Write a regular expression that validates AWS EC2 instance IDs (format: i-xxxxxxxxxxxxxxxxx where x is a hexadecimal character).'}]


In [8]:
# Save the generated test cases to dataset.json, pretty-printed with a 2-space indent.
with open('dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [10]:
def run_prompt(test_case):
    """Build the prompt for one test case, send it to the model, and return the reply.

    `test_case` must provide a "task" entry; its value is embedded in the prompt.
    """
    task = test_case["task"]
    conversation = []
    add_user_message(conversation, f"""
Please solve the following task:

{task}
""")
    return chat(conversation)

In [11]:
def run_test_case(test_case):
    """Run one test case through run_prompt and package the outcome.

    Returns a dict with the model output, the original test case, and a score.
    The score is a fixed placeholder until grading is implemented.
    """
    # TODO - Grading
    placeholder_score = 10
    return dict(
        output=run_prompt(test_case),
        test_case=test_case,
        score=placeholder_score,
    )

In [12]:
def run_eval(dataset):
    """Run every test case in `dataset` through run_test_case.

    Returns the list of per-case results, in the same order as the input.
    """
    return [run_test_case(test_case) for test_case in dataset]

In [13]:
# Reload the test cases from dataset.json; this rebinds the notebook-level `dataset`.
with open("dataset.json", "r") as f:
    dataset = json.load(f)

# Run every loaded test case and keep the per-case results.
results = run_eval(dataset)

In [14]:
# Pretty-print the collected results as JSON with a 2-space indent.
print(json.dumps(results, indent=2))

[
  {
    "output": "I'll write a Python function to parse AWS ARN (Amazon Resource Name) strings.\n\n```python\ndef parse_arn(arn_string):\n    \"\"\"\n    Parse an AWS ARN string and return its components as a dictionary.\n    \n    ARN format: arn:partition:service:region:account-id:resource\n    \n    Args:\n        arn_string (str): The ARN string to parse\n        \n    Returns:\n        dict: A dictionary containing the ARN components:\n            - partition: AWS partition (e.g., 'aws', 'aws-cn', 'aws-us-gov')\n            - service: AWS service (e.g., 's3', 'ec2', 'iam')\n            - region: AWS region (e.g., 'us-east-1', can be empty for global services)\n            - account_id: AWS account ID (12-digit number, can be empty for some services)\n            - resource: Resource identifier (format varies by service)\n            \n    Raises:\n        ValueError: If the ARN string is invalid\n    \"\"\"\n    if not arn_string:\n        raise ValueError(\"ARN string cannot b