In [12]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Load environment variables before the client is constructed
# (presumably so the API key is available to Anthropic() — confirm).
load_dotenv()

# Shared API client and model name; chat() reads both as module-level globals.
client = Anthropic()
model = "claude-3-5-haiku-latest"

In [13]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn carrying `text` to `messages` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn carrying `text` to `messages` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of the first content block.

    Args:
        messages: List of {"role": ..., "content": ...} dicts, e.g. as built by
            add_user_message / add_assistant_message.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded with the request.
        stop_sequences: Optional list of strings forwarded as `stop_sequences`.
            None (the default) is sent as an empty list. A None sentinel is
            used instead of a `[]` default so no list object is shared
            between calls.

    Returns:
        The `.text` attribute of the first item in the response's `content`.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        # Fresh list per call when the caller supplied nothing.
        "stop_sequences": [] if stop_sequences is None else stop_sequences,
    }

    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [14]:
import json


def generate_dataset(num_cases=4):
    """Ask the model for an evaluation dataset of AWS-related tasks.

    The assistant turn is prefilled with an opening ```json fence and
    generation stops at the next fence, so the reply is expected to be bare
    JSON that can be parsed directly.

    Args:
        num_cases: How many test-case objects to request. Defaults to 4,
            which yields the same prompt the notebook originally hard-coded.

    Returns:
        The parsed JSON reply. The prompt asks for a list of objects with
        "task", "format" and "solution_criteria" keys, but the shape is not
        validated here.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    # Static instructions first; only the final line depends on num_cases, so it
    # is appended separately and the literal JSON braces need no escaping.
    prompt = (
        """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete. Also, you should include a solution criteria, explaining what an awesome
solution would look like. You should think about tasks that are relevant for AWS developers.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "python" or "json" or "regex",
        "solution_criteria": "Description of what an awesome solution would look like"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

"""
        + f"Please generate {num_cases} objects.\n"
    )

    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json")

    answer = chat(messages, stop_sequences=["```"])
    return json.loads(answer)

In [15]:
# Build a fresh evaluation dataset and persist it as indented JSON
dataset = generate_dataset()

with open("dataset.json", "w") as dataset_file:
    dataset_file.write(json.dumps(dataset, indent=2))


In [16]:
def run_prompt(test_case):
    """Ask the model to solve the test case's task and return its raw reply.

    The task text is dropped into a fixed instruction template. The assistant
    turn is prefilled with an opening ```code fence and generation stops at
    the next fence, so only the content after the prefill comes back.
    """
    task = test_case["task"]
    conversation = []

    add_user_message(
        conversation,
        f"""
    You are an expert in AWS programmer, your task is to solve the following task:

    {task}

    <output>
    You should only return JSON, Python or Regex code as output.
    Do NOT add any comments or commentary explanation.
    </output>
    """,
    )
    add_assistant_message(conversation, "```code")

    return chat(conversation, stop_sequences=["```"])

In [23]:
def grade_by_model(test_case, output):
    """Have the model review `output` against the test case and return its verdict.

    The grading prompt embeds the task, the candidate solution and the
    solution criteria, and asks for a JSON object with "strengths",
    "weaknesses", "reasoning" and "score". The assistant turn is prefilled
    with an opening ```json fence and generation stops at the next fence, so
    the reply is parsed directly with json.loads.
    """
    task = test_case["task"]
    criteria = test_case["solution_criteria"]

    context_section = f"""
    You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

    Original Task:
    <task>
    {task}
    </task>

    Solution to Evaluate:
    <solution>
    {output}
    </solution>

    Criteria to use for Evaluation:
    <solution_criteria>
    {criteria}
    </solution_criteria>
"""
    # This line of the prompt holds only four spaces; it is spelled as an
    # escape so that whitespace-trimming editors cannot alter the prompt text.
    spacer = "    \n"
    format_section = """    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your overall assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }
    """
    grading_prompt = context_section + spacer + format_section

    conversation = []
    add_user_message(conversation, grading_prompt)
    add_assistant_message(conversation, "```json")

    return json.loads(chat(conversation, stop_sequences=["```"]))

In [18]:
def validate_regex(text):
    """Score whether `text` is a usable regular expression.

    Args:
        text: Candidate pattern; surrounding whitespace is stripped first.

    Returns:
        10 if the stripped text is non-empty and `re.compile` accepts it,
        otherwise 0.
    """
    import re  # kept function-local so this cell runs on its own

    pattern = text.strip()
    if not pattern:
        # An empty reply compiles as a regex but is not an answer; this keeps
        # the score in line with validate_json, which rejects empty text.
        return 0

    try:
        re.compile(pattern)
    except (re.error, OverflowError):
        # OverflowError is what the parser raises for an oversized repetition
        # count such as "a{99999999999999999999}"; treat it as invalid too
        # rather than letting it abort the whole evaluation run.
        return 0
    return 10
    
def validate_json(text):
    """Return 10 when the stripped `text` parses as JSON, 0 on a decode error."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10
    
def validate_python(text):
    """Score whether `text` is syntactically valid Python.

    Only syntax is checked (via `ast.parse`); the code is never executed.

    Args:
        text: Candidate source; surrounding whitespace is stripped first.

    Returns:
        10 if the stripped text is non-empty and parses, otherwise 0.
    """
    import ast  # kept function-local so this cell runs on its own

    source = text.strip()
    if not source:
        # Empty source parses to an empty module but is not an answer; this
        # keeps the score in line with validate_json, which rejects empty text.
        return 0

    try:
        ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions raise it (instead of SyntaxError)
        # when the source contains null bytes.
        return 0
    return 10

def grade_syntax(response, test_case):
    """Route `response` to the validator matching test_case["format"].

    Returns the chosen validator's score, or 0 when the format is none of
    "regex", "json" or "python".
    """
    expected_format = test_case["format"]
    validators = (
        ("regex", validate_regex),
        ("json", validate_json),
        ("python", validate_python),
    )
    for format_name, validator in validators:
        if expected_format == format_name:
            return validator(response)
    return 0

In [19]:
def run_test_case(test_case):
    """Run one test case through the prompt and both graders.

    Returns a dict holding the model output, the test case, the model and
    syntax scores, their average as "final_score", and the grader's reasoning.
    """
    solution = run_prompt(test_case)

    review = grade_by_model(test_case, solution)
    syntax_score = grade_syntax(solution, test_case)
    model_score, reasoning = review["score"], review["reasoning"]

    return dict(
        output=solution,
        test_case=test_case,
        final_score=(model_score + syntax_score) / 2,
        model_score=model_score,
        code_score=syntax_score,
        reasoning=reasoning,
    )

In [20]:
from statistics import mean

def run_eval(dataset):
    """Run every test case in `dataset` and print the mean final score.

    Args:
        dataset: Iterable of test-case dicts accepted by run_test_case.

    Returns:
        List of the result dicts produced by run_test_case, in dataset order.
        For an empty dataset the list is empty and a notice is printed in
        place of an average.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if not results:
        # mean() raises StatisticsError on empty data, so an empty dataset is
        # reported explicitly instead of crashing the pipeline.
        print("No test cases to evaluate")
        return results

    average_score = mean([result["final_score"] for result in results])
    print(average_score)

    return results

In [24]:
# Evaluate the current prompt against the stored dataset

with open("dataset.json") as dataset_file:
    dataset = json.loads(dataset_file.read())

results = run_eval(dataset)

8.375


In [25]:
# Dump every per-test-case result (output, test case, scores, reasoning) as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "\n{\n    \"Version\": \"2012-10-17\",\n    \"Statement\": [\n        {\n            \"Effect\": \"Allow\",\n            \"Action\": [\n                \"s3:ListBucket\",\n                \"s3:GetObject\"\n            ],\n            \"Resource\": [\n                \"arn:aws:s3:::company-data\",\n                \"arn:aws:s3:::company-data/*\"\n            ]\n        }\n    ]\n}\n",
    "test_case": {
      "task": "Create a JSON policy that restricts an IAM user to only list and read objects in a specific S3 bucket named 'company-data'",
      "format": "json",
      "solution_criteria": "Policy should use least privilege principles, explicitly allow s3:ListBucket and s3:GetObject actions only on the specified bucket, and deny all other S3 actions"
    },
    "final_score": 8.5,
    "model_score": 7,
    "code_score": 10,
    "reasoning": "The policy provides basic read-only access to the company-data bucket but lacks comprehensive least privilege implementation. 