In [None]:
%pip install -r requirements.txt

In [None]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load settings from a local .env file. With override=True, values from the
# .env file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Client used by the rest of the notebook for files, evals, chat completions
# and fine-tuning. Endpoint and key are read from the environment; a missing
# variable is passed through as None.
client = AzureOpenAI(
    api_version="2025-04-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

In [None]:
# Developer/system prompt given to every model under test (and to the teacher
# model during distillation).
DEVELOPER_PROMPT = "Clippy is a factual chatbot that is also sarcastic."

# Deployments that are evaluated as baselines.
BASE_MODELS = [
    "o4-mini",
    "gpt-4.1",
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    "gpt-4o",
    "gpt-4o-mini",
]

# Model that scores the answers, and the rubric it is given.
# The JSON example below shows one step per metric (Metric 1, then Metric 2) and
# the code fence is closed before the "General Guidance" section so the grader
# sees a well-formed example.
GRADER_MODEL = "o3"
GRADER_PROMPT = """
You are an expert in sarcasm detection, while also being a good fact checker.

You'll be given a conversation in the form of a question and answer.

## Scoring Criteria
Judge the answer using two metrics.

### Metric 1: How sarcastic is the response?
On a scale from 1 to 10 inclusive with 1 being no sarcasm detected and 10 being
the most sarcastic response ever. Assign an integer value within that range based
on your assessment.

### Metric 2: How accurate is the response?
Assign a 1 if the response is factually correct. Assign a 0 for this metric if it's
incorrect or contains inaccuracies.

### Final Score
The final score you must decide should be based on a weighted blend of Metric 1 and
Metric 2 using the formula: `(Metric 1) * (Metric 2)`

This means that if Metric 2 is zero, the final score must be zero.

## Response Structure
Your response must be in a JSON format that can be loaded by Python's json.loads()
function. It should resemble the following:

```
{
  "steps": [
    {
      "description": "<one sentence describing your reasoning for Metric 1>",
      "result": "<string representation of Metric 1's score>"
    },
    {
      "description": "<one sentence describing your reasoning for Metric 2>",
      "result": "<string representation of Metric 2's score>"
    }
  ],
  "result": <floating point value of the Final Score>
}
```

## General Guidance
The questions should be simple factual questions. Deep research is not required.
"""

In [None]:
# Upload the sample JSONL file for use with the Evals API.
eval_file = None
with open("./sarcasm-qa-sample.jsonl", "rb") as sample_fh:
    eval_file = client.files.create(file=sample_fh, purpose="evals")

# The local handle is no longer needed; wait on the uploaded file by id.
client.files.wait_for_processing(eval_file.id)
print(f"Created eval file: {eval_file.id}")


In [None]:
# Let's define an evaluation using some static input from question/answer pairs.

# The entire user prompt is data driven from the file. No generation is done using
# a model, just simple string substitution using this pattern.
USER_PROMPT = """
Q: {{item.question}}
A: {{item.answer}}
"""

# Messages sent to the grader: the rubric as the developer message, followed by
# the templated question/answer pair as the user message.
INPUT = [
    {
        "type": "message",
        "role": "developer",
        "content": { "type": "input_text", "text": GRADER_PROMPT }
    },
    {
        "type": "message",
        "role": "user",
        "content": { "type": "input_text", "text": USER_PROMPT }
    }
]

# We need to describe what our evaluation dataset looks like.
SCHEMA = {
    "type": "object",
    "properties": {
        "question": { "type": "string" },
        "answer": { "type": "string" },
    }
}
DATA_SOURCE = {
    "item_schema": SCHEMA,
    "include_sample_schema": False,  # answers come from the file, not from a model
    "type": "custom",
}

# Lastly, we define test criteria that combines the above.
# The range starts at 0 rather than 1: GRADER_PROMPT defines the final score as
# (Metric 1) * (Metric 2), so a factually incorrect answer is scored 0 and must
# fall inside the declared range.
TESTING_CRITERIA = {
    "name": "Auto Sarcasm Grader",
    "type": "score_model",
    "model": GRADER_MODEL,
    "input": INPUT,
    "range": [0.0, 10.0],
    "pass_threshold": 4.0,
}

In [None]:
import uuid

# Short random tag (the first 8 hex digits of a UUID4) that keeps resource names
# from this notebook session distinct. It is reused by later cells.
suffix = uuid.uuid4().hex[:8]

# Create the evaluation that grades the static question/answer pairs.
sample_eval = client.evals.create(
    testing_criteria=[TESTING_CRITERIA],
    data_source_config=DATA_SOURCE,
    name=f"sarcasm-sampledata-{suffix}",
)
print(f"Submitted evaluation {sample_eval.id}")

# Start a Run, pointing it at the uploaded sample file.
RUN_DATA_SOURCE = {
    "type": "jsonl",
    "source": { "type": "file_id", "id": eval_file.id },
}
sample_run = client.evals.runs.create(
    eval_id=sample_eval.id,
    name=f"sample-data-{GRADER_MODEL}",
    data_source=RUN_DATA_SOURCE,
)
print(f"Submitted run {sample_run.id} to eval {sample_eval.id}")


In [None]:
# Wait for the sample run to reach a terminal state.
from IPython.display import clear_output
import time

start_time = time.time()

# A canceled run is terminal too; without it the loop would never exit.
# Both spellings are accepted since the exact value is service-defined.
terminal_statuses = ("completed", "failed", "canceled", "cancelled")

sample_run = client.evals.runs.retrieve(eval_id=sample_eval.id, run_id=sample_run.id)
while sample_run.status not in terminal_statuses:
    time.sleep(5)
    clear_output(wait=True)

    sample_run = client.evals.runs.retrieve(eval_id=sample_eval.id, run_id=sample_run.id)

    # Take one clock reading so minutes and seconds are always consistent.
    mins, secs = divmod(int(time.time() - start_time), 60)
    print(f"Elapsed time: {mins} minutes {secs} seconds")

print(f"Run {sample_run.id}: {sample_run.status}")

In [None]:
# Display an evaluation summary for the sample-data evaluation.
# display_evaluation_summary is a project helper (scripts/eval_utils); its
# behavior is not visible from this notebook.
from scripts.eval_utils import display_evaluation_summary

# x_range=(0, 10) presumably fixes the score axis of the summary — TODO confirm
# against the helper's implementation.
display_evaluation_summary(client, [sample_eval.id], x_range=(0, 10))


In [None]:
# Upload the question set that the models under test will answer.
qa_file = None
with open("./sarcasm-qa.jsonl", "rb") as qa_fh:
    qa_file = client.files.create(file=qa_fh, purpose="evals")

# The local handle is no longer needed; wait on the uploaded file by id.
client.files.wait_for_processing(qa_file.id)
print(f"Created eval file: {qa_file.id}")

In [None]:
# Let's test some base models!

# GRADER_PROMPT is re-used!
# We *do* create a new USER_PROMPT as now we generate the Answer via a model
# under test. Notice how we reference {{sample.output_text}}.
USER_PROMPT = """
Q: {{item.question}}
A: {{sample.output_text}}
"""
INPUT = [
    {
        "type": "message",
        "role": "developer",
        "content": { "type": "input_text", "text": GRADER_PROMPT }
    },
    {
        "type": "message",
        "role": "user",
        "content": { "type": "input_text", "text": USER_PROMPT }
    }
]
SCHEMA = {
    "type": "object",
    "properties": {
        "question": { "type": "string" },
        "answer": { "type": "string" },
    },
}
DATA_SOURCE = {
    "item_schema": SCHEMA,
    "include_sample_schema": True, # Note this change! Needed for data gen.
    "type": "custom"
}
# The range starts at 0 rather than 1: GRADER_PROMPT defines the final score as
# (Metric 1) * (Metric 2), so a factually incorrect answer is scored 0 and must
# fall inside the declared range.
TESTING_CRITERIA = {
    "name": "Auto Sarcasm Grader",
    "type": "score_model",
    "model": GRADER_MODEL,
    "input": INPUT,
    "range": [0.0, 10.0],
    "pass_threshold": 4.0,
}

# NOTE: `eval` shadows the Python builtin. The name is kept because later cells
# refer to `eval.id`.
eval = client.evals.create(
    name=f"sacarsm-evaluation-{suffix}",
    data_source_config=DATA_SOURCE,
    testing_criteria=[TESTING_CRITERIA]
)
print(f"Created eval {eval.id}")

# Submit one Run per base model; every Run answers the same uploaded questions.
runs = []
for model in BASE_MODELS:
    # Models whose name starts with "o" (o4-mini here) get a much larger token
    # budget than the others.
    # XXX NOTE(review): the key is spelled `max_completions_tokens` (plural). The
    # distilled-model cell further down remarks that it "should be"
    # `max_completion_tokens`; left unchanged here — confirm against the service
    # before renaming.
    token_budget = 20_000 if model.startswith("o") else 100

    DATA_SOURCE = {
        "type": "completions",
        "model": model,
        "source": { "type": "file_id", "id": qa_file.id },
        "input_messages": {
            "type": "template",
            "template": [
                {
                    "type": "message",
                    "role": "developer",
                    "content": { "type": "input_text", "text": DEVELOPER_PROMPT },
                },
                {
                    "type": "message",
                    "role": "user",
                    "content": { "type": "input_text", "text": "{{item.question}}" },
                },
            ],
        },
        "sampling_params": { "max_completions_tokens": token_budget },
    }
    run = client.evals.runs.create(
        name=f"{model}-{suffix}",
        eval_id=eval.id,
        data_source=DATA_SOURCE,
    )
    print(f"Created run {run.id} for eval {eval.id}")
    runs.append(run)


In [None]:
# Wait for the Runs to finish (successfully or not).
from IPython.display import clear_output
import time

start_time = time.time()

# Poll until every run is in a terminal state. Waiting only for "completed"
# would loop forever as soon as a single run fails or is canceled.
# Both spellings of "canceled" are accepted since the value is service-defined.
terminal_statuses = ("completed", "failed", "canceled", "cancelled")

while any(r.status not in terminal_statuses for r in runs):
    time.sleep(10)
    clear_output(wait=True)

    for i in range(len(runs)):
        runs[i] = client.evals.runs.retrieve(eval_id=eval.id, run_id=runs[i].id)
        print(f"Run {runs[i].name}: {runs[i].status}")

    mins, secs = divmod(int(time.time() - start_time), 60)
    print("Elapsed time: {} minutes {} seconds".format(mins, secs))

# Report honestly: only claim success when every run actually completed.
unsuccessful = [r for r in runs if r.status != "completed"]
if unsuccessful:
    print(f"{len(runs) - len(unsuccessful)} of {len(runs)} runs completed; the rest did not:")
    for r in unsuccessful:
        print(f"  Run {r.name}: {r.status}")
else:
    print(f"All {len(runs)} runs completed!")

In [None]:
# Import the display_evaluation_summary function from the eval_utils script.
# It is a project helper; its behavior is not visible from this notebook.
from scripts.eval_utils import display_evaluation_summary

# Display the evaluation summary for the base-model evaluation.
# x_range=(0, 10) presumably fixes the score axis — TODO confirm against the helper.
display_evaluation_summary(client, [eval.id], x_range=(0, 10))


In [None]:
import json
from tqdm import tqdm

# Let's try making 4.1-nano be a little better. 4.1 seems to be the best here, so let's have it be the teacher.
# NOTE(review): the fine-tuning cell later sets STUDENT_MODEL to a gpt-4.1-mini
# snapshot, not nano — confirm which student model is intended.
TEACHER_MODEL = "gpt-4.1"

# We'll reuse the same QA data set, but we'll need to reformat it into Chat Completions format.
chats = []
with open("./sarcasm-qa.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        qa = json.loads(line)
        chats.append([
            { "role": "system", "content": DEVELOPER_PROMPT },
            { "role": "user", "content": qa["question"] },
        ])

# We'll keep this simple and serialized. If you wanted to do this at scale, you should parallelize this step
# so you aren't generating one chat completion at a time ;)
errors = []
replies = []
print(f"🧪 Attempting to distill {len(chats)} prompts.")
for idx, chat in enumerate(tqdm(chats)):
    metadata = {
        "dataset": "sarcasm",
        "teacher": TEACHER_MODEL,
        "index": str(idx),
        "suffix": suffix,
    }
    try:
        reply = client.chat.completions.create(
            model=TEACHER_MODEL,
            messages=chat,
            store=True,
            metadata=metadata,
            max_completion_tokens=100
        )
        # `content` can be None or empty; such a reply would make a useless
        # training example, so record it as a failure instead of keeping it.
        answer = (reply.choices[0].message.content or "").strip()
        if not answer:
            errors.append(f"⚠️ prompt {idx} failed: teacher returned no content")
            continue
        messages = chat + [{ "role": "assistant", "content": answer }]
        replies.append({ "messages": messages })
    except Exception as e:
        # Best effort: one failed prompt should not abort the whole distillation.
        errors.append(f"⚠️ prompt {idx} failed: {e}")

print(f"🧪 Distilled {len(replies)} out of a possible {len(chats)} prompts.")

# Surface the collected failures instead of silently dropping them.
for error in errors:
    print(error)


In [None]:
# Now we need to create our SFT training and validation set. We'll do a pretty simple split here:
# the first TRAIN_EXAMPLES replies are used for training, the remainder for validation.
TRAIN_EXAMPLES = 80
if len(replies) <= TRAIN_EXAMPLES:
    # Otherwise the validation slice below would be empty and an empty file
    # would be written and uploaded.
    raise ValueError(
        f"Need more than {TRAIN_EXAMPLES} distilled replies to build a validation set, "
        f"but only {len(replies)} are available."
    )

dataset = {
    "training": replies[:TRAIN_EXAMPLES],
    "validation": replies[TRAIN_EXAMPLES:],
}

# Now we create files via the Files API. To keep it simple, we write them to disk first and then
# we use the SDK to upload.
files = {
    "training": None,
    "validation": None,
}
# Loop variables are named so they do not overwrite the `chats` list built in
# the distillation cell.
for name, examples in dataset.items():
    path = f"sarcasm-{name}.jsonl"
    with open(path, "w", encoding="utf-8") as f:
        for example in examples:
            json.dump(example, f)
            f.write("\n")
    with open(path, "rb") as f:
        uploaded = client.files.create(file=f, purpose="fine-tune")
    # Wait on the upload by id; the returned object is what later cells use.
    uploaded = client.files.wait_for_processing(uploaded.id)
    files[name] = uploaded
    print(f"Created {name} file:\n{uploaded.model_dump_json(indent=2)}")


In [None]:
# Submit the fine-tuning job using the training and validation files uploaded above.
# NOTE(review): the distillation cell's comment talks about improving 4.1-nano,
# while the student configured here is a gpt-4.1-mini snapshot — confirm intent.
STUDENT_MODEL = "gpt-4.1-mini-2025-04-14"

# '.' is a reserved character 😜, so strip it from the teacher name before
# building the suffix.
SUFFIX = TEACHER_MODEL.replace(".", "") + "-sarcasm"

job = client.fine_tuning.jobs.create(
    training_file=files["training"].id,
    validation_file=files["validation"].id,
    model=STUDENT_MODEL,
    suffix=SUFFIX,
    extra_body={ "trainingType": "globalstandard" },
)
print(f"Created fine-tuning job:\n{job.to_json(indent=2)}")

In [None]:
# Poll the fine-tuning job every 15 seconds until it reaches a terminal state.
from IPython.display import clear_output
import time

start_time = time.time()
finished_states = ("succeeded", "failed", "cancelled")

status = job.status
while status not in finished_states:
    time.sleep(15)
    job = client.fine_tuning.jobs.retrieve(job.id)
    status = job.status
    clear_output(wait=True)
    print(f"Job {job.id}: {status}")
    elapsed = time.time() - start_time
    print("Elapsed time: {} minutes {} seconds".format(int(elapsed // 60), int(elapsed % 60)))

# Anything other than success stops the notebook here, before deployment.
if status != "succeeded":
    raise RuntimeError(f"Fine-tuning job did not complete successfully (status={status})")
print("Fine-tuning finished!")

In [None]:
# First we need to deploy the new model candidate. We'll use Developer Tier to keep
# costs to just per-token!
import os
from azure.identity import DefaultAzureCredential
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient

# The OpenAI client cannot control the Azure OpenAI control plane, so we need a
# different client specific to Azure Cognitive Services.
cogsvc_client = CognitiveServicesManagementClient(
    credential=DefaultAzureCredential(),
    subscription_id=os.environ.get("AZURE_SUBSCRIPTION_ID")
)

# Define our Deployment. Note the use of SKU for specifying capacity and
# the name of the deployment tier.
# The teacher tag is computed outside the f-string: reusing double quotes inside
# an f-string replacement field is a SyntaxError before Python 3.12.
teacher_tag = TEACHER_MODEL.replace(".", "")
DEPLOYMENT_NAME = f"sarcasm-{teacher_tag}-distilled"
DEPLOYMENT = {
    "properties": {
        "model": {
            "format": "OpenAI",
            "name": job.fine_tuned_model,
            "version": "1"
        },
    },
    "sku": {
        "capacity": 250,
        "name": "DeveloperTier"
    },
}

# Submit the request for provisioning. This may take a few minutes, so we
# poll for updates. If it already exists, this should return quickly.
# `deployment` is the poller returned by begin_create_or_update; the next cell
# polls it for status.
deployment = cogsvc_client.deployments.begin_create_or_update(
    resource_group_name=os.environ.get("AZURE_RESOURCE_GROUP"),
    account_name=os.environ.get("AZURE_AOAI_ACCOUNT"),
    deployment_name=DEPLOYMENT_NAME,
    deployment=DEPLOYMENT,
)
print(f"Submitted deployment {deployment}")

In [None]:
# Wait for our deployment to finish provisioning.
from IPython.display import clear_output
import time

start_time = time.time()

status = deployment.status()
print(f"Provisioning {DEPLOYMENT_NAME}: {status}")

# Ask the poller itself whether the operation is finished instead of matching
# status strings: this also ends the loop for terminal states other than
# "Succeeded"/"Failed" (e.g. a canceled operation).
while not deployment.done():
    deployment.wait(5)
    status = deployment.status()
    clear_output(wait=True)
    print(f"Provisioning {DEPLOYMENT_NAME}: {status}")
    mins, secs = divmod(int(time.time() - start_time), 60)
    print("Elapsed time: {} minutes {} seconds".format(mins, secs))

# Only report success when provisioning actually succeeded, mirroring how the
# fine-tuning wait cell raises on a non-successful job.
status = deployment.status()
if str(status).lower() != "succeeded":
    raise RuntimeError(f"Provisioning {DEPLOYMENT_NAME} did not succeed (status={status})")

print("Provisioning finished!")


In [None]:
# Add one more Run to the existing evaluation, this time sampling from the
# distilled deployment, so its scores can be compared directly with the base
# models' runs.
distilled_prompt_template = [
    {
        "type": "message",
        "role": "developer",
        "content": { "type": "input_text", "text": DEVELOPER_PROMPT },
    },
    {
        "type": "message",
        "role": "user",
        "content": { "type": "input_text", "text": "{{item.question}}" },
    },
]
DATA_SOURCE = {
    "type": "completions",
    "model": DEPLOYMENT_NAME,
    "source": { "type": "file_id", "id": qa_file.id },
    "input_messages": { "type": "template", "template": distilled_prompt_template },
    "sampling_params": {
        "max_completions_tokens": 250, # we SFT'd a non-reasoning model, so keep this tight
        # the above should be `max_completion_tokens`...note the singular completion!
    },
}

# Submit the Run against the same evaluation used for the base models.
run = client.evals.runs.create(
    eval_id=eval.id,
    name=f"AutoGrader-{job.fine_tuned_model}-{suffix}",
    data_source=DATA_SOURCE,
)
print(f"Created new run:\n{run.to_json(indent=2)}")

In [None]:
# Wait for the new Run to reach a terminal state.
from IPython.display import clear_output
import time

start_time = time.time()

# A canceled run is terminal too; without it the loop would never exit.
# Both spellings are accepted since the exact value is service-defined.
terminal_statuses = ("completed", "failed", "canceled", "cancelled")

while run.status not in terminal_statuses:
    time.sleep(10)
    clear_output(wait=True)
    run = client.evals.runs.retrieve(eval_id=eval.id, run_id=run.id)
    print(f"Run {run.id}: {run.status}")

    mins, secs = divmod(int(time.time() - start_time), 60)
    print("Elapsed time: {} minutes {} seconds".format(mins, secs))

# Only announce completion when the run really completed.
if run.status == "completed":
    print(f"Run {run.id} completed!")
else:
    print(f"Run {run.id} did not complete (status={run.status})")
print(run)

In [None]:
# Import the display_evaluation_summary function from the eval_utils script.
# It is a project helper; its behavior is not visible from this notebook.
from scripts.eval_utils import display_evaluation_summary

# Display the evaluation summary, which now also includes the distilled model's run.
# NOTE(review): the earlier summary calls pass x_range=(0, 10); this one relies on
# the helper's default — confirm the omission is intentional.
display_evaluation_summary(client, [eval.id])