In [None]:
# Install the packages listed in requirements.txt before running the cells below.
%pip install -r requirements.txt

# Setup
You should have a local `.env` file with at least:
* `X_FUNCTIONS_KEY`
* `AZURE_FUNCTIONS_ENDPOINT`
* `AZURE_OPENAI_API_KEY`
* `AZURE_OPENAI_ENDPOINT`


In [None]:
from dotenv import load_dotenv

# Load the settings from the local `.env` file into the process environment.
# NOTE(review): override=True should make values from `.env` win over variables
# that are already set in the environment — confirm against the python-dotenv docs.
load_dotenv(override=True)

For now, we're using an API key. We configure the non-Azure OpenAI client to talk to the OpenAI/v1 API in Foundry.

In [None]:
import os
from openai import OpenAI

# Fail fast when the Foundry settings are missing. `os.getenv` returns None for
# an unset variable, and passing `base_url=None` lets the client fall back to
# its default endpoint instead of the intended Foundry deployment.
_missing = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    if not os.getenv(name)
]
if _missing:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing)}. "
        "Check your local .env file."
    )

client = OpenAI(
    base_url=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)
print("> Created OpenAI client.")

In [None]:
# Generate a short, "unique enough" tag for this run so that repeated executions
# of the notebook can be told apart and the resources each one creates are easy
# to find.
import uuid

# First 8 hex digits of a random UUID (the same characters as its first group).
UNIQUE_ENOUGH_KEY = uuid.uuid4().hex[:8]
print(f"> Using unique enough key: {UNIQUE_ENOUGH_KEY}")

In [None]:
# Developer prompt for the arithmetic task: it states the rules, the exact JSON
# output shape, and three worked examples. Surrounding whitespace is stripped.
# NOTE(review): the instructions name multiplication as "(x)" while every
# example expression uses "*" — confirm which symbol the grader expects.
DEVELOPER_PROMPT = """
You are an expert in arithmetic problem solving. Given a target number and a list of
numbers, your task is to combine all of the numbers exactly once using addition (+),
subtraction (-), multiplication (x), or division (/) to reach the target.

- You must use every number exactly once.
- Use parentheses as needed to control the order of operations.
- Return only a valid JSON object with the following exact syntax:
  {
    "expression": <the expression>,
    "result": <the result>
  }
- The expression should be a string representation of the mathematical expression.
- The result should be an integer.
- If the exact target is not possible, return the closest valid result using all numbers.

# Example 1
target: 850
numbers: [100, 75, 50, 25, 6, 3]

Output:
{
  "expression": "((100 * 6) + (75 + 25) + (3 * 50))",
  "result": 850
}

# Example 2
target: 945
numbers: [25, 100, 9, 3, 6, 2]

Output:
{
  "expression": "((100 * 9) + (6 * 3) + (25 + 2))",
  "result": 945
}

# Example 3
target: 310
numbers: [7, 50, 75, 3, 8, 2]

Output:
{
  "expression": "((75 * 3) + (50 * 2) - (8 + 7))",
  "result": 310
}
""".strip()

# Baseline Evaluation
We're first going to assess how some models perform natively at this task given the above prompt.

In [None]:
# Send the evaluation dataset to the Files API, then wait for the service to
# finish processing it before we reference it in a run.
with open("./data/countdown_eval_100.jsonl", mode="rb") as eval_fh:
    eval_file = client.files.create(purpose="evals", file=eval_fh)

eval_file = client.files.wait_for_processing(eval_file.id)
print(f"> Uploaded eval file {eval_file.id}")

In [None]:
# Data source config for the Eval: a custom source whose rows are objects with
# an integer `target` and an array of integer `nums`.
EVAL_DATA_SOURCE = dict(
    item_schema=dict(
        type="object",
        properties=dict(
            target=dict(type="integer"),
            nums=dict(type="array", items=dict(type="integer")),
        ),
    ),
    include_sample_schema=True,
    type="custom",
)

In [None]:
# Structured-output contract for the models: the reply must be a JSON object
# with exactly two fields, a string `expression` and an integer `result`.
_EXPRESSION_OBJECT = {
    "type": "object",
    "properties": {
        "expression": {"type": "string"},
        "result": {"type": "integer"},
    },
    "required": ["expression", "result"],
    "additionalProperties": False,
}

# Wrap the object schema in the strict `json_schema` response format.
RESPONSE_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "math_expression",
        "strict": True,
        "schema": _EXPRESSION_OBJECT,
    },
}

## Endpoint Grader
Our endpoint grader points to a remote endpoint that hosts the grader function.

Its API surface is essentially the same as the Python grader's, except that the
input is sent as JSON in an HTTP POST to the endpoint.

We're using an Azure Function with key-based auth, so each request needs
the `X-Functions-Key` header carrying our authentication key.

In [None]:
# Describe the remote grader: where it is hosted and how requests authenticate.
import os

URL = os.getenv("AZURE_FUNCTIONS_ENDPOINT")

ENDPOINT_GRADER = dict(
    type="endpoint",
    name="Remote Countdown Grader",
    url=URL,
    # The function key is sent with each request in this header.
    headers={"X-Functions-Key": os.getenv("X_FUNCTIONS_KEY")},
    pass_threshold=5.0,
    # rate_limit=50,
)
print(f"> Defined endpoint grader for {URL}")

For comparison, we'll use the same code but run as a Python grader.

In practice, running the same grading code twice is redundant, but it demonstrates how similar the two
graders are in shape.

The code is in [grader.py](./grader.py), so we can import the module and
use Python's native `inspect` module to dump it as source code into a
string we pass to the grader.

In [None]:
# Build the analogous Python grader from the very same grading code.
import inspect
import grader

# The Python grader receives its program as a source string, so dump the whole
# `grader` module to text.
code = inspect.getsource(grader)

PYTHON_GRADER = dict(
    type="python",
    name="Python Countdown Grader",
    source=code,
    pass_threshold=5.0,
)
print("> Defined Python grader:")

Let's now make sure our Endpoint Grader is reachable. If it's not, there's no sense in
submitting an Eval or RFT job. We can POST some sample input and check if we get back
a score.

In [None]:
# Manually smoke-test the endpoint grader before submitting an Eval or RFT job.
import requests

# A known-correct sample: (72 + 22) / (88 - 86) == 47 and uses every number
# once, so the grader is expected to return a score of 5.
data = {
    "item": {"target": 47, "nums": [86, 22, 88, 72]},
    "sample": {
        "output_json": {
            "expression": "(72 + 22) / (88 - 86)",
            "result": 47,
        },
    },
}

result = requests.post(
    URL,
    headers={
        "Content-Type": "application/json",
        "X-Functions-Key": os.getenv("X_FUNCTIONS_KEY"),
    },
    json=data,
    timeout=30,  # don't let an unresponsive endpoint hang the notebook
)

print(f"> Got HTTP {result.status_code}")
if result.status_code != 200:
    # An unreachable or misconfigured grader must stop the notebook here instead
    # of silently falling through to the Eval and RFT cells.
    raise RuntimeError(
        f"Endpoint grader returned HTTP {result.status_code}: {result.text[:200]}"
    )

score = result.json()["score"]
print(f"> Score is {score}.")
if score != 5:
    raise RuntimeError("Uh oh...you might want to check your grader!")

Now we're ready to define our Eval.

In [None]:
# Register the Eval and grade every sample with both graders.
# (To use a single grader instead, pass [ENDPOINT_GRADER] or [PYTHON_GRADER].)
eval_name = f"endpoint-countdown-eval-{UNIQUE_ENOUGH_KEY}"
graders = [PYTHON_GRADER, ENDPOINT_GRADER]

baseline_eval = client.evals.create(
    name=eval_name,
    data_source_config=EVAL_DATA_SOURCE,
    testing_criteria=graders,
)
print(f"> Created eval {baseline_eval.id}:")
print(baseline_eval.to_json())

In [None]:
# Launch one eval run per candidate model; every run shares the same prompts,
# dataset file, and structured-output schema.
USER_PROMPT = "\n".join(
    [
        "target: {{item.target}}",
        "numbers: {{item.nums}}",
    ]
)


def _template_message(role, text):
    """Build one templated input message with the given role and text."""
    return {
        "type": "message",
        "role": role,
        "content": {"type": "input_text", "text": text},
    }


RUNS = []
for model in ("gpt-4.1", "o4-mini", "gpt-5"):
    RUN_DATA_SOURCE = {
        "type": "completions",
        "model": model,
        "source": {"type": "file_id", "id": eval_file.id},
        "input_messages": {
            "type": "template",
            "template": [
                _template_message("developer", DEVELOPER_PROMPT),
                _template_message("user", USER_PROMPT),
            ],
        },
        "sampling_params": {"response_format": RESPONSE_SCHEMA},
    }
    run = client.evals.runs.create(
        name=f"endpoint-countdown-eval-run-{model}-{UNIQUE_ENOUGH_KEY}",
        eval_id=baseline_eval.id,
        data_source=RUN_DATA_SOURCE,
    )
    print(f"> Created run {run.id}: {run.name}")
    RUNS.append(run)

print(f"> Created {len(RUNS)} runs.")

# Reinforcement Fine Tuning
Now let's train a model to improve upon our out-of-the-box o4-mini experience.

In [None]:
# Upload the training and validation datasets for fine-tuning.
def _upload_fine_tune_file(path):
    """Upload `path` with purpose "fine-tune" and return the file object once processed."""
    with open(path, mode="rb") as handle:
        created = client.files.create(file=handle, purpose="fine-tune")
    return client.files.wait_for_processing(created.id)


training_file = _upload_fine_tune_file("./data/countdown_train_100.jsonl")
print(f"> Uploaded training file {training_file.id}")

validation_file = _upload_fine_tune_file("./data/countdown_valid_50.jsonl")
print(f"> Uploaded validation file {validation_file.id}")

We can drop the `pass_threshold` settings from the individual graders and instead set it once on
the multi grader (`MULTI_GRADER`) we'll use to combine the Python and Endpoint graders.

In [None]:
# Create our RFT job.
import copy

# Scrub the per-grader pass thresholds: for RFT the threshold is set once, on
# the combined multi grader below. Deep copies keep the graders used by the
# Eval above untouched.
RFT_ENDPOINT_GRADER = copy.deepcopy(ENDPOINT_GRADER)
RFT_ENDPOINT_GRADER.pop("pass_threshold", None)
RFT_PYTHON_GRADER = copy.deepcopy(PYTHON_GRADER)
RFT_PYTHON_GRADER.pop("pass_threshold", None)


MULTI_GRADER = {
    "type": "multi",
    "name": "Combined local and remote grader",
    "graders": {
        # Use the scrubbed copies, not the originals that still carry a threshold.
        "python": RFT_PYTHON_GRADER,
        "endpoint": RFT_ENDPOINT_GRADER,
    },
    "calculate_output": "(python + endpoint) / 2",  # average the scores
    "pass_threshold": 5,
}

job = client.fine_tuning.jobs.create(
    suffix=f"dv-endpoint-countdown-ft-{UNIQUE_ENOUGH_KEY}",
    model="o4-mini-2025-04-16",
    training_file=training_file.id,
    validation_file=validation_file.id,
    method={
        "type": "reinforcement",
        "reinforcement": {
            # "grader": RFT_ENDPOINT_GRADER,
            # "grader": RFT_PYTHON_GRADER,
            "grader": MULTI_GRADER,
            "response_format": RESPONSE_SCHEMA,
        },
    },
)
print(f"> Created RFT job {job.id}")
print(job.to_json())