In [1]:
%pip install -r requirements.txt -qU

Note: you may need to restart the kernel to use updated packages.


In [2]:
import mlflow
import os
from getpass import getpass
import pandas as pd
from openai import OpenAI
from utils import get_random_files, get_image, get_json, _set_openai_api_key_for_demo

# Keep MLflow run data in an "mlruns" directory under the current working
# directory, and log everything to the "evaluation" experiment.
mlflow.set_tracking_uri(f"{os.getcwd()}/mlruns")
mlflow.set_experiment("evaluation")

# Prompt for the key only when the demo helper returns a falsy value and
# OPENAI_API_KEY is unset or empty; getpass keeps the key out of the output.
if not (_set_openai_api_key_for_demo() or os.getenv("OPENAI_API_KEY")):
    os.environ["OPENAI_API_KEY"] = getpass("Your OpenAI API Key: ")

2025/05/12 23:58:07 INFO mlflow.tracking.fluent: Experiment with name 'evaluation' does not exist. Creating a new experiment.


## Step 1: Determine Accuracy

In [3]:
# Ask the project helper for n=5 files, then load the image and the JSON
# annotation of each one so that images[i] and jsons[i] share a source file.
_files = get_random_files(n=5)
images = [get_image(path, encode_as_str=True) for path in _files]
jsons = list(map(get_json, _files))

# Enable MLflow's OpenAI autologging, then create the client.
mlflow.openai.autolog()
client = OpenAI()

In [4]:
# System message sent with every request made by get_completion.
system_prompt = (
    "You are an expert at Optical Character Recognition (OCR). "
    "Extract the questions and answers from the image."
)

def get_completion(inputs: str) -> str:
    """Send one base64-encoded image to the chat model and return its reply text.

    Args:
        inputs: Base64 text of the image. It is embedded in a
            ``data:image/jpeg;base64,...`` URL, so JPEG data is expected.

    Returns:
        The content of the first choice's message, or ``""`` when the API
        returns no text content (``message.content`` is ``None``), so the
        declared ``str`` return type always holds for downstream metrics.
    """
    # NOTE(review): the user text below is a generic question while
    # system_prompt asks for question/answer extraction; confirm that this
    # combination is intended.
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{inputs}",
        },
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "what's in this image?"},
            image_part,
        ],
    }

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            user_message,
        ],
    )

    # The SDK types message.content as optional; normalise None to an empty
    # string instead of leaking None to callers that expect text.
    content = completion.choices[0].message.content
    return content if content is not None else ""

# Call the model on the first image inside an MLflow run and print the reply.
with mlflow.start_run() as run:
    predicted = get_completion(images[0])
    print(predicted)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The image is a document titled "Maverick Specials - Progress Report" from Lorillard Tobacco, dated December 12, 1996. It is addressed from F. Strickland to K. A. Sparrow. The submission date is marked as December 13.

Sections in the document include:

- **Geography**: Indicating that the report covers the full region.
- **Distribution**: Listing direct accounts and chains within the region with 15 or more stores not stocking Maverick specials.

The table includes:

| Name of Account | No. of Stores | Name of Account | No. of Stores |
|-----------------|---------------|-----------------|---------------|
| K&B             | 180           | Sayle Oil       | 20            |
| Delchamps       | 130           | Dantzler        | 19            |
| Winn Dixie      | 55            | Southeast Foods | 18            |
| Schwegmann      | 39            | Compass Foods   | 18            |
| Autry Greer     | 36            | Bayou Foods     | 17            |
| Double Quick    | 36            | Eco

In [5]:
# Annotation loaded from the same file as images[0].
jsons[0]

{'TO:': 'K. A. Sparrow',
 'FROM:': 'F. Strickland',
 'SUBJECT:': 'MAVERICK SPECIALS- PROGRESS REPORT',
 'FULL': 'x',
 'DEC 13': 'X',
 'NAME OF ACCOUNT': 'Econ',
 'NO. OF STORES': '22',
 'NO OF STORES': '16'}

In [6]:
# `predicted` is the str returned by get_completion, while jsons[0] displays
# as a dict, so this equality is False however good the extraction is.
predicted == jsons[0]

False

## Step 2: MLflow Evaluate

In [7]:
%pip install tiktoken aiohttp -qU

Note: you may need to restart the kernel to use updated packages.


In [8]:
# LLM-judged metric: gpt-4o-mini scores each output 1 or 0 depending on
# whether it has the expected list-of-dicts shape.
correct_format = mlflow.metrics.genai.make_genai_metric(
    name="correct_format",
    definition=(
        """The answer is a list of dicts where keys are `question` and `answer`."""
    ),
    grading_prompt=(
        """If formatted correctly, return 1. Otherwise, return 0."""
    ),
    model="openai:/gpt-4o-mini",
    # The format check depends only on the model output. The "inputs" column
    # holds base64-encoded images, and by default the judge prompt would
    # include that text for every row; leave it out.
    include_input=False,
    greater_is_better=True,
)

def batch_completion(df: pd.DataFrame) -> list[str]:
    """Run ``get_completion`` on every value of ``df["inputs"]``, in row order.

    Returns a list with one reply string per row of ``df``.
    """
    return list(map(get_completion, df["inputs"]))

# One row per sampled file: the base64 image as model input and its JSON
# annotation as the target column.
eval_data = pd.DataFrame({"inputs": images, "truth": jsons})

# Evaluate batch_completion as a text model, adding the custom format metric.
eval_result = mlflow.evaluate(
    model=batch_completion,
    data=eval_data,
    targets="truth",
    model_type="text",
    extra_metrics=[correct_format],
)

2025/05/12 23:58:21 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/05/12 23:58:21 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025/05/12 23:58:57 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
100%|██████████| 5/5 [0

In [9]:
# Per-row results: inputs, truth, outputs, token_count, and the
# correct_format score with the judge's justification.
eval_result.tables['eval_results_table']

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 866.59it/s] 
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 685.90it/s] 


Unnamed: 0,inputs,truth,outputs,token_count,correct_format/v1/score,correct_format/v1/justification
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'TO:': 'K. A. Sparrow', 'FROM:': 'F. Strickla...","This image is a progress report titled ""MAVERI...",346,0,The output does not follow the required format...
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'DATE:': '8/ 10/ 90', 'MANUFACTURER:': 'B & W...","The image is a document titled ""NEW COMPETITIV...",119,0,The output does not follow the required format...
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'TO:': 'KELLI SCRUGGS', 'FROM:': 'LEONARD JON...","The image is a ""Coupon Code Registration Form""...",207,0,The output does not follow the required format...
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'Name:': 'Andy McGaan', 'No': '☑', 'Telecopie...",The image is a fax cover sheet from Hunton & W...,350,0,The output does not follow the required format...
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'TO:': 'K. A. Sparrow', 'FROM:': 'D. J. Landr...",The image is a progress report document. Here'...,235,0,The output does not match the required format ...


In [10]:
# Aggregated metric values across all evaluated rows.
print(eval_result.metrics)

{'correct_format/v1/mean': np.float64(0.0), 'correct_format/v1/variance': np.float64(0.0), 'correct_format/v1/p90': np.float64(0.0)}


## Step 3: Go to the UI 