In [1]:
%pip install -r requirements.txt -qU

Note: you may need to restart the kernel to use updated packages.


In [12]:
import mlflow
import os
from getpass import getpass
import pandas as pd
from openai import OpenAI
from utils import get_random_files, get_image, get_json, _set_openai_api_key_for_demo

# Store MLflow tracking data in an "mlruns" folder under the current working directory
mlflow.set_tracking_uri(f"{os.getcwd()}/mlruns")
mlflow.set_experiment("evaluation")

# Prompt for the key (hidden input) only when the demo helper returns a falsy
# value and OPENAI_API_KEY is unset or empty in the environment.
if not (_set_openai_api_key_for_demo() or os.getenv("OPENAI_API_KEY")):
    os.environ["OPENAI_API_KEY"] = getpass("Your OpenAI API Key: ")

## Step 1: Determine Accuracy

In [3]:
# Ask the helper for 5 files, then load each one in two forms: the image encoded
# as a string (sent to the model) and the result of get_json (displayed later as
# the reference for that file).
_files = get_random_files(n=5)
images = [get_image(path, encode_as_str=True) for path in _files]
jsons = [get_json(path) for path in _files]

# Enable MLflow's OpenAI autologging, then create the client used for all requests
mlflow.openai.autolog()
client = OpenAI()

In [30]:
system_prompt = """You are an expert at Optical Character Recognition (OCR). Extract the questions and answers from the image."""

def get_completion(
    inputs: str,
    model: str = "gpt-4o",
    user_prompt: str = "what's in this image?",
) -> str:
    """Send one base64-encoded image to the chat completions API and return the reply text.

    The request carries the module-level ``system_prompt`` as the system message
    and a user message made of ``user_prompt`` plus the image as a data URL.

    Args:
        inputs: Base64 payload of the image. It is embedded verbatim in a
            ``data:image/jpeg;base64,...`` URL.
        model: Name of the chat model to call. Defaults to ``"gpt-4o"``.
        user_prompt: Text part of the user message. Defaults to
            ``"what's in this image?"``.

    Returns:
        The text content of the first choice, or an empty string when the
        response carries no text content.
    """
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{inputs}",
        },
    }
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    image_part,
                ],
            },
        ],
    )

    # The SDK types `message.content` as optional; normalise a missing value to ""
    # so the declared `str` return type holds for downstream consumers.
    content = completion.choices[0].message.content
    return content if content is not None else ""

# Try the prompt on the first sample inside its own MLflow run
with mlflow.start_run() as run:
    predicted = get_completion(images[0])

print(predicted)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The image is a "Corrected Form" from the Commonwealth of Massachusetts, Office of Campaign and Political Finance. It's a Report of Corporate Treasurer (Form CPF 22), formerly CPF 10.

Here are some key details from the document:

- **Name of Treasurer:** M. Alfred Peterson; Assistant Treasurer: Peter J. Marzullo
- **Name of Corporation:** Lorillard Tobacco Company
- **Address of Corporation:** One Park Avenue, New York, NY 10016-5895
- **Question No. 1:** Related to Tobacco Excise Tax Increase
- **Submitted to the voters on:** November 3, 1992, in Massachusetts
- **Reporting Period:** October 16, 1992, to November 5, 1992
- **Signed by:** Peter J. Marzullo, Assistant Treasurer, dated March 30, 1993

**Expenditures or Disbursements:**
- **10/26/92:** Committee Against Unfair Taxes, P.O. Box 5979, Boston, MA 02114, for opposing Tax Increase, Amount: $28,687.00
- **10/30/92:** Committee Against Unfair Taxes, P.O. Box 5979, Boston, MA 02114, for opposing Tax Increase, Amount: $19,603.00

*

In [31]:
# Reference data loaded for the same file whose image was sent to the model above
jsons[0]

{'*Note:': 'If this expenditure is made to influence a local ballot question, a copy of this form should be filed with the city or town clerk or election commission.',
 '19': '92',
 'Address': 'P. O. Box 5979. Boston, MA 02114',
 'Date': 'March 30, 1993',
 'Purpose': 'Oppose Tax increas',
 'Final': '☑',
 'File with:': 'Director, Office of Campaign & Political Finance One Ashburton Place, Boston, MA 02108',
 '1. Name of Treasurer': 'M. Alfred Peterson; Assistant Treasurer Peter J. Marzullo',
 '2. Name of Corporation': 'Lorillard Tobacco Company',
 '3. Address of Corporation': 'One Park Avenue New York. NY 10016 -5895',
 '4. Question No.': '1 relating to Tobacco Excise Tax Increase',
 'submitted to the voters on': '1992',
 'on the': 'Massachusetts',
 'Reporting Period Beginning': 'October 16,',
 'and Ending': 'November 5. 1992',
 'Signed under the penalties of perjury.': '',
 'Date Paid': '10/ 30/ 92',
 'To Whom Paid (Alphabetical Listing Mandatory)': 'Committee Against Unfair Taxes',
 '

In [32]:
# Naive exact-match check: `predicted` is the model's reply text while `jsons[0]`
# displays as a dict, so this comparison evaluates to False.
predicted == jsons[0]

False

## Step 2: MLflow Evaluate

In [35]:
%pip install tiktoken aiohttp -qU

Note: you may need to restart the kernel to use updated packages.


In [39]:
from mlflow.metrics.genai import answer_correctness

# LLM-judged metric: the judge model is asked to return 1 when the output is a
# list of dicts keyed by `question`/`answer`, and 0 otherwise.
correct_format = mlflow.metrics.genai.make_genai_metric(
    name="correct_format",
    definition="The answer is a list of dicts where keys are `question` and `answer`.",
    grading_prompt="If formatted correctly, return 1. Otherwise, return 0.",
    model="openai:/gpt-4o-mini",
    greater_is_better=True,
)

def batch_completion(df: pd.DataFrame) -> list[str]:
    """Call ``get_completion`` on every value of the ``inputs`` column, in row order.

    Args:
        df: Frame with an ``inputs`` column; each value is passed to
            ``get_completion`` unchanged.

    Returns:
        One result per row, in the same order as the rows.
    """
    replies = []
    for encoded_image in df["inputs"]:
        replies.append(get_completion(encoded_image))
    return replies

# Evaluation frame: `inputs` holds the encoded images passed to the model,
# `truth` holds the reference data used as the targets column.
eval_data = pd.DataFrame({"inputs": images, "truth": jsons})

eval_result = mlflow.evaluate(
    model=batch_completion,
    data=eval_data,
    targets="truth",
    model_type="text",
    extra_metrics=[correct_format],
)

2025/05/12 23:22:37 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025/05/12 23:23:15 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:04<00:00,  4.69s/it]
100%|██████████| 5/5 [00:04<00:00,  1.20it/s]


In [40]:
# Per-row results: inputs, truth, model outputs and the metric columns
eval_result.tables['eval_results_table']

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1131.46it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 807.06it/s] 


Unnamed: 0,inputs,truth,outputs,token_count,correct_format/v1/score,correct_format/v1/justification
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,{'*Note:': 'If this expenditure is made to inf...,"This image is a form titled ""REPORT OF CORPORA...",386,0,The output does not match the required format ...
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'Name:': 'Andy McGaan', 'No': '☑', 'Telecopie...",The image is a fax cover sheet from Hunton & W...,87,0,The output does not match the required format ...
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,{},"The image is a ""Cigarette Report Form"" for col...",142,0,The output does not follow the required format...
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'TO:': 'Sam Zolot', 'FROM:': 'D. J. Landro', ...","The image is a ""Competitive Product Introducti...",226,0,The output does not follow the required format...
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'': 'J. D. Ergle and R. F. Dufresne', 'DATE':...","The image is a document titled ""Decision Tree ...",149,0,The output does not match the required format ...


In [41]:
# Aggregate metric values for the whole evaluation
print(eval_result.metrics)

{'correct_format/v1/mean': np.float64(0.0), 'correct_format/v1/variance': np.float64(0.0), 'correct_format/v1/p90': np.float64(0.0)}


## Step 3: Go to the UI