In [24]:
# Install the notebook's dependencies from requirements.txt into the kernel's
# environment (-q: quiet output, -U: upgrade packages that are already installed).
%pip install -r requirements.txt -qU

Note: you may need to restart the kernel to use updated packages.


In [27]:
import mlflow
import os
from getpass import getpass
import pandas as pd
from openai import OpenAI
from utils import get_random_files, get_image, get_json, _set_openai_api_key_for_demo

# Point MLflow tracking at an "mlruns" directory under the current working
# directory, then select the experiment this notebook logs to (the recorded
# log output shows it being created when it does not exist yet).
mlflow.set_tracking_uri(os.getcwd() + "/mlruns")
mlflow.set_experiment("evaluation_with_prompt_registry")

# Get API key securely
# Prompt interactively only when the demo helper returns a falsy value AND
# OPENAI_API_KEY is not already set in the environment.
# NOTE(review): presumably _set_openai_api_key_for_demo() sets the key itself
# and returns a truthy value on success - confirm in utils.
if (not _set_openai_api_key_for_demo()) and (not os.getenv("OPENAI_API_KEY")):
    os.environ["OPENAI_API_KEY"] = getpass("Your OpenAI API Key: ")

2025/05/12 23:51:52 INFO mlflow.tracking.fluent: Experiment with name 'evaluation_with_prompt_registry' does not exist. Creating a new experiment.


In [28]:
# Pick files with get_random_files(n=5) and load, for each one, the image and
# the JSON that is later passed to the evaluation as the "truth" column.
# NOTE(review): assumes get_image(..., encode_as_str=True) returns base64 text,
# since these strings are later embedded in a base64 data URL - confirm in utils.
_files = get_random_files(n=5)
images = [get_image(file, encode_as_str=True) for file in _files]
jsons = [get_json(file) for file in _files]

# Enable MLflow's OpenAI autologging before creating the client that every
# completion call below goes through.
# NOTE(review): OpenAI() is built without arguments; presumably it picks up
# OPENAI_API_KEY from the environment set in the setup cell - confirm.
mlflow.openai.autolog()
client = OpenAI()

## Prompt Engineer: Improve the Prompt

In [29]:
import mlflow

# System-prompt template for the OCR extraction task. `{{ additional_rules }}`
# is the only template variable; it is filled in at call time via
# `prompt.format(additional_rules=...)`.
new_template = """\
You are an expert at key information extraction and OCR.

Format as a list of dictionaries as shown below. The keys should only be `question` and `answer`. 

```
[
    {
        "question": "question field",
        "answer": "answer to question field"

    },
...
]
```

Question refers to a field in the form that takes in information. Answer refers to the information 
that is filled in the field.

Follow these rules:
- Only use the information present in the text.
{{ additional_rules }}
"""

# Register the template under the name "ocr-question-answer". If the prompt
# does not exist yet this creates it (the recorded output shows version=1);
# if it already exists, a new version is added instead.
updated_prompt = mlflow.register_prompt(
    name="ocr-question-answer",
    template=new_template,
    version_metadata={
        "author": "author@example.com",
    },
)

# Bare expression: display the registered prompt (name, version, template preview).
updated_prompt

Prompt(name=ocr-question-answer, version=1, template=You are an expert at key infor...)

## ML Engineer: Use the Prompt

In [30]:
# Load the prompt from the registry by URI. The "latest" suffix means this cell
# follows whichever version was registered most recently rather than a pinned one.
prompt = mlflow.load_prompt("prompts:/ocr-question-answer/latest")
# Bare expression: display the loaded prompt to confirm which version was resolved.
prompt

Prompt(name=ocr-question-answer, version=1, template=You are an expert at key infor...)

In [31]:
def get_completion(inputs: str, model: str = "gpt-4o") -> str:
    """Send one form image to an OpenAI chat model and return its text reply.

    Relies on two notebook-level names defined in earlier cells: ``client``
    (the OpenAI client) and ``prompt`` (the prompt loaded from the registry).

    Args:
        inputs: Base64-encoded JPEG data without any ``data:`` prefix; it is
            embedded in a ``data:image/jpeg;base64,...`` URL.
        model: Name of the chat model to call. Defaults to ``"gpt-4o"``.

    Returns:
        The text content of the first choice, or ``""`` when the response
        carries no text content, so the declared ``str`` return type always holds.
    """
    # Fill the registry prompt's template variable to build the system message.
    system_prompt = prompt.format(
        additional_rules="Use exact formatting you see in the form."
    )

    # NOTE(review): the user text below is a generic question; several recorded
    # outputs start with a prose description of the image, so a task-specific
    # instruction may yield more consistent formatting - confirm before changing.
    user_content = [
        {"type": "text", "text": "what's in this image?"},
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{inputs}",
            },
        },
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
    )

    # `content` can be None (for example when the model returns no text);
    # normalise that to an empty string instead of leaking None to callers.
    content = completion.choices[0].message.content
    return content if content is not None else ""

# Smoke-test the completion on the first sampled image inside an MLflow run,
# and print the raw model reply so its format can be inspected by eye.
with mlflow.start_run() as run:
    predicted = get_completion(images[0])
    print(predicted)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


```json
[
    {
        "question": "CASE NAME",
        "answer": "Donald D. Sellers and Robin J. Sellers v. Raybestos-Manhattan, et al."
    },
    {
        "question": "COURT",
        "answer": "San Francisco Superior Court - No. 996382"
    },
    {
        "question": "LORILLARD ENTITIES",
        "answer": "Lorillard Tobacco Company"
    },
    {
        "question": "DATE FILED",
        "answer": ""
    },
    {
        "question": "DATE SERVED",
        "answer": "August 3, 1998"
    },
    {
        "question": "CASE TYPE",
        "answer": "Asbestos"
    },
    {
        "question": "PLAINTIFF'S COUNSEL",
        "answer": "Wartnick, Chaber, Harowitz, Smith & Tigerman\nStephen M. Tigerman\n101 California Street, Suite 2200\nSan Francisco, California 94111\n415/986-5566"
    },
    {
        "question": "LORILLARD COUNSEL",
        "answer": ""
    },
    {
        "question": "JUDGE",
        "answer": ""
    },
    {
        "question": "TRIAL DATE",
        "answer": ""


### Format

In [32]:
# LLM-judged metric: asks "openai:/gpt-4o-mini" to return 1 when the answer is
# a list of dicts keyed by `question` and `answer`, and 0 otherwise.
# NOTE(review): in the recorded results every row scored 1, including outputs
# that begin with a prose description or are wrapped in a ```json fence. If a
# strict, parseable format is required, consider tightening the definition or
# using a deterministic JSON-parsing check instead - confirm the intent.
correct_format = mlflow.metrics.genai.make_genai_metric(
    name="correct_format",
    definition=(
        """The answer is a list of dicts where keys are `question` and `answer`."""
    ),
    grading_prompt=(
        """If formatted correctly, return 1. Otherwise, return 0."""
    ),
    model="openai:/gpt-4o-mini",
    greater_is_better=True,
)

def batch_completion(df: pd.DataFrame) -> list[str]:
    """Run ``get_completion`` on every entry of ``df["inputs"]``, in row order.

    Args:
        df: Frame with an ``inputs`` column holding one image payload per row.

    Returns:
        One completion string per row, in the same order as the rows.
    """
    completions = []
    for encoded_image in df["inputs"]:
        completions.append(get_completion(encoded_image))
    return completions

# Evaluate batch_completion over all sampled images. The frame pairs each image
# ("inputs") with its JSON ("truth"), which is passed as the targets column.
# correct_format is added through extra_metrics on top of whatever the "text"
# model type computes by default.
eval_result = mlflow.evaluate(
    model=batch_completion,
    data=pd.DataFrame({"inputs": images, "truth": jsons}),
    targets="truth",
    model_type="text",
    extra_metrics=[correct_format],
)

2025/05/12 23:53:28 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025/05/12 23:54:09 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:01<00:00,  1.75s/it]
100%|██████████| 5/5 [00:03<00:00,  1.62it/s]


In [33]:
# Per-row results: inputs, truth, model outputs, and the metric's score and justification.
eval_result.tables['eval_results_table']

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 585.14it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 558.94it/s]


Unnamed: 0,inputs,truth,outputs,token_count,correct_format/v1/score,correct_format/v1/justification
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,{'COURT:': 'San Francisco Superior Court- No. ...,The image is a case form document. Here are th...,302,1,The output is correctly formatted as a list of...
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'': 'J. D. Ergle and R. F. Dufresne', 'DATE':...","The image contains a form titled ""DECISION TRE...",193,1,The output is correctly formatted as a list of...
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'DATE:': '8/ 10/ 90', 'MANUFACTURER:': 'B & W...","```json\n[\n {\n ""question"": ""REPORT...",268,1,The output is correctly formatted as a list of...
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'BRAND': 'STYLE SLIM MEN. LT. 100's', 'NOTE:'...","The image is a ""Direct Account Status Report"" ...",509,1,The output is correctly formatted as a list of...
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDABQODxIPDRQSEB...,"{'TO:': 'George Baroody', 'DATE:': '12 /10 /98...","```json\n[\n {\n ""question"": ""TO"",\n...",178,1,The output is correctly formatted as a list of...


In [34]:
# Aggregate metric values computed across all evaluated rows.
print(eval_result.metrics)

{'correct_format/v1/mean': np.float64(1.0), 'correct_format/v1/variance': np.float64(0.0), 'correct_format/v1/p90': np.float64(1.0)}
