In [None]:
from dotenv import load_dotenv
from scripts.eval_utils import AsyncEvalClient

# Load variables from a local .env file into the process environment.
# This runs before the client is constructed; presumably AsyncEvalClient reads
# its endpoint/credentials from the environment — TODO confirm in scripts/eval_utils.
load_dotenv()

# Shared async client used by the later cells to upload files and to create
# and poll evaluations.
client = AsyncEvalClient()
# The emoji was stored as mis-decoded UTF-8 ("ðŸŽ‰"); restored to the intended character.
print("🎉 Azure OpenAI Evaluation Client ready!")

# Deployment name used both as the grader model and as the model under
# evaluation in the cells below.
IMAGE_MODEL_DEPLOYMENT_NAME = "gpt-4o-mini"

In [None]:
from scripts.image_utils import load_and_create_image_dataset, display_items

# Load the Conceptual Captions dataset from Hugging Face and build the image
# dataset used for evaluation.
# NOTE(review): presumably this writes ./data/image_emotion_evaluation.jsonl,
# which the upload cell reads — confirm in scripts/image_utils.
load_and_create_image_dataset("google-research-datasets/conceptual_captions")

# Display the created evaluation file.
# NOTE(review): the argument is presumably the number of items to show — confirm.
display_items(3)


In [None]:
# Upload the evaluation file to Azure OpenAI
eval_file_id = await client.upload_file(
    file_name="image_emotion_evaluation.jsonl",
    file_path="./data/image_emotion_evaluation.jsonl")
print(f"âœ… Eval file ID: {eval_file_id}")

In [None]:
score_model = {
      "type": "score_model",
      "name": "Image to Text Grader",
      "model": IMAGE_MODEL_DEPLOYMENT_NAME,
      "input": [
        {
          "role": "system",
          "content": "You are an expert grader. Judge how well the model response {{sample.output_text}} describes the image as well as matches the caption {{item.caption}}. Output a score of 1 if its an excelent match with both. If it's somewhat compatible, output a score around 0.5. Otherwise, give a score of 0."
        },
        {
            "role": "user",
            "content": [
                { 
                    "type": "input_text", 
                    "text": "Caption: {{ item.caption }}"
                },
                { 
                    "type": "input_image", 
                    "image_url": "{{ item.image_url }}"
                }
            ]
        }
      ],
      "range": [
        0,
        1
      ],
      "pass_threshold": 0.5
    }

eval_id = await client.create_eval_sdk(
    name="Image Caption Evaluation",
    testing_criteria=[score_model],
    data_source_config={
    "type": "custom",
    "item_schema": {
      "type": "object",
      "properties": {
        "image_url": {
          "type": "string",
          "description": "The URL of the image to be evaluated."
        },
        "caption": {
          "type": "string",
          "description": "The caption describing the image."
        }
      },
      "required": [
        "image_url",
        "caption"
      ]
    },
    "include_sample_schema": True,
  })

In [None]:
data_source = {
    "type": "completions",
    "model": IMAGE_MODEL_DEPLOYMENT_NAME,
    "sampling_params": {
      "temperature": 0.8
    },
    "source": {
      "type": "file_id",
      "id": eval_file_id
    },
    "input_messages": {
      "type": "template",
      "template": [
        {
          "role": "system",
          "content": "You are an assistant that analyzes images and provides captions that accurately describe the content of the image."
        },
        {
          "role": "user",
          "type": "message",
          "content": {
              "type": "input_image",
              "image_url": "{{ item.image_url }}",
              "detail": "auto"
          }
        }
      ]
    }
}

run = await client.create_eval_run_sdk(eval_id, "Image Caption Evaluation", data_source)
run_id = run['id']


In [None]:
import asyncio
import pandas as pd

while True:
    run = await client.get_eval_run_sdk(eval_id=eval_id, run_id=run_id)
    if run['status'] == "completed":
        output_items_response = await client.get_eval_run_output_items_sdk(
            eval_id=eval_id, run_id=run_id)

        # Get the actual list of items from the response object
        output_items = output_items_response.data if hasattr(output_items_response, 'data') else output_items_response
        
        print("Sample output item:")
        print(output_items[0])

        # Create DataFrame with safe access to nested fields
        df_data = {
            "id": [],
            "grading_results": [],
            "expected_caption": [],
            "model_response": []
        }

        for item in output_items:
            # Convert Pydantic model to dict if needed
            item_dict = item.model_dump() if hasattr(item, 'model_dump') else item
            
            df_data["id"].append(item_dict.get("id", "N/A"))
            df_data["grading_results"].append(item_dict.get("status", "N/A"))
            
            # Safely get expected caption
            datasource_item = item_dict.get('datasource_item', {})
            df_data["expected_caption"].append(datasource_item.get("caption", "N/A"))
            
            # Check if audio output exists
            sample = item_dict.get("sample", {})
            output = sample.get("output", {})
            output_transcript = output[0].get("content")
            df_data["model_response"].append(output_transcript)
        
        df = pd.DataFrame(df_data)
        display(df)
        break
    if run['status'] == "failed":
        print("Evaluation run failed:")
        print(run.get('error', 'Unknown error'))
        break
    print(f"Status: {run['status']}. Waiting...")
    await asyncio.sleep(5)
