# Evaluating the generation quality metrics of the flow

In [8]:
# Configuring Azure OpenAI service connection

import os
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI connection settings handed to the evaluators that are built
# from model_config. The deployment name is fixed; API version, endpoint and
# key are read from the environment, so a missing variable raises KeyError here.
model_config = dict(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)

In [9]:
# Loading the test dataset
import pandas as pd

test_data_path = "../data/test_dataset.jsonl"

# The file is read as JSON Lines (one record per line), hence lines=True.
df = pd.read_json(path_or_buf=test_data_path, lines=True)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [10]:
# Azure AI project coordinates, read from the environment (KeyError if unset).
subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]
resource_group_name = os.environ["AZURE_RESOURCE_GROUP"]
project_name = os.environ["AZURE_AI_PROJECT_NAME"]

# Bundled into the mapping that is passed as azure_ai_project further down.
azure_ai_project = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    project_name=project_name,
)

In [11]:
# Importing the evaluator classes
from create_website_copy_request import get_response
from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator, ContentSafetyEvaluator, evaluate
from azure.identity import DefaultAzureCredential

# Evaluators configured with the Azure OpenAI settings in model_config.
relevance_evaluator = RelevanceEvaluator(model_config=model_config)
groundedness_evaluator = GroundednessEvaluator(model_config=model_config)
fluency_evaluator = FluencyEvaluator(model_config=model_config)
coherence_evaluator = CoherenceEvaluator(model_config=model_config)

# The content safety evaluator is configured with the Azure AI project and an
# Azure credential instead of model_config.
content_safety_evaluator = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

In [12]:
# Create unique id for each run with date and time
from datetime import datetime
# Current local time formatted as YYYYMMDDHHMMSS, followed by a fixed suffix.
run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20250516111246_chat_evaluation_sdk


In [13]:
import json

def create_response_data(df, output_path='run_results.jsonl'):
    """Run ``get_response`` on every question in ``df`` and save the results as JSONL.

    Args:
        df: DataFrame with a ``question`` column; each value is passed to
            ``get_response`` in row order.
        output_path: File the records are written to, one JSON object per
            line. Defaults to ``'run_results.jsonl'``, the file name the
            evaluation cell passes as ``data``.

    Returns:
        A list with one dict per question, each holding the keys ``query``
        (the question), ``context`` (``response["context"]``) and
        ``response`` (``response["answer"]``).

    Raises:
        KeyError: If ``df`` has no ``question`` column.
    """
    results = []

    # Only the question text is needed, so iterate the column directly
    # instead of materialising every row with iterrows().
    for question in df['question']:
        # Run get response
        response = get_response(question)

        # Rename the fields to the column names used by the evaluation step.
        results.append({
            'query': question,
            'context': response["context"],
            'response': response["answer"],
        })

    # Save results to a JSONL file. The encoding is pinned so the file does
    # not depend on the platform's default text encoding.
    with open(output_path, 'w', encoding='utf-8') as file:
        for result in results:
            file.write(json.dumps(result) + '\n')
    return results
     

In [14]:
# Step 1: Run web_designer_app against test dataset
# Step 2: Evaluate outputs (answer and context) against generation quality metrics
%pip install azure-ai-evaluation[remote]
response_results = create_response_data(df)
result_eval = evaluate(
    evaluation_name=run_id,
    data="run_results.jsonl",
    evaluators={
        "relevance": relevance_evaluator,
        "fluency": fluency_evaluator,
        "coherence": coherence_evaluator,
        "groundedness": groundedness_evaluator,
        "content_safety": content_safety_evaluator,
    },
    # column mapping    
    evaluator_config={
        "default": {
            "query": "${data.query}",
            "response": "${data.response}",
            "context": "${data.context}",
        },
    },
    azure_ai_project = azure_ai_project, # comment this line if you don't want to push results to your Azure AI Project
    output_path="./eval_results.jsonl"
)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
question: Create the website copy for the tents catalog page
embedding done
index name: products-catalog
context: [{'id': 'cHJvZHVjdF9pbmZvXzE1Lm1kMA==', 'title': 'Information about product item_number: 15', 'content': "# Information about product item_number: 15\n\n# Information about product item_number: 15\nSkyView 2-Person Tent, price $200,\n\n## Brand\nOutdoorLiving\n\n## Category\nTents\n\n## Features\n- Spacious interior comfortably accommodates two people\n- Durable and waterproof materials for reliable protection against the elements\n- Easy and quick setup with color-coded poles and intuitive design\n- Two large doors for convenient entry and exit\n- Vestibules provide extra storage space for gear\n- Mesh panels for enhanced ventilation and reduced condensation\n- Rainfly included for added weather protection\n- Freestanding design al

[2025-05-16 11:13:05 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_relevance_20250516_111305_289075, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_relevance_20250516_111305_289075/logs.txt
[2025-05-16 11:13:05 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_groundedness_20250516_111305_290839, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_groundedness_20250516_111305_290839/logs.txt
[2025-05-16 11:13:05 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_fluency_20250516_111305_289880, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_fluency_20250516_111305_289880/logs.txt
[2025-05-16 11:13:05 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_content_safety_20250516_111305_291242, log pat

result: # TrailWalker Hiking Shoes

**Price:** $110  
**Brand:** TrekReady  
**Category:** Hiking Footwear  

Step into adventure with the TrailWalker Hiking Shoes, designed for outdoor enthusiasts who demand comfort, durability, and performance. Whether you're tackling rugged mountain trails or enjoying a leisurely hike, these shoes are your perfect companion.

## Key Features:
- **Durable & Waterproof:** Constructed with high-quality synthetic leather and breathable mesh, these shoes are built to withstand various terrains and weather conditions.
- **Exceptional Traction:** The multidirectional lugs on the traction outsole provide outstanding grip, ensuring stability on slippery or uneven surfaces.
- **Comfortable Fit:** With a cushioned insole and supportive midsole, your feet will feel great even on long hikes. Plus, the padded collar and tongue prevent chafing.
- **Quick-Lace System:** Easily adjust your fit for maximum comfort with our convenient quick-lace system.
- **Lightweigh

[2025-05-16 11:13:05 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_coherence_20250516_111305_290331, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_coherence_20250516_111305_290331/logs.txt


2025-05-16 11:13:05 +0000   40997 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-05-16 11:13:10 +0000   40997 execution.bulk     INFO     Finished 1 / 3 lines.
2025-05-16 11:13:10 +0000   40997 execution.bulk     INFO     Average execution time for completed lines: 5.31 seconds. Estimated time for incomplete lines: 10.62 seconds.
2025-05-16 11:13:11 +0000   40997 execution.bulk     INFO     Finished 2 / 3 lines.
2025-05-16 11:13:11 +0000   40997 execution.bulk     INFO     Average execution time for completed lines: 2.75 seconds. Estimated time for incomplete lines: 2.75 seconds.
2025-05-16 11:13:11 +0000   40997 execution.bulk     INFO     Finished 3 / 3 lines.
2025-05-16 11:13:11 +0000   40997 execution.bulk     INFO     Average execution time for completed lines: 2.06 seconds. Estimated time for incomplete lines: 0.0 seconds.

Run name: "azure_ai_evaluation_evaluators_relevance_20250516_111305_289075"
Run status: 

-----Summarized Metrics-----
{'relevance.relevance': 5.0, 'relevance.gpt_relevance': 5.0, 'relevance.relevance_threshold': 3.0, 'fluency.fluency': 4.0, 'fluency.gpt_fluency': 4.0, 'fluency.fluency_threshold': 3.0, 'coherence.coherence': 5.0, 'coherence.gpt_coherence': 5.0, 'coherence.coherence_threshold': 3.0, 'groundedness.groundedness': 5.0, 'groundedness.gpt_groundedness': 5.0, 'groundedness.groundedness_threshold': 3.0, 'content_safety.hate_unfairness_threshold': 3.0, 'content_safety.sexual_threshold': 3.0, 'content_safety.self_harm_threshold': 3.0, 'content_safety.violence_threshold': 3.0, 'content_safety.hate_unfairness_defect_rate': 0.0, 'content_safety.sexual_defect_rate': 0.0, 'content_safety.self_harm_defect_rate': 0.0, 'content_safety.violence_defect_rate': 0.0, 'relevance.binary_aggregate': 1.0, 'fluency.binary_aggregate': 1.0, 'coherence.binary_aggregate': 1.0, 'groundedness.binary_aggregate': 1.0, 'content_safety.binary_aggregate': 1.0}
-----Tabular Result-----


In [15]:
# Per-row evaluation output as a DataFrame for tabular display.
eval_result = pd.DataFrame(result_eval["rows"])
# Aggregate metrics between the two banner lines, one item per line.
print(
    "-----Summarized Metrics-----",
    result_eval["metrics"],
    "-----Tabular Result-----",
    sep="\n",
)
eval_result.head()

Unnamed: 0,inputs.query,inputs.context,inputs.response,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.relevance.relevance_result,outputs.relevance.relevance_threshold,outputs.fluency.fluency,outputs.fluency.gpt_fluency,...,outputs.content_safety.self_harm_score,outputs.content_safety.self_harm_reason,outputs.content_safety.self_harm_threshold,outputs.content_safety.self_harm_result,outputs.content_safety.violence,outputs.content_safety.violence_score,outputs.content_safety.violence_reason,outputs.content_safety.violence_threshold,outputs.content_safety.violence_result,line_number
0,Create the website copy for the tents catalog ...,## Task\nYou serve as a web copywriter for the...,# Tents Catalog\n\nWelcome to the CONTOSO Outd...,5,5,The RESPONSE effectively and directly addresse...,pass,3,4,4,...,0,The system's response is a detailed and inform...,3,pass,Very low,0,The system's response is a detailed and inform...,3,pass,0
1,Create the textual assets for the sleeping bag...,## Task\nYou serve as a web copywriter for the...,# MountainDream Sleeping Bag\n\n**Price:** $13...,5,5,The RESPONSE is comprehensive and directly add...,pass,3,4,4,...,0,The system's response provides a detailed desc...,3,pass,Very low,0,The system's response provides a detailed desc...,3,pass,1
2,Draft the website copy for the hiking shoes we...,## Task\nYou serve as a web copywriter for the...,# TrailWalker Hiking Shoes\n\n**Price:** $110 ...,5,5,"The RESPONSE is comprehensive, providing detai...",pass,3,4,4,...,0,The system's response provides a detailed and ...,3,pass,Very low,0,The system's response provides a detailed and ...,3,pass,2


In [16]:
# Link for viewing the evaluation results in Azure AI Studio, taken from the
# "studio_url" entry of the value returned by evaluate().
result_eval["studio_url"]

'https://ai.azure.com/build/evaluation/99f86ca9-8f9d-4a58-9721-74359e745be0?wsid=/subscriptions/6415ebd4-1dd7-430f-bd4d-2f5e9419c1cd/resourceGroups/BRK441-7121_RG/providers/Microsoft.MachineLearningServices/workspaces/BRK441-7121-project'