# Evaluating the generation quality metrics of the flow

In [1]:
# Configuring Azure OpenAI service connection

import os
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI connection settings; the deployment name is fixed, everything
# else is read from environment variables (a missing one raises KeyError).
model_config = dict(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)

In [2]:
# Loading the test dataset
import pandas as pd

# JSON Lines file: one JSON record per line
test_data_path = "../data/test_dataset.jsonl"

# Parse the file into a DataFrame and preview the first rows
df = pd.read_json(path_or_buf=test_data_path, lines=True)
df.head()

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [3]:
# Importing the evaluator classes
from create_website_copy_request import get_response
from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator, evaluate

# Instantiate one evaluator per metric, each built from the shared model_config.
relevance_evaluator, groundedness_evaluator, fluency_evaluator, coherence_evaluator = (
    evaluator_cls(model_config)
    for evaluator_cls in (
        RelevanceEvaluator,
        GroundednessEvaluator,
        FluencyEvaluator,
        CoherenceEvaluator,
    )
)

[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.


In [4]:
# Create unique id for each run with date and time
from datetime import datetime
# Current local time as YYYYMMDDHHMMSS, followed by a fixed suffix naming the run
run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20250506151528_chat_evaluation_sdk


In [5]:
# Identify the Azure AI project from environment variables
# (a missing variable raises KeyError).
subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]
resource_group_name = os.environ["AZURE_RESOURCE_GROUP"]
project_name = os.environ["AZURE_AI_PROJECT_NAME"]

# Project descriptor passed to evaluate() via its azure_ai_project argument
azure_ai_project = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    project_name=project_name,
)

In [6]:
import json

def create_response_data(df, output_path='run_results.jsonl'):
    """Run the flow on every question in ``df`` and save the outputs as JSONL.

    Args:
        df: DataFrame with a ``question`` column; each value is passed to
            ``get_response``.
        output_path: Where to write the JSONL file (one JSON object per
            line). Defaults to ``run_results.jsonl`` in the working directory.
            An existing file at that path is overwritten.

    Returns:
        A list with one dict per question, holding the keys ``query``,
        ``context`` and ``response``.

    Raises:
        KeyError: If ``df`` has no ``question`` column, or a response lacks
            the ``context`` or ``answer`` key.
    """
    results = []

    # Only the question column is needed, so iterate it directly instead of
    # materialising a full row for every record.
    for question in df['question']:
        # Run get response
        response = get_response(question)

        # Rename the flow's fields to the query/context/response names
        # stored in the output file.
        results.append({
            'query': question,
            'context': response["context"],
            'response': response["answer"]
        })

    # Save results to a JSONL file; the explicit encoding keeps the output
    # independent of the platform's default text encoding.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(result) + '\n' for result in results)
    return results
     

In [7]:
# Step 1: Run web_designer_app against test dataset
# Step 2: Evaluate outputs (answer and context) against generation quality metrics
%pip install azure-ai-evaluation[remote]
response_results = create_response_data(df)
result_eval = evaluate(
    evaluation_name=run_id,
    data="run_results.jsonl",
    evaluators={
        "relevance": relevance_evaluator,
        "fluency": fluency_evaluator,
        "coherence": coherence_evaluator,
        "groundedness": groundedness_evaluator,
    },
    # column mapping    
    evaluator_config={
        "default": {
            "query": "${data.query}",
            "response": "${data.response}",
            "context": "${data.context}",
        },
    },
    azure_ai_project = azure_ai_project, # comment this line if you don't want to push results to your Azure AI Project
    output_path="./eval_results.jsonl"
)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
question: Create the website copy for the tents catalog page
embedding done
index name: products-catalog
context: [{'id': 'cHJvZHVjdF9pbmZvXzE1Lm1kMA==', 'title': 'Information about product item_number: 15', 'content': "# Information about product item_number: 15\n\n# Information about product item_number: 15\nSkyView 2-Person Tent, price $200,\n\n## Brand\nOutdoorLiving\n\n## Category\nTents\n\n## Features\n- Spacious interior comfortably accommodates two people\n- Durable and waterproof materials for reliable protection against the elements\n- Easy and quick setup with color-coded poles and intuitive design\n- Two large doors for convenient entry and exit\n- Vestibules provide extra storage space for gear\n- Mesh panels for enhanced ventilation and reduced condensation\n- Rainfly included for added weather protection\n- Freestanding design al

[2025-05-06 15:15:52 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_relevance_20250506_151552_027315, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_relevance_20250506_151552_027315/logs.txt
[2025-05-06 15:15:52 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_coherence_20250506_151552_028578, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_coherence_20250506_151552_028578/logs.txt
[2025-05-06 15:15:52 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_fluency_20250506_151552_028047, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_fluency_20250506_151552_028047/logs.txt
[2025-05-06 15:15:52 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_groundedness_20250506_151552_029751, log path: /home

result: **TrailWalker Hiking Shoes – Adventure Awaits!**

Step into the great outdoors with confidence in the TrailWalker Hiking Shoes from CONTOSO. Designed by TrekReady, these premium hiking shoes combine durability, comfort, and performance to elevate your outdoor adventures. Whether you're tackling rugged trails or exploring serene landscapes, TrailWalker has you covered.

### Key Features:
- **Durable & Waterproof**: Built to withstand tough terrains and unpredictable weather.
- **Exceptional Grip**: Traction outsole with multidirectional lugs for stability on any surface.
- **All-Day Comfort**: Cushioned insole, supportive midsole, and padded collar ensure a comfortable fit for long hikes.
- **Breathable Design**: Synthetic leather and mesh keep your feet cool and dry.
- **Lightweight Construction**: Reduce fatigue and hike farther with ease.
- **Quick-Lace System**: Adjust your fit effortlessly for a secure and snug feel.
- **Enhanced Protection**: Reinforced toe cap, heel, and 

-----Summarized Metrics-----
{'relevance.relevance': 4.666666666666667, 'relevance.gpt_relevance': 4.666666666666667, 'relevance.relevance_threshold': 3.0, 'fluency.fluency': 4.0, 'fluency.gpt_fluency': 4.0, 'fluency.fluency_threshold': 3.0, 'coherence.coherence': 5.0, 'coherence.gpt_coherence': 5.0, 'coherence.coherence_threshold': 3.0, 'groundedness.groundedness': 5.0, 'groundedness.gpt_groundedness': 5.0, 'groundedness.groundedness_threshold': 3.0, 'relevance.binary_aggregate': 1.0, 'fluency.binary_aggregate': 1.0, 'coherence.binary_aggregate': 1.0, 'groundedness.binary_aggregate': 1.0}
-----Tabular Result-----


In [8]:
# Row-level evaluator outputs as a table
eval_result = pd.DataFrame(data=result_eval["rows"])

# Print the aggregate metrics between two banners, then show the first table
# rows as the cell's output.
print(
    "-----Summarized Metrics-----",
    result_eval["metrics"],
    "-----Tabular Result-----",
    sep="\n",
)
eval_result.head()

Unnamed: 0,inputs.query,inputs.context,inputs.response,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.relevance.relevance_result,outputs.relevance.relevance_threshold,outputs.fluency.fluency,outputs.fluency.gpt_fluency,...,outputs.coherence.gpt_coherence,outputs.coherence.coherence_reason,outputs.coherence.coherence_result,outputs.coherence.coherence_threshold,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.groundedness.groundedness_result,outputs.groundedness.groundedness_threshold,line_number
0,Create the website copy for the tents catalog ...,## Task\nYou serve as a web copywriter for the...,**Explore CONTOSO's Premium Tents Collection**...,4,4,The RESPONSE is a complete and detailed catalo...,pass,3,4,4,...,5,"The RESPONSE is well-structured, logically org...",pass,3,5,5,The RESPONSE is fully grounded and complete as...,pass,3,0
1,Create the textual assets for the sleeping bag...,## Task\nYou serve as a web copywriter for the...,**MountainDream Sleeping Bag – Your Ultimate O...,5,5,"The RESPONSE is comprehensive, directly addres...",pass,3,4,4,...,5,The RESPONSE provides a detailed and well-stru...,pass,3,5,5,The RESPONSE accurately reflects the CONTEXT b...,pass,3,1
2,Draft the website copy for the hiking shoes we...,## Task\nYou serve as a web copywriter for the...,**TrailWalker Hiking Shoes – Adventure Awaits!...,5,5,The RESPONSE fully addresses the QUERY with de...,pass,3,4,4,...,5,"The RESPONSE is well-structured, directly addr...",pass,3,5,5,"The RESPONSE is well-grounded in the CONTEXT, ...",pass,3,2


In [9]:
# Get the link to view the evaluation results in Azure AI Studio.
# NOTE(review): presumably only populated when azure_ai_project was passed to
# evaluate() — confirm before relying on it.
result_eval["studio_url"]

'https://ai.azure.com/build/evaluation/b396827e-393c-47fe-8851-68e9cbae1f15?wsid=/subscriptions/6415ebd4-1dd7-430f-bd4d-2f5e9419c1cd/resourceGroups/BRK441-7670_RG/providers/Microsoft.MachineLearningServices/workspaces/BRK441-7670-project'