# Evaluating the generation quality metrics of the flow

In [1]:
# Configuring Azure OpenAI service connection

import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) before reading os.environ.
load_dotenv()

# Azure OpenAI connection settings shared by every evaluator in this notebook.
# The deployment name is fixed; the other values must be present in the
# environment, otherwise the lookup raises KeyError right here.
model_config = dict(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)

In [2]:
# Loading the test dataset
import pandas as pd

# Location of the JSONL test set, relative to the notebook's working directory.
test_data_path = "../data/test_dataset.jsonl"

# lines=True: the file holds one JSON record per line.
df = pd.read_json(test_data_path, lines=True)

# Fail fast: the response-generation step reads the "question" field of every row.
assert "question" in df.columns, f"'question' column missing from {test_data_path}"
df.head()

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [3]:
# Importing the evaluator classes and instantiating them
from create_website_copy_request import get_response
from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator, evaluate

# Instantiate the four quality evaluators with the same connection settings.
# The classes are listed in the same order as the names they are bound to.
(
    relevance_evaluator,
    groundedness_evaluator,
    fluency_evaluator,
    coherence_evaluator,
) = (
    evaluator_cls(model_config)
    for evaluator_cls in (
        RelevanceEvaluator,
        GroundednessEvaluator,
        FluencyEvaluator,
        CoherenceEvaluator,
    )
)

In [4]:
# Create unique id for each run with date and time
from datetime import datetime
# Format: YYYYMMDDHHMMSS of the current local time, followed by a fixed suffix.
run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20241127064509_chat_evaluation_sdk


In [5]:
# Identify the target Azure AI project; all three values are required
# environment variables (a missing one raises KeyError).
subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]
resource_group_name = os.environ["AZURE_RESOURCE_GROUP"]
project_name = os.environ["AZURE_AI_PROJECT_NAME"]

# Project descriptor passed to evaluate() so results can be pushed to the project.
azure_ai_project = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    project_name=project_name,
)

In [6]:
import json

def create_response_data(df, output_path='run_results.jsonl', response_fn=None):
    """Generate a response for every question in ``df`` and save them as JSONL.

    Args:
        df: DataFrame whose rows each carry a ``question`` entry; rows are
            processed in order.
        output_path: File the records are written to, one JSON object per
            line. Defaults to ``run_results.jsonl``.
        response_fn: Callable that takes a question and returns a mapping
            with ``"context"`` and ``"answer"`` keys. Defaults to the
            ``get_response`` function imported by the notebook.

    Returns:
        list[dict]: One record per row with the keys ``query``, ``context``
        and ``response``.

    Raises:
        KeyError: If a row has no ``question`` entry or a response lacks
            ``"context"`` or ``"answer"``.
    """
    if response_fn is None:
        # Looked up at call time so the default only needs to exist when used.
        response_fn = get_response

    results = []
    for _, row in df.iterrows():
        question = row['question']

        # Run the flow for this question
        response = response_fn(question)

        results.append({
            'query': question,
            'context': response["context"],
            'response': response["answer"]
        })

    # Written only after every question succeeded; explicit encoding keeps the
    # file independent of the platform's default.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(result) + '\n' for result in results)
    return results
     

In [7]:
# Step 1: Run web_designer_app against test dataset
# Step 2: Evaluate outputs (answer and context) against generation quality metrics
%pip install azure-ai-evaluation[remote]
response_results = create_response_data(df)
result_eval = evaluate(
    evaluation_name=run_id,
    data="run_results.jsonl",
    evaluators={
        "relevance": relevance_evaluator,
        "fluency": fluency_evaluator,
        "coherence": coherence_evaluator,
        "groundedness": groundedness_evaluator,
    },
    # column mapping    
    evaluator_config={
        "default": {
            "query": "${data.query}",
            "response": "${data.response}",
            "context": "${data.context}",
        },
    },
    azure_ai_project = azure_ai_project, # comment this line if you don't want to push results to your Azure AI Project
    output_path="./eval_results.jsonl"
)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
question: Create the website copy for the tents catalog page
embedding done
index name: products-catalog
context: [{'id': 'cHJvZHVjdF9pbmZvXzgubWQw', 'title': 'Information about product item_number: 8', 'content': '# Information about product item_number: 8\n\n# Information about product item_number: 8\nAlpine Explorer Tent, price $350,\n\n## Brand\nAlpineGear\n\n## Category\nTents\n\n### Features\n- Waterproof: Provides reliable protection against rain and moisture.\n- Easy Setup: Simple and quick assembly process, making it convenient for camping.\n- Room Divider: Includes a detachable divider to create separate living spaces within the tent.\n- Excellent Ventilation: Multiple mesh windows and vents promote airflow and reduce condensation.\n- Gear Loft: Built-in gear loft or storage pockets for organizing and storing camping gear.\n\n## Techn

[2024-11-27 06:46:23 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_e9uxcbas_20241127_064623_533870, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_e9uxcbas_20241127_064623_533870/logs.txt
[2024-11-27 06:46:23 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_blpblndj_20241127_064623_536579, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_blpblndj_20241127_064623_536579/logs.txt
[2024-11-27 06:46:23 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_86jx7imv_20241127_064623_531613, log path: /home/vscode/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_86jx7i

result: **CONTOSO TrailWalker Hiking Shoes**

Embark on your next adventure with confidence in the CONTOSO TrailWalker Hiking Shoes. Designed by TrekReady, these shoes are the perfect companion for hiking enthusiasts seeking durability, comfort, and performance.

**Key Features:**
- **Durable & Waterproof:** Withstand various terrains and weather conditions with our high-quality synthetic leather and mesh construction.
- **Superior Traction:** Navigate challenging trails effortlessly with a traction outsole featuring multidirectional lugs.
- **Enhanced Comfort:** Enjoy long hikes with a cushioned insole, supportive midsole, and padded collar and tongue.
- **Lightweight Design:** Reduce fatigue on extended adventures with our lightweight construction.
- **Quick-Lace System:** Easily adjust for a secure fit with our convenient quick-lace system.
- **Reflective Accents:** Stay visible in low-light conditions for added safety.

**Technical Specs:**
- **Best Use:** Hiking
- **Upper Material

[2024-11-27 06:46:24 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 60 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2024-11-27 06:46:24 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 60 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2024-11-27 06:46:24 +0000][promptflow.core._prompty_utils][ERROR] -

2024-11-27 06:46:23 +0000   19297 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-11-27 06:46:26 +0000   19297 execution.bulk     INFO     Finished 1 / 3 lines.
2024-11-27 06:46:26 +0000   19297 execution.bulk     INFO     Average execution time for completed lines: 3.24 seconds. Estimated time for incomplete lines: 6.48 seconds.
2024-11-27 06:46:27 +0000   19297 execution.bulk     INFO     Finished 2 / 3 lines.
2024-11-27 06:46:27 +0000   19297 execution.bulk     INFO     Average execution time for completed lines: 1.7 seconds. Estimated time for incomplete lines: 1.7 seconds.
2024-11-27 06:46:27 +0000   19297 execution.bulk     INFO     Finished 3 / 3 lines.
2024-11-27 06:46:27 +0000   19297 execution.bulk     INFO     Average execution time for completed lines: 1.22 seconds. Estimated time for incomplete lines: 0.0 seconds.

Run name: "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_e9uxcbas_2024

In [8]:
# Per-row evaluation results as a DataFrame for tabular inspection.
eval_result = pd.DataFrame(result_eval["rows"])

# One newline-separated print: banner, aggregate metrics, banner.
print(
    "-----Summarized Metrics-----",
    result_eval["metrics"],
    "-----Tabular Result-----",
    sep="\n",
)
eval_result.head()

-----Summarized Metrics-----
{'relevance.relevance': 5.0, 'relevance.gpt_relevance': 5.0, 'fluency.fluency': 4.0, 'fluency.gpt_fluency': 4.0, 'coherence.coherence': 5.0, 'coherence.gpt_coherence': 5.0, 'groundedness.groundedness': 5.0, 'groundedness.gpt_groundedness': 5.0}
-----Tabular Result-----


Unnamed: 0,inputs.query,inputs.context,inputs.response,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,outputs.fluency.fluency,outputs.fluency.gpt_fluency,outputs.fluency.fluency_reason,outputs.coherence.coherence,outputs.coherence.gpt_coherence,outputs.coherence.coherence_reason,outputs.groundedness.groundedness,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,line_number
0,Create the website copy for the tents catalog ...,## Task\nYou serve as a web copywriter for the...,Welcome to the CONTOSO Outdoor Tents Collectio...,5,5,The RESPONSE is directly relevant to the QUERY...,4,4,"The RESPONSE is well-articulated, uses a varie...",5,5,"The RESPONSE is well-structured, with each ten...",5,5,The RESPONSE accurately reflects the CONTEXT b...,0
1,Create the textual assets for the sleeping bag...,## Task\nYou serve as a web copywriter for the...,Welcome to CONTOSO's Sleeping Bags Collection!...,5,5,The RESPONSE effectively addresses the QUERY b...,4,4,"The RESPONSE is well-articulated, with no gram...",5,5,"The RESPONSE is well-organized, directly addre...",5,5,The RESPONSE is fully grounded and complete as...,1
2,Draft the website copy for the hiking shoes we...,## Task\nYou serve as a web copywriter for the...,**CONTOSO TrailWalker Hiking Shoes**\n\nEmbark...,5,5,The RESPONSE fully addresses the QUERY with co...,4,4,"The RESPONSE is well-articulated, uses appropr...",5,5,"The RESPONSE is well-structured, with clear se...",5,5,"The RESPONSE is fully grounded and complete, a...",2


In [9]:
# Link for viewing the evaluation results in Azure AI Studio.
# .get() is used because pushing results to an Azure AI project is optional;
# if the key is absent the cell shows nothing instead of raising KeyError.
# NOTE(review): assumes result_eval is a plain dict - confirm.
result_eval.get("studio_url")

'https://ai.azure.com/build/evaluation/f28969a3-c3c4-4536-a009-f63aad01e24d?wsid=/subscriptions/6fe5cf97-a734-4ca7-ac31-5bf9d70e9a26/resourceGroups/contoso-website-designer/providers/Microsoft.MachineLearningServices/workspaces/contoso-website-designer'