# Evaluating the generation quality metrics of the flow

In [1]:
# Configuring Azure OpenAI service connection

import os
from dotenv import load_dotenv
load_dotenv()
from promptflow.core import AzureOpenAIModelConfiguration

# Azure OpenAI connection settings for the model used by the evaluators.
# The deployment name is fixed; API version, endpoint and key are read
# from the environment (populated by load_dotenv() above).
_azure_openai_settings = dict(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)
model_config = AzureOpenAIModelConfiguration(**_azure_openai_settings)

In [2]:
# Load the test dataset (JSON Lines: one JSON record per line)
import pandas as pd

# Path is relative to this notebook; it is reused later as the evaluation input
data_path = "../data/test_dataset.jsonl"

df = pd.read_json(path_or_buf=data_path, lines=True)
# Preview the first rows
df.head(n=5)

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [3]:
# Import the flow entry point (get_response) and the built-in evaluator classes

from create_website_copy_request import get_response
from promptflow.evals.evaluators import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator

# Instantiate one evaluator per quality metric, all sharing the same model
# configuration. Construction order: relevance, groundedness, fluency, coherence.
(
    relevance_evaluator,
    groundedness_evaluator,
    fluency_evaluator,
    coherence_evaluator,
) = (
    evaluator_cls(model_config)
    for evaluator_cls in (
        RelevanceEvaluator,
        GroundednessEvaluator,
        FluencyEvaluator,
        CoherenceEvaluator,
    )
)

In [4]:
# Build a unique identifier for this run: current local timestamp
# (YYYYmmddHHMMSS) followed by a fixed suffix.
from datetime import datetime

run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20240905133415_chat_evaluation_sdk


In [5]:
# Read the Azure AI project coordinates from the environment, in this order:
# subscription, resource group, project name.
subscription_id, resource_group_name, project_name = (
    os.environ[env_name]
    for env_name in (
        "AZURE_SUBSCRIPTION_ID",
        "AZURE_RESOURCE_GROUP",
        "AZURE_AI_PROJECT_NAME",
    )
)

# Project descriptor passed to evaluate() further down
azure_ai_project = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    project_name=project_name,
)

In [None]:
# Step 1: Run web_designer_app (the `get_response` target) against the test dataset
# Step 2: Evaluate the target outputs (answer and context) against generation quality metrics
from promptflow.evals.evaluate import evaluate

result_eval = evaluate(
    evaluation_name=run_id,
    data=data_path,
    target=get_response,
    evaluators={
        "relevance": relevance_evaluator,
        "fluency": fluency_evaluator,
        "coherence": coherence_evaluator,
        "groundedness": groundedness_evaluator,
    },
    # Column mapping from dataset columns / target outputs to evaluator inputs.
    # The key must be exactly "default" for the mapping to apply to every
    # evaluator that has no entry of its own (the previous "defaultS" key was
    # never matched, so the mapping was ignored).
    # The target is expected to return
    #   {"question": question, "answer": result, "context": context}
    # NOTE(review): confirm against get_response in create_website_copy_request.
    evaluator_config={
        "default": {
            "question": "${data.question}",
            "answer": "${target.answer}",
            "context": "${target.context}",
        },
    },
    azure_ai_project = azure_ai_project, # comment this line if you don't want to push results to your Azure AI Project
    output_path="./eval_results.jsonl"
)

In [7]:
# Per-row results as a DataFrame, for tabular inspection
eval_result = pd.DataFrame(data=result_eval["rows"])

# Aggregate scores first, then the header for the table displayed below
print(
    "-----Summarized Metrics-----",
    result_eval["metrics"],
    "-----Tabular Result-----",
    sep="\n",
)
eval_result.head()

-----Summarized Metrics-----
{'relevance.gpt_relevance': 5.0, 'fluency.gpt_fluency': 5.0, 'coherence.gpt_coherence': 5.0, 'groundedness.gpt_groundedness': 5.0}
-----Tabular Result-----


Unnamed: 0,outputs.answer,outputs.context,inputs.question,outputs.relevance.gpt_relevance,outputs.fluency.gpt_fluency,outputs.coherence.gpt_coherence,outputs.groundedness.gpt_groundedness,line_number
0,## Tents Catalog\n\n### TrailMaster X4 Tent\n*...,## Task\nYou serve as a web copywriter for the...,Create the website copy for the tents catalog ...,5.0,5.0,5.0,5.0,0
1,### CozyNights Sleeping Bag\n\n**Embrace the G...,## Task\nYou serve as a web copywriter for the...,Create the textual assets for the sleeping bag...,5.0,5.0,5.0,5.0,1
2,### TrekReady TrailWalker Hiking Shoes\n\n**Ad...,## Task\nYou serve as a web copywriter for the...,Draft the website copy for the hiking shoes we...,5.0,5.0,5.0,5.0,2


In [8]:
# Link for viewing the evaluation results in Azure AI Studio
studio_url = result_eval["studio_url"]
studio_url

'https://ai.azure.com/build/evaluation/web_designer_app_20240905_133422_646778?wsid=/subscriptions/7566e977-87fc-4b9b-929f-f094d9e33427/resourceGroups/BRK441-2820_RG/providers/Microsoft.MachineLearningServices/workspaces/BRK441-2820-project'