# Evaluating the generation quality metrics of the flow

In [1]:
# Azure OpenAI connection settings shared by every evaluator

import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv before they are read below
load_dotenv()

# The deployment name is fixed here; API version, endpoint and key are read
# from the environment (a missing variable raises KeyError)
model_config = dict(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)

In [2]:
# Load the test questions (one JSON record per line) into a DataFrame
import pandas as pd

test_data_path = "../data/test_dataset.jsonl"

df = pd.read_json(path_or_buf=test_data_path, lines=True)
# Preview the first five rows
df.head(5)

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [3]:
# Flow entry point under evaluation, plus the evaluator classes and runner
from create_website_copy_request import get_response
from azure.ai.evaluation import (
    RelevanceEvaluator,
    GroundednessEvaluator,
    FluencyEvaluator,
    CoherenceEvaluator,
    evaluate,
)

# One evaluator instance per quality metric, all built from the same model_config
(
    relevance_evaluator,
    groundedness_evaluator,
    fluency_evaluator,
    coherence_evaluator,
) = (
    evaluator_cls(model_config)
    for evaluator_cls in (
        RelevanceEvaluator,
        GroundednessEvaluator,
        FluencyEvaluator,
        CoherenceEvaluator,
    )
)

In [4]:
# Name each evaluation run after the current local date and time
from datetime import datetime

# Timestamp (YYYYMMDDHHMMSS) followed by a fixed suffix
run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20241011130828_chat_evaluation_sdk


In [5]:
# Azure AI project coordinates, read from the environment in this order
# (a missing variable raises KeyError)
subscription_id, resource_group_name, project_name = (
    os.environ[env_var]
    for env_var in (
        "AZURE_SUBSCRIPTION_ID",
        "AZURE_RESOURCE_GROUP",
        "AZURE_AI_PROJECT_NAME",
    )
)

# Project reference later handed to evaluate() via its azure_ai_project argument
azure_ai_project = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group_name,
    project_name=project_name,
)

In [6]:
import json

def create_response_data(df, output_path='run_results.jsonl', response_fn=None):
    """Run the flow on every test question and save the outputs as JSONL.

    Args:
        df: DataFrame with a 'question' column; the flow is called once per row.
        output_path: Destination JSONL file. Defaults to 'run_results.jsonl',
            the path this function previously always wrote to.
        response_fn: Callable that takes a question and returns a mapping with
            'context' and 'answer' keys. Defaults to the notebook-level
            ``get_response`` import.

    Returns:
        A list with one dict per row, in row order, holding the keys
        'query', 'context' and 'response'.

    Raises:
        KeyError: If a row has no 'question' entry, or a response lacks
            'context' or 'answer'.
    """
    if response_fn is None:
        # Resolved at call time so the default remains the imported flow entry point
        response_fn = get_response

    results = []
    for _, row in df.iterrows():
        question = row['question']

        # Run the flow for this question
        response = response_fn(question)

        results.append({
            'query': question,
            'context': response["context"],
            'response': response["answer"],
        })

    # Written only after every row succeeded; explicit encoding keeps the file
    # independent of the platform's default locale
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(result) + '\n' for result in results)
    return results
     

In [None]:
# Step 1: run the flow over the test dataset; the outputs are written to run_results.jsonl
response_results = create_response_data(df)

# Step 2: score the saved outputs (response and context) with the generation quality evaluators
result_eval = evaluate(
    data="run_results.jsonl",
    evaluation_name=run_id,
    evaluators=dict(
        relevance=relevance_evaluator,
        fluency=fluency_evaluator,
        coherence=coherence_evaluator,
        groundedness=groundedness_evaluator,
    ),
    # Map evaluator inputs onto the columns of the JSONL records
    evaluator_config={
        "default": dict(
            query="${data.query}",
            response="${data.response}",
            context="${data.context}",
        ),
    },
    # Comment out the next line if you don't want to push results to your Azure AI Project
    azure_ai_project=azure_ai_project,
    output_path="./eval_results.jsonl",
)

In [8]:
# Per-row evaluation output as a DataFrame
eval_result = pd.DataFrame(result_eval["rows"])

# Aggregate scores, then a banner for the table shown as the cell output
print(
    "-----Summarized Metrics-----",
    result_eval["metrics"],
    "-----Tabular Result-----",
    sep="\n",
)
eval_result.head(5)

-----Summarized Metrics-----
{'relevance.gpt_relevance': 5.0, 'fluency.gpt_fluency': 5.0, 'coherence.gpt_coherence': 4.666666666666667, 'groundedness.gpt_groundedness': 5.0}
-----Tabular Result-----


Unnamed: 0,inputs.query,inputs.context,inputs.response,outputs.relevance.gpt_relevance,outputs.fluency.gpt_fluency,outputs.coherence.gpt_coherence,outputs.groundedness.gpt_groundedness,line_number
0,Create the website copy for the tents catalog ...,## Task\nYou serve as a web copywriter for the...,# Explore Our Premium Tents Collection\n\nWelc...,5,5,5,5,0
1,Create the textual assets for the sleeping bag...,## Task\nYou serve as a web copywriter for the...,### CozyNights Sleeping Bag\n\n#### Price: $10...,5,5,4,5,1
2,Draft the website copy for the hiking shoes we...,## Task\nYou serve as a web copywriter for the...,# TrailWalker Hiking Shoes by CONTOSO\n\n## Di...,5,5,5,5,2


In [None]:
# Link for viewing the evaluation results in Azure AI Studio.
# .get() makes the cell show nothing instead of raising KeyError when the result
# has no "studio_url" entry.
# NOTE(review): presumably that happens when azure_ai_project is not passed to
# evaluate() — confirm against the installed SDK version.
result_eval.get("studio_url")