# Evaluating the generation quality metrics of the flow

In [7]:
# Configuring Azure OpenAI service connection

import os
from dotenv import load_dotenv
load_dotenv()
from promptflow.core import AzureOpenAIModelConfiguration

# Azure OpenAI connection used by the evaluators.
# The deployment name is fixed; API version, endpoint and key are read from
# environment variables, so a missing variable raises KeyError here.
model_config = AzureOpenAIModelConfiguration(
    azure_deployment="gpt-4",
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
)

In [8]:
# Load the test dataset (JSON Lines: one record per line) and preview it
import pandas as pd

data_path = "../data/test_dataset.jsonl"

df = pd.read_json(path_or_buf=data_path, lines=True)
df.head()

Unnamed: 0,question
0,Create the website copy for the tents catalog ...
1,Create the textual assets for the sleeping bag...
2,Draft the website copy for the hiking shoes we...


In [9]:
# Import the evaluation target and the evaluator classes, then instantiate the evaluators

from create_website_copy_request import get_response
from promptflow.evals.evaluators import RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, CoherenceEvaluator

# Instantiate one evaluator per quality metric, all using the same model
# configuration. Construction order matches the order of the names on the left.
(
    relevance_evaluator,
    groundedness_evaluator,
    fluency_evaluator,
    coherence_evaluator,
) = (
    evaluator_cls(model_config)
    for evaluator_cls in (
        RelevanceEvaluator,
        GroundednessEvaluator,
        FluencyEvaluator,
        CoherenceEvaluator,
    )
)

In [10]:
# Build a run identifier from the current local date and time
# (YYYYMMDDHHMMSS) followed by a fixed suffix, then show it.
from datetime import datetime

run_id = f"{datetime.now():%Y%m%d%H%M%S}_chat_evaluation_sdk"
print(run_id)

20240826163232_chat_evaluation_sdk


In [11]:
# Step 1: Run web_designer_flow against test dataset
# Step 2: Evaluate prompt flow outputs (answer and context) against generation quality metrics
from promptflow.evals.evaluate import evaluate

result_eval = evaluate(
    evaluation_name=run_id,
    data=data_path,
    target=get_response,
    evaluators={
        "relevance": relevance_evaluator,
        "fluency": fluency_evaluator,
        "coherence": coherence_evaluator,
        "groundedness": groundedness_evaluator,
    },
    # Column mapping applied to every evaluator.
    # The target is expected to return
    #   {"question": question, "answer": result, "context": context}
    # so "answer" and "context" come from the target output, while "question"
    # comes from the input dataset.
    # The key must be exactly "default": any other key is treated as an
    # evaluator name, and a mapping stored under an unknown name is ignored.
    evaluator_config={
        "default": {
            "question": "${data.question}",
            "answer": "${target.answer}",
            "context": "${target.context}",
        },
    },
)



Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23334/v1.0/ui/traces/?#run=web_designer_flex_flow_20240826_163232_875744


[2024-08-26 16:32:32 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run web_designer_flex_flow_20240826_163232_875744, log path: /home/vscode/.promptflow/.runs/web_designer_flex_flow_20240826_163232_875744/logs.txt
[2024-08-26 16:32:35 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 10 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2024-08-26 16:32:35 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded token rate limit of you

2024-08-26 16:33:54 +0000   89637 execution.bulk     INFO     Process 89681 terminated.
2024-08-26 16:33:54 +0000   89637 execution.bulk     INFO     Process 89670 terminated.
2024-08-26 16:32:33 +0000   80823 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-08-26 16:32:33 +0000   80823 execution.bulk     INFO     Set process count to 3 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 3}.
2024-08-26 16:32:35 +0000   80823 execution.bulk     INFO     Process name(ForkProcess-8:3)-Process id(89682)-Line number(1) start execution.
2024-08-26 16:32:35 +0000   80823 execution.bulk     INFO     Process name(ForkProcess-8:2)-Process id(89681)-Line number(2) start execution.
2024-08-26 16:32:35 +0000   80823 execution.bulk     INFO     Process name(ForkProcess-8:1)-Process id(89670)-Line number(0) start execution.
2024-08-26 16:32:50 +0000   80823 execution.bulk     INFO     Process nam

[2024-08-26 16:33:57 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current AIServices S0 pricing tier. Please retry after 9 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2024-08-26 16:33:57 +0000][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the Chatcompletions_Create Operation under Azure OpenAI API version 2023-03-15-preview have exceeded call rate limit of your current AIServices S0 pricing tier. Please retry after 8 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2024-08-26 16:33:57 +0000][promptflow.core._prompty_utils][ERROR] - Exc

In [12]:
# Put the per-row evaluation output into a DataFrame and preview the first rows
eval_result = pd.DataFrame(data=result_eval["rows"])
eval_result.head()

Unnamed: 0,outputs.answer,outputs.context,inputs.question,outputs.relevance.gpt_relevance,outputs.fluency.gpt_fluency,outputs.coherence.gpt_coherence,outputs.groundedness.gpt_groundedness
0,# Tents Catalog\n\n## Discover Your Perfect Ou...,"[{'id': 'cHJvZHVjdHMuY3N2Mg==', 'title': '', '...",Create the website copy for the tents catalog ...,5.0,5.0,5.0,1.0
1,### MountainDream Sleeping Bag\n\n#### Product...,"[{'id': 'cHJvZHVjdHMuY3N2Mg==', 'title': '', '...",Create the textual assets for the sleeping bag...,5.0,5.0,5.0,5.0
2,### TrekReady TrailWalker Hiking Shoes\n\n**Ad...,"[{'id': 'cHJvZHVjdHMuY3N2Mg==', 'title': '', '...",Draft the website copy for the hiking shoes we...,5.0,5.0,5.0,5.0


In [13]:
# Persist the evaluation results as JSON Lines (one record per row)
eval_result.to_json(path_or_buf="eval_result.jsonl", orient="records", lines=True)