パフォーマンスと品質評価ツールの使用

In [None]:
import os
from azure.ai.evaluation import GroundednessProEvaluator, GroundednessEvaluator, RetrievalEvaluator, RelevanceEvaluator
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
#credential = DefaultAzureCredential()

# Azure OpenAI settings for the evaluators that are constructed with
# `model_config` (GroundednessEvaluator, RetrievalEvaluator, RelevanceEvaluator).
# Every value comes from an environment variable and is None when it is unset.
model_config = {
    setting: os.environ.get(env_name)
    for setting, env_name in (
        ("azure_endpoint", "AZURE_OPENAI_ENDPOINT"),
        ("api_key", "AZURE_OPENAI_API_KEY"),
        ("azure_deployment", "AZURE_OPENAI_CHAT_MODEL"),
        ("api_version", "AZURE_OPENAI_API_VERSION"),
    )
}

# Project coordinates passed to GroundednessProEvaluator and to evaluate().
# Same convention: read from the environment, None when a variable is missing.
azure_ai_project = {
    setting: os.environ.get(env_name)
    for setting, env_name in (
        ("subscription_id", "AZURE_SUBSCRIPTION_ID"),
        ("resource_group_name", "AZURE_RESOURCE_GROUP"),
        ("project_name", "AZURE_PROJECT_NAME"),
    )
}
# Tenant to sign in against, read from the TENANT_ID environment variable
# (None when the variable is not set).
tenant_id = os.environ.get("TENANT_ID")
# Credential object handed to GroundednessProEvaluator further down.
# NOTE(review): presumably this opens a browser sign-in on first use, so the
# notebook may not run unattended — confirm against azure-identity docs.
credential = InteractiveBrowserCredential(tenant_id=tenant_id)


In [None]:
# Question sent to the evaluators as `query`
# (roughly: "What to do when the PCS warning light flashes or stays on").
query = "PCS警告灯が点滅または点灯する場合の対処法"

# Text passed to the evaluators as `response`.
with open("./answer_sample.md", mode="r", encoding="utf-8") as answer_file:
    response = answer_file.read()

# Text passed to the evaluators as `context`.
with open("./context_sample.txt", mode="r", encoding="utf-8") as context_file:
    context = context_file.read()

# Reference text passed to the n-gram metrics as `ground_truth`.
with open("./ground_truth.md", mode="r", encoding="utf-8") as truth_file:
    ground_truth = truth_file.read()


### GroundednessEvaluator

In [None]:
# Build the groundedness evaluator from the Azure OpenAI model configuration.
groundedness_eval = GroundednessEvaluator(model_config)

# Score one query / context / response triple and show the raw result.
query_response = {"query": query, "context": context, "response": response}
groundedness_score = groundedness_eval(**query_response)
print(groundedness_score)


### RetrievalEvaluator

In [None]:
# Build the retrieval evaluator from the Azure OpenAI model configuration.
retrieval_eval = RetrievalEvaluator(model_config)

# The retrieval evaluator is called with the query and the retrieved context
# only; no generated response is involved.
query_response = dict(query=query, context=context)

# Stored under its own name so it is not confused with (or silently replaced
# by) the RelevanceEvaluator result computed in the next section.
retrieval_score = retrieval_eval(**query_response)
print(retrieval_score)

# Last expression of the cell: display just the value under the "retrieval" key.
retrieval_score["retrieval"]

### RelevanceEvaluator

In [None]:
# Build the relevance evaluator from the Azure OpenAI model configuration.
relevance_eval = RelevanceEvaluator(model_config)

# Relevance is judged on the generated answer, so pass `response` here.
# Passing the retrieved `context` as the response would score the retrieval
# step a second time instead of the answer itself.
query_response = dict(query=query, response=response)

relevance_score = relevance_eval(**query_response)
print(relevance_score)

### **[Optional]** Groundedness Pro evaluator

In [None]:
# Initializing the Groundedness Pro evaluator.
# Supported regions are eastus2 and swedencentral.
groundedness_pro_eval = GroundednessProEvaluator(
    azure_ai_project=azure_ai_project,
    credential=credential,
)

# Score the same query / context / response triple and show the raw result.
query_response = {"query": query, "context": context, "response": response}
groundedness_pro_score = groundedness_pro_eval(**query_response)
print(groundedness_pro_score)

### BleuScoreEvaluator
BLEU score measures the similarity by shared n-grams between the generated text and the ground truth, focusing more on precision and indirectly on recall.

In [None]:
from azure.ai.evaluation import BleuScoreEvaluator
# BLEU evaluator, constructed with no arguments.
bleu = BleuScoreEvaluator()

# Compare the generated response against the reference text.
query_response = {"response": response, "ground_truth": ground_truth}
result = bleu(**query_response)

print(result)

### GleuScoreEvaluator
GLEU score measures the similarity by shared n-grams between the generated text and ground truth, similar to the BLEU score, focusing on both precision and recall. It addresses the drawbacks of the BLEU score using a per-sentence reward objective.

In [None]:
from azure.ai.evaluation import GleuScoreEvaluator
# GLEU evaluator, constructed with no arguments.
gleu = GleuScoreEvaluator()

# Compare the generated response against the reference text.
query_response = {"response": response, "ground_truth": ground_truth}
result = gleu(**query_response)

print(result)

### MeteorScoreEvaluator
METEOR score measures the similarity by shared n-grams between the generated text and the ground truth, similar to the BLEU score, focusing on precision and recall. It addresses limitations of other metrics like the BLEU score by considering synonyms, stemming, and paraphrasing for content alignment.

In [None]:
from azure.ai.evaluation import MeteorScoreEvaluator
# METEOR evaluator with its three weighting parameters spelled out explicitly.
meteor = MeteorScoreEvaluator(
    alpha=0.9,
    beta=3.0,
    gamma=0.5,
)

# Compare the generated response against the reference text.
query_response = {"response": response, "ground_truth": ground_truth}
result = meteor(**query_response)

print(result)

### RougeScoreEvaluator
ROUGE measures the similarity by shared n-grams between the generated text and the ground truth. <br>
ROUGE precision reflects the fraction of the n-grams in the response that are also in the ground truth. <br>
ROUGE recall is the fraction of n-grams in ground truth that also appear in the response. <br>
ROUGE f1_score is calculated from ROUGE precision and ROUGE recall.

In [None]:
from azure.ai.evaluation import RougeScoreEvaluator, RougeType
# ROUGE evaluator configured for the ROUGE_1 variant.
rouge = RougeScoreEvaluator(
    rouge_type=RougeType.ROUGE_1,
)

# Compare the generated response against the reference text.
query_response = {"response": response, "ground_truth": ground_truth}
result = rouge(**query_response)

print(result)

### **[Optional]** Tracking the evaluation results in Azure AI Foundry project

In [None]:
from azure.ai.evaluation import evaluate

# Run the four n-gram evaluators built in the cells above over a JSONL dataset.
# NOTE: "query_response.jsonl" is not created by any cell shown in this
# notebook, so it has to exist in the working directory before running this.
result = evaluate(
    data="query_response.jsonl",
    evaluators=dict(
        bleu=bleu,
        gleu=gleu,
        meteor=meteor,
        rouge=rouge,
    ),
    # Project details so the run is tracked in the Azure AI Foundry
    # (formerly AI Studio) project.
    azure_ai_project=azure_ai_project,
)