Adding telemetry for evaluate API #3473

Merged
5 commits merged on Jun 27, 2024
5 changes: 2 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -9,9 +9,8 @@
import pandas as pd

from promptflow._sdk._constants import LINE_NUMBER
from promptflow._sdk._telemetry import ActivityType, log_activity
from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
from promptflow.client import PFClient
from ._telemetry import log_evaluate_activity

from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
from .._user_agent import USER_AGENT
@@ -258,7 +257,7 @@ def _rename_columns_conditionally(df: pd.DataFrame):
return df


@log_activity(get_telemetry_logger(), "pf.evals.evaluate", activity_type=ActivityType.PUBLICAPI, user_agent=USER_AGENT)
@log_evaluate_activity
def evaluate(
*,
evaluation_name: Optional[str] = None,
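The net effect of this change is that the public evaluate API is now instrumented through the new log_evaluate_activity decorator rather than a bare log_activity call applied at import time. A minimal sketch of a call that would be logged this way (the import path, data file, and evaluator alias are assumptions for illustration, not part of this diff):

from promptflow.evals.evaluate import evaluate            # import path assumed
from promptflow.evals.evaluators import F1ScoreEvaluator

# The decorator wraps this call in a "pf.evals.evaluate" activity and, after it
# returns, emits a "pf.evals.evaluate_usage_info" activity with per-evaluator
# details (see the new _telemetry module below).
result = evaluate(
    data="evaluate_test_data.jsonl",                       # assumed local JSONL file
    evaluators={"f1_score": F1ScoreEvaluator()},
)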
@@ -0,0 +1,143 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)

import functools
import json
import logging
import inspect

import pandas as pd

from .._utils import _trace_destination_from_project_scope
from ..._user_agent import USER_AGENT
from promptflow.core import Prompty as prompty_core
from promptflow._sdk.entities._flows import Prompty as prompty_sdk, FlexFlow as flex_flow
from promptflow._sdk.entities._flows.dag import Flow as dag_flow
from promptflow.client import PFClient

LOGGER = logging.getLogger(__name__)


def _get_evaluator_type(evaluator):
"""
Get evaluator type for telemetry. Possible values are "built-in", "custom" and "content-safety"
"""
built_in = False
content_safety = False

module = inspect.getmodule(evaluator)
built_in = (module and module.__name__.startswith("promptflow.evals.evaluators."))

if built_in:
content_safety = module.__name__.startswith("promptflow.evals.evaluators._content_safety")

return "content-safety" if content_safety else "built-in" if built_in else "custom"
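
As a quick sketch of how this classification plays out (mirroring the expectations in the unit tests added below; answer_length is a made-up user function):

from promptflow.evals.evaluators import F1ScoreEvaluator

def answer_length(answer):
    # Defined outside promptflow.evals.evaluators, so it is reported as "custom".
    return len(answer)

assert _get_evaluator_type(F1ScoreEvaluator()) == "built-in"
assert _get_evaluator_type(answer_length) == "custom"
# Content-safety evaluators such as HateUnfairnessEvaluator live under
# promptflow.evals.evaluators._content_safety and map to "content-safety".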


def _get_evaluator_properties(evaluator, evaluator_name):
"""
Get evaluator properties for telemetry
It gets name, pf_type, and type
name : tries best to get the most meaningful name for the evaluator
pf_type : The type of promptflow being used
type : The type of evaluator being used. Possible values are "built-in", "custom" and "content-safety"
"""

try:
# Cover flex flow and prompty based evaluator
if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
name = evaluator.name
pf_type = evaluator.__class__.__name__
# Cover dag flow based evaluator
elif isinstance(evaluator, dag_flow):
name = evaluator.name
pf_type = "DagFlow"
elif inspect.isfunction(evaluator):
name = evaluator.__name__
pf_type = flex_flow.__name__
elif hasattr(evaluator, "__class__") and callable(evaluator):
name = evaluator.__class__.__name__
pf_type = flex_flow.__name__
else:
# fallback option
name = str(evaluator)
pf_type = "Unknown"
except Exception as e:
LOGGER.debug(f"Failed to get evaluator properties: {e}")
name = str(evaluator)
pf_type = "Unknown"

return {
"name": name,
"pf_type": pf_type,
"type": _get_evaluator_type(evaluator),
"alias": evaluator_name if evaluator_name else ""
}
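
For a callable class evaluator, the reported payload would look roughly like this (a sketch; the class and alias are made up, and the expected values follow the branches above and the unit tests below):

class AnswerLengthEvaluator:          # hypothetical user-defined evaluator
    def __call__(self, answer: str):
        return {"length": len(answer)}

props = _get_evaluator_properties(AnswerLengthEvaluator(), "answer_len")
assert props == {"name": "AnswerLengthEvaluator", "pf_type": "FlexFlow",
                 "type": "custom", "alias": "answer_len"}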


# cspell:ignore isna
def log_evaluate_activity(func):
"""Decorator to log evaluate activity"""

@functools.wraps(func)
def wrapper(*args, **kwargs):
from promptflow._sdk._telemetry import ActivityType, log_activity
from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

evaluators = kwargs.get("evaluators", [])
azure_ai_project = kwargs.get("azure_ai_project", None)

pf_client = PFClient(
config={
"trace.destination": _trace_destination_from_project_scope(
azure_ai_project)} if azure_ai_project else None,
user_agent=USER_AGENT,
)

track_in_cloud = True if pf_client._config.get_trace_destination() else False
evaluate_target = True if kwargs.get("target", None) else False
evaluator_config = True if kwargs.get("evaluator_config", None) else False
custom_dimensions = {
"track_in_cloud": track_in_cloud,
"evaluate_target": evaluate_target,
"evaluator_config": evaluator_config,
}

with log_activity(get_telemetry_logger(), "pf.evals.evaluate", activity_type=ActivityType.PUBLICAPI,
user_agent=USER_AGENT, custom_dimensions=custom_dimensions):
result = func(*args, **kwargs)

try:
evaluators_info = []
for evaluator_name, evaluator in evaluators.items():
evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
try:
evaluator_df = pd.DataFrame(result.get("rows", [])).filter(like=f"outputs.{evaluator_name}",
axis=1)

failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int(
evaluator_df.isna().any(axis=1).sum())
total_rows = evaluator_df.shape[0]

evaluator_info["failed_rows"] = failed_rows
evaluator_info["total_rows"] = total_rows
except Exception as e:
LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
evaluators_info.append(evaluator_info)

custom_dimensions = {
"evaluators_info": json.dumps(evaluators_info)
}
with log_activity(get_telemetry_logger(), "pf.evals.evaluate_usage_info",
activity_type=ActivityType.PUBLICAPI, user_agent=USER_AGENT,
custom_dimensions=custom_dimensions):
pass
except Exception as e:
LOGGER.debug(f"Failed to collect evaluate usage info: {e}")

return result

return wrapper
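
The usage-info activity above counts a row as failed when any of that evaluator's output columns is NaN. The same pandas logic in isolation, on a made-up two-row result (column names follow the outputs.<alias>.* convention that evaluate produces):

import pandas as pd

rows = [
    {"outputs.f1_score.score": 0.81, "outputs.f1_score.reason": "ok"},
    {"outputs.f1_score.score": None, "outputs.f1_score.reason": "ok"},  # counted as failed
]
evaluator_df = pd.DataFrame(rows).filter(like="outputs.f1_score", axis=1)

failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int(
    evaluator_df.isna().any(axis=1).sum())
total_rows = evaluator_df.shape[0]
assert (failed_rows, total_rows) == (1, 2)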
@@ -0,0 +1,159 @@
import json
import os
import pathlib
from typing import Optional, Callable, Dict
from unittest.mock import patch, MagicMock

import numpy as np
import pandas as pd
import pytest

from promptflow.evals.evaluate._telemetry import log_evaluate_activity
from promptflow.evals.evaluators import F1ScoreEvaluator, HateUnfairnessEvaluator
from promptflow.client import load_flow


def _add_nans(df, n, column_name):
mask = np.full(df.shape[0], False) # Start with an all False mask (no NaNs)
mask[:n] = True # Set the first 'n' values to True
np.random.shuffle(mask) # Shuffle to distribute the NaNs randomly

# Apply the mask to assign NaNs in the DataFrame column
df.loc[mask, column_name] = np.nan
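
A toy illustration (not part of the checked-in test data): after the call, exactly n values in the column are NaN.

toy_df = pd.DataFrame({"outputs.f1_score.score": np.random.randint(0, 100, 5)})
_add_nans(toy_df, 2, "outputs.f1_score.score")
assert toy_df["outputs.f1_score.score"].isna().sum() == 2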


def _get_file(name):
"""Get the file from the unittest data folder."""
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
return os.path.join(data_path, name)


def answer_length(answer):
return len(answer)


@pytest.fixture
def mock_app_insight_logger():
"""Mock validate trace destination config to use in unit tests."""
logger = MagicMock()
logger.info = MagicMock()
logger.error = MagicMock()
with patch("promptflow._sdk._telemetry.telemetry.get_telemetry_logger", return_value=logger):
yield logger


@log_evaluate_activity
def dummy_evaluate_function(
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
azure_ai_project: Optional[Dict] = None,
output_path: Optional[str] = None,
**kwargs,
):
df = pd.read_json(data, lines=True)
nan_count = kwargs.get("number_of_nans", 1)
for evaluation_name, evaluator in evaluators.items():

df[f'outputs.{evaluation_name}.score'] = np.random.randint(0, 100, df.shape[0])
_add_nans(df, nan_count, f'outputs.{evaluation_name}.score')

# Add a new column with random strings
df[f'outputs.{evaluation_name}.reason'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], df.shape[0])

return {
"rows": df.to_dict(orient="records"),
}


class TestEvaluateTelemetry:
def test_evaluators_telemetry(self, mock_app_insight_logger):
f1_score = F1ScoreEvaluator()
apology_dag = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(), "test_evaluators", "apology_dag"))
apology_prompty = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(),
"test_evaluators", "apology_prompty", "apology.prompty"))

data = _get_file("evaluate_test_data.jsonl")
evaluators = {
"f1_score": f1_score,
"apology_dag": apology_dag,
"apology_prompty": apology_prompty,
"answer_length": answer_length
}

dummy_evaluate_function(
evaluators=evaluators,
data=data,
number_of_nans=1
)

evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate.start" in call.args[0]]
evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"]

evaluate_usage_info_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate_usage_info.start" in call.args[0]]
evaluate_usage_info_call_cd = evaluate_usage_info_call[0].kwargs["extra"]["custom_dimensions"]

assert mock_app_insight_logger.info.call_count == 4
assert len(evaluate_start_call) == 1
assert len(evaluate_usage_info_call) == 1

# asserts for evaluate start activity
assert evaluate_start_call_cd["track_in_cloud"] is False
assert evaluate_start_call_cd["evaluate_target"] is False
assert evaluate_start_call_cd["evaluator_config"] is False

# asserts for evaluate usage info activity
evaluators_info = json.loads(evaluate_usage_info_call_cd["evaluators_info"])
assert len(evaluators_info) == 4
for entry in evaluators_info:
if entry["alias"] == "f1_score":
assert entry["pf_type"] == "FlexFlow"
assert entry["name"] == "F1ScoreEvaluator"
assert entry["type"] == "built-in"
if entry["alias"] == "apology_dag":
assert entry["pf_type"] == "DagFlow"
assert entry["name"] == "apology_dag"
assert entry["type"] == "custom"
if entry["alias"] == "apology_prompty":
assert entry["pf_type"] == "Prompty"
assert entry["name"] == "apology_prompty"
assert entry["type"] == "custom"
if entry["alias"] == "answer_length":
assert entry["pf_type"] == "FlexFlow"
assert entry["name"] == "answer_length"
assert entry["type"] == "custom"

assert entry["failed_rows"] == 1

def test_evaluator_start_telemetry(self, mock_app_insight_logger, mock_project_scope,
mock_trace_destination_to_cloud, mock_validate_trace_destination):
hate_unfairness = HateUnfairnessEvaluator(project_scope=None)

data = _get_file("evaluate_test_data.jsonl")
evaluators = {
"hate_unfairness": hate_unfairness,
}

dummy_evaluate_function(
target=answer_length,
evaluators=evaluators,
data=data,
number_of_nans=2,
azure_ai_project=mock_project_scope,
evaluator_config={"hate_unfairness": {"model_config": "test_config"}}

)

evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate.start" in call.args[0]]
evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"]

# asserts for evaluate start activity
assert evaluate_start_call_cd["track_in_cloud"] is True
assert evaluate_start_call_cd["evaluate_target"] is True
assert evaluate_start_call_cd["evaluator_config"] is True
@@ -0,0 +1,7 @@
import re
from promptflow.core import tool


@tool
def apology(answer):
return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower()))
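
For reference, a couple of made-up inputs and the counts the regex yields (assuming the @tool decorator leaves the function directly callable, as promptflow tools normally are):

assert apology("Sorry about that, my apologies.") == 2   # matches "sorry" and "apologies"
assert apology("Shanghai") == 0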
@@ -0,0 +1,18 @@
$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
inputs:
answer:
type: string
default: "fun"
outputs:
answer:
type: string
reference: ${apology.output}
nodes:
- name: apology
type: python
source:
type: code
path: apology.py
inputs:
answer: ${inputs.answer}

@@ -0,0 +1,37 @@
---
name: basic evaluate
description: basic evaluator for QA scenario
model:
api: chat
configuration:
type: azure_openai
azure_deployment: gpt-35-turbo-1106
api_key: ${env:AZURE_OPENAI_API_KEY}
azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
parameters:
temperature: 0.2
max_tokens: 200
top_p: 1.0
response_format:
type: json_object

inputs:
answer:
type: string
outputs:
score:
type: string
---
system:
You are an AI assistant.
Your task is to determine whether the answer contains an apology. If the answer contains an apology, return 1; otherwise return 0.
The output should be valid JSON.

**Example**
answer: "Sorry, I can only answer questions related to outdoor/camping gear and equipment"
output:
{"score": "1"}

user:
answer: {{answer}}
output:
@@ -0,0 +1,4 @@
{
"question": "what's the capital of China?",
"answer": "Shanghai"
}