Adding telemetry for evaluate API #3473

Merged
5 commits merged on Jun 27, 2024
5 changes: 2 additions & 3 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -9,9 +9,8 @@
import pandas as pd

from promptflow._sdk._constants import LINE_NUMBER
from promptflow._sdk._telemetry import ActivityType, log_activity
from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
from promptflow.client import PFClient
from ._telemetry import log_evaluate_activity

from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes
from .._user_agent import USER_AGENT
@@ -258,7 +257,7 @@ def _rename_columns_conditionally(df: pd.DataFrame):
return df


@log_activity(get_telemetry_logger(), "pf.evals.evaluate", activity_type=ActivityType.PUBLICAPI, user_agent=USER_AGENT)
@log_evaluate_activity
def evaluate(
*,
evaluation_name: Optional[str] = None,
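The net effect of this change is that the public evaluate API is now instrumented through the new log_evaluate_activity decorator rather than a bare log_activity call applied at import time. A minimal sketch of a call that would be logged this way (the import path, data file, and evaluator alias are assumptions for illustration, not part of this diff):

from promptflow.evals.evaluate import evaluate            # import path assumed
from promptflow.evals.evaluators import F1ScoreEvaluator

# The decorator wraps this call in a "pf.evals.evaluate" activity and, after it
# returns, emits a "pf.evals.evaluate_usage_info" activity with per-evaluator
# details (see the new _telemetry module below).
result = evaluate(
    data="evaluate_test_data.jsonl",                       # assumed local JSONL file
    evaluators={"f1_score": F1ScoreEvaluator()},
)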
@@ -0,0 +1,143 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

__path__ = __import__("pkgutil").extend_path(__path__, __name__)

import functools
import json
import logging
import inspect

import pandas as pd

from .._utils import _trace_destination_from_project_scope
from ..._user_agent import USER_AGENT
from promptflow.core import Prompty as prompty_core
from promptflow._sdk.entities._flows import Prompty as prompty_sdk, FlexFlow as flex_flow
from promptflow._sdk.entities._flows.dag import Flow as dag_flow
from promptflow.client import PFClient

LOGGER = logging.getLogger(__name__)


def _get_evaluator_type(evaluator):
"""
Get evaluator type for telemetry. Possible values are "built-in", "custom" and "content-safety"
"""
built_in = False
content_safety = False

module = inspect.getmodule(evaluator)
built_in = (module and module.__name__.startswith("promptflow.evals.evaluators."))

if built_in:
content_safety = module.__name__.startswith("promptflow.evals.evaluators._content_safety")

return "content-safety" if content_safety else "built-in" if built_in else "custom"
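
As a quick sketch of how this classification plays out (mirroring the expectations in the unit tests added below; answer_length is a made-up user function):

from promptflow.evals.evaluators import F1ScoreEvaluator

def answer_length(answer):
    # Defined outside promptflow.evals.evaluators, so it is reported as "custom".
    return len(answer)

assert _get_evaluator_type(F1ScoreEvaluator()) == "built-in"
assert _get_evaluator_type(answer_length) == "custom"
# Content-safety evaluators such as HateUnfairnessEvaluator live under
# promptflow.evals.evaluators._content_safety and map to "content-safety".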


def _get_evaluator_properties(evaluator, evaluator_name):
"""
Get evaluator properties for telemetry
It gets name, pf_type, and type
name : tries best to get the most meaningful name for the evaluator
pf_type : The type of promptflow being used
type : The type of evaluator being used. Possible values are "built-in", "custom" and "content-safety"
"""

try:
# Cover flex flow and prompty based evaluator
if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
name = evaluator.name
pf_type = evaluator.__class__.__name__
# Cover dag flow based evaluator
elif isinstance(evaluator, dag_flow):
name = evaluator.name
pf_type = "DagFlow"
elif inspect.isfunction(evaluator):
name = evaluator.__name__
pf_type = flex_flow.__name__
elif hasattr(evaluator, "__class__") and callable(evaluator):
name = evaluator.__class__.__name__
pf_type = flex_flow.__name__
else:
# fallback option
name = str(evaluator)
pf_type = "Unknown"
except Exception as e:
LOGGER.debug(f"Failed to get evaluator properties: {e}")
name = str(evaluator)
pf_type = "Unknown"

return {
"name": name,
"pf_type": pf_type,
"type": _get_evaluator_type(evaluator),
"alias": evaluator_name if evaluator_name else ""
}
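
For a callable class evaluator, the reported payload would look roughly like this (a sketch; the class and alias are made up, and the expected values follow the branches above and the unit tests below):

class AnswerLengthEvaluator:          # hypothetical user-defined evaluator
    def __call__(self, answer: str):
        return {"length": len(answer)}

props = _get_evaluator_properties(AnswerLengthEvaluator(), "answer_len")
assert props == {"name": "AnswerLengthEvaluator", "pf_type": "FlexFlow",
                 "type": "custom", "alias": "answer_len"}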


# cspell:ignore isna
def log_evaluate_activity(func):
"""Decorator to log evaluate activity"""

@functools.wraps(func)
def wrapper(*args, **kwargs):
from promptflow._sdk._telemetry import ActivityType, log_activity
from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

evaluators = kwargs.get("evaluators", [])
azure_ai_project = kwargs.get("azure_ai_project", None)

pf_client = PFClient(
config={
"trace.destination": _trace_destination_from_project_scope(
azure_ai_project)} if azure_ai_project else None,
user_agent=USER_AGENT,
)

track_in_cloud = True if pf_client._config.get_trace_destination() else False
evaluate_target = True if kwargs.get("target", None) else False
evaluator_config = True if kwargs.get("evaluator_config", None) else False
custom_dimensions = {
"track_in_cloud": track_in_cloud,
"evaluate_target": evaluate_target,
"evaluator_config": evaluator_config,
}

with log_activity(get_telemetry_logger(), "pf.evals.evaluate", activity_type=ActivityType.PUBLICAPI,
user_agent=USER_AGENT, custom_dimensions=custom_dimensions):
result = func(*args, **kwargs)

try:
evaluators_info = []
for evaluator_name, evaluator in evaluators.items():
evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
try:
evaluator_df = pd.DataFrame(result.get("rows", [])).filter(like=f"outputs.{evaluator_name}",
axis=1)

failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int(
evaluator_df.isna().any(axis=1).sum())
total_rows = evaluator_df.shape[0]

evaluator_info["failed_rows"] = failed_rows
evaluator_info["total_rows"] = total_rows
except Exception as e:
LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
evaluators_info.append(evaluator_info)

custom_dimensions = {
"evaluators_info": json.dumps(evaluators_info)
}
with log_activity(get_telemetry_logger(), "pf.evals.evaluate_usage_info",
activity_type=ActivityType.PUBLICAPI, user_agent=USER_AGENT,
custom_dimensions=custom_dimensions):
pass
except Exception as e:
LOGGER.debug(f"Failed to collect evaluate usage info: {e}")

return result

return wrapper
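
The usage-info activity above counts a row as failed when any of that evaluator's output columns is NaN. The same pandas logic in isolation, on a made-up two-row result (column names follow the outputs.<alias>.* convention that evaluate produces):

import pandas as pd

rows = [
    {"outputs.f1_score.score": 0.81, "outputs.f1_score.reason": "ok"},
    {"outputs.f1_score.score": None, "outputs.f1_score.reason": "ok"},  # counted as failed
]
evaluator_df = pd.DataFrame(rows).filter(like="outputs.f1_score", axis=1)

failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int(
    evaluator_df.isna().any(axis=1).sum())
total_rows = evaluator_df.shape[0]
assert (failed_rows, total_rows) == (1, 2)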
@@ -0,0 +1,159 @@
import json
import os
import pathlib
from typing import Optional, Callable, Dict
from unittest.mock import patch, MagicMock

import numpy as np
import pandas as pd
import pytest

from promptflow.evals.evaluate._telemetry import log_evaluate_activity
from promptflow.evals.evaluators import F1ScoreEvaluator, HateUnfairnessEvaluator
from promptflow.client import load_flow


def _add_nans(df, n, column_name):
mask = np.full(df.shape[0], False) # Start with an all False mask (no NaNs)
mask[:n] = True # Set the first 'n' values to True
np.random.shuffle(mask) # Shuffle to distribute the NaNs randomly

# Apply the mask to assign NaNs in the DataFrame column
df.loc[mask, column_name] = np.nan
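
A toy illustration (not part of the checked-in test data): after the call, exactly n values in the column are NaN.

toy_df = pd.DataFrame({"outputs.f1_score.score": np.random.randint(0, 100, 5)})
_add_nans(toy_df, 2, "outputs.f1_score.score")
assert toy_df["outputs.f1_score.score"].isna().sum() == 2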


def _get_file(name):
"""Get the file from the unittest data folder."""
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
return os.path.join(data_path, name)


def answer_length(answer):
return len(answer)


@pytest.fixture
def mock_app_insight_logger():
"""Mock validate trace destination config to use in unit tests."""
logger = MagicMock()
logger.info = MagicMock()
logger.error = MagicMock()
with patch("promptflow._sdk._telemetry.telemetry.get_telemetry_logger", return_value=logger):
yield logger


@log_evaluate_activity
def dummy_evaluate_function(
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
azure_ai_project: Optional[Dict] = None,
output_path: Optional[str] = None,
**kwargs,
):
df = pd.read_json(data, lines=True)
nan_count = kwargs.get("number_of_nans", 1)
for evaluation_name, evaluator in evaluators.items():

df[f'outputs.{evaluation_name}.score'] = np.random.randint(0, 100, df.shape[0])
_add_nans(df, nan_count, f'outputs.{evaluation_name}.score')

# Add a new column with random strings
df[f'outputs.{evaluation_name}.reason'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], df.shape[0])

return {
"rows": df.to_dict(orient="records"),
}


class TestEvaluateTelemetry:
def test_evaluators_telemetry(self, mock_app_insight_logger):
f1_score = F1ScoreEvaluator()
apology_dag = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(), "test_evaluators", "apology_dag"))
apology_prompty = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(),
"test_evaluators", "apology_prompty", "apology.prompty"))

data = _get_file("evaluate_test_data.jsonl")
evaluators = {
"f1_score": f1_score,
"apology_dag": apology_dag,
"apology_prompty": apology_prompty,
"answer_length": answer_length
}

dummy_evaluate_function(
evaluators=evaluators,
data=data,
number_of_nans=1
)

evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate.start" in call.args[0]]
evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"]

evaluate_usage_info_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate_usage_info.start" in call.args[0]]
evaluate_usage_info_call_cd = evaluate_usage_info_call[0].kwargs["extra"]["custom_dimensions"]

assert mock_app_insight_logger.info.call_count == 4
assert len(evaluate_start_call) == 1
assert len(evaluate_usage_info_call) == 1

# asserts for evaluate start activity
assert evaluate_start_call_cd["track_in_cloud"] is False
assert evaluate_start_call_cd["evaluate_target"] is False
assert evaluate_start_call_cd["evaluator_config"] is False

# asserts for evaluate usage info activity
evaluators_info = json.loads(evaluate_usage_info_call_cd["evaluators_info"])
assert len(evaluators_info) == 4
for entry in evaluators_info:
if entry["alias"] == "f1_score":
assert entry["pf_type"] == "FlexFlow"
assert entry["name"] == "F1ScoreEvaluator"
assert entry["type"] == "built-in"
if entry["alias"] == "apology_dag":
assert entry["pf_type"] == "DagFlow"
assert entry["name"] == "apology_dag"
assert entry["type"] == "custom"
if entry["alias"] == "apology_prompty":
assert entry["pf_type"] == "Prompty"
assert entry["name"] == "apology_prompty"
assert entry["type"] == "custom"
if entry["alias"] == "answer_length":
assert entry["pf_type"] == "FlexFlow"
assert entry["name"] == "answer_length"
assert entry["type"] == "custom"

assert entry["failed_rows"] == 1

def test_evaluator_start_telemetry(self, mock_app_insight_logger, mock_project_scope,
mock_trace_destination_to_cloud, mock_validate_trace_destination):
hate_unfairness = HateUnfairnessEvaluator(project_scope=None)

data = _get_file("evaluate_test_data.jsonl")
evaluators = {
"hate_unfairness": hate_unfairness,
}

dummy_evaluate_function(
target=answer_length,
evaluators=evaluators,
data=data,
number_of_nans=2,
azure_ai_project=mock_project_scope,
evaluator_config={"hate_unfairness": {"model_config": "test_config"}}

)

evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if
"pf.evals.evaluate.start" in call.args[0]]
evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"]

# asserts for evaluate start activity
assert evaluate_start_call_cd["track_in_cloud"] is True
assert evaluate_start_call_cd["evaluate_target"] is True
assert evaluate_start_call_cd["evaluator_config"] is True
@@ -0,0 +1,7 @@
import re
from promptflow.core import tool


@tool
def apology(answer):
return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower()))
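
For reference, a couple of made-up inputs and the counts the regex yields (assuming the @tool decorator leaves the function directly callable, as promptflow tools normally are):

assert apology("Sorry about that, my apologies.") == 2   # matches "sorry" and "apologies"
assert apology("Shanghai") == 0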
@@ -0,0 +1,18 @@
$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
inputs:
answer:
type: string
default: "fun"
outputs:
answer:
type: string
reference: ${apology.output}
nodes:
- name: apology
type: python
source:
type: code
path: apology.py
inputs:
answer: ${inputs.answer}

@@ -0,0 +1,37 @@
---
name: basic evaluate
description: basic evaluator for QA scenario
model:
api: chat
configuration:
type: azure_openai
azure_deployment: gpt-35-turbo-1106
api_key: ${env:AZURE_OPENAI_API_KEY}
azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
parameters:
temperature: 0.2
max_tokens: 200
top_p: 1.0
response_format:
type: json_object

inputs:
answer:
type: string
outputs:
score:
type: string
---
system:
You are an AI assistant.
Your task is to determine whether the answer contains an apology. If the answer contains an apology, return 1; otherwise return 0.
The output should be valid JSON.

**Example**
answer: "Sorry, I can only answer questions related to outdoor/camping gear and equipment"
output:
{"score": "1"}

user:
answer: {{answer}}
output:
@@ -0,0 +1,4 @@
{
"question": "what's the capital of China?",
"answer": "Shanghai"
}