Fix docstrings (#3449)
# Description

Fix the problem where docstrings were not shown for the evaluators on the reference page:
https://microsoft.github.io/promptflow/reference/python-library-reference/promptflow-evals/promptflow.evals.evaluators.html.
See work item 3305596.
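
The core of the fix is moving each evaluator's docstring from `__init__` onto the class body, since Sphinx autodoc renders the class-level docstring on the generated reference page. A minimal sketch of the before/after pattern (the class names below are hypothetical, not the real evaluators):

```python
# Sketch of the pattern applied across the evaluators in this PR; the class
# names are hypothetical -- see the real diffs below for the actual evaluators.


class ExampleEvaluatorBefore:
    def __init__(self, model_config):
        """Docstring only on __init__: autodoc did not surface it on the reference page."""
        self._model_config = model_config


class ExampleEvaluatorAfter:
    """
    Initialize an evaluator configured for a specific model.

    :param model_config: Configuration for the model.
    :type model_config: dict
    """

    def __init__(self, model_config):
        self._model_config = model_config
```

Each real class docstring also gains **Usage** and **Output format** code blocks, and `scripts/docs/conf.py` is updated so that `__call__` docstrings are rendered as well (see the conf.py hunk and the sketch that follows it).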

# All Promptflow Contribution checklist:
- [x] **The pull request does not introduce [breaking changes].**
- [x] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [x] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [x] Title of the pull request is clear and informative.
- [x] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [x] Pull request includes test coverage for the included changes.
nick863 committed Jun 25, 2024
1 parent 181952a commit 2eb9cb4
Showing 18 changed files with 531 additions and 308 deletions.
6 changes: 6 additions & 0 deletions scripts/docs/conf.py
@@ -152,6 +152,12 @@
myst_heading_anchors = 5


# include __call__ (and its docstring) when autodoc documents a class
autodoc_default_options = {
'special-members': '__call__',
}


def setup(app):
# Add the gallery directive
app.add_directive("gallery-grid", GalleryDirective)
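
Autodoc skips special members such as `__call__` by default, so without this option the `__call__` docstrings touched in this PR would still be omitted from the generated reference. A rough sketch of the effect, using a hypothetical evaluator rather than one of the real classes:

```python
# Hypothetical evaluator used only to illustrate the effect of
# autodoc_default_options = {'special-members': '__call__'}:
# Sphinx now documents __call__ (signature + docstring) under the class entry.


class AnswerLengthEvaluator:
    """Toy evaluator that scores an answer by its length."""

    def __call__(self, *, answer: str, **kwargs):
        """
        Evaluate answer length.

        :param answer: The answer to be evaluated.
        :type answer: str
        :return: A dict with the computed metric, e.g. {"answer_length": 24}.
        :rtype: dict
        """
        return {"answer_length": len(answer)}
```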
86 changes: 54 additions & 32 deletions src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py
@@ -22,39 +22,59 @@


class ChatEvaluator:
"""
Initialize a chat evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = ChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
**Output format**
.. code-block:: python
{
"evaluation_per_turn": {
"gpt_retrieval": [1.0, 2.0],
"gpt_groundedness": [5.0, 2.0],
"gpt_relevance": [3.0, 5.0],
"gpt_coherence": [1.0, 2.0],
"gpt_fluency": [3.0, 5.0]
},
"gpt_retrieval": 1.5,
"gpt_groundedness": 3.5,
"gpt_relevance": 4.0,
"gpt_coherence": 1.5,
"gpt_fluency": 4.0
}
"""

def __init__(
self, model_config: AzureOpenAIModelConfiguration, eval_last_turn: bool = False, parallel: bool = True
):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
:type eval_last_turn: bool
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:type parallel: bool
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = ChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
}
]
result = chat_eval(conversation=conversation)
"""
self._eval_last_turn = eval_last_turn
self._parallel = parallel

@@ -73,7 +93,8 @@ def __init__(
self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)

def __call__(self, *, conversation, **kwargs):
"""Evaluates chat scenario.
"""
Evaluates chat scenario.
:param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
"context" key is optional for assistant's turn and should have "citations" key with list of citations.
@@ -222,7 +243,8 @@ def _validate_conversation(self, conversation: List[Dict]):
one_based_turn_num = turn_num + 1

if not isinstance(turn, dict):
raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")
raise ValueError(
f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}")

if "role" not in turn or "content" not in turn:
raise ValueError(
@@ -16,30 +16,44 @@


class RetrievalChatEvaluator:
def __init__(self, model_config: AzureOpenAIModelConfiguration):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = RetrievalChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
"""
Initialize an evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: function
**Usage**
.. code-block:: python
chat_eval = RetrievalChatEvaluator(model_config)
conversation = [
{"role": "user", "content": "What is the value of 2 + 2?"},
{"role": "assistant", "content": "2 + 2 = 4", "context": {
"citations": [
{"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
]
}
]
result = chat_eval(conversation=conversation)
"""
}
]
result = chat_eval(conversation=conversation)
**Output format**
.. code-block:: python
{
"gpt_retrieval": 3.0
"evaluation_per_turn": {
"gpt_retrieval": {
"score": [1.0, 2.0, 3.0]
}
}
}
"""

def __init__(self, model_config: AzureOpenAIModelConfiguration):
# TODO: Remove this block once the bug is fixed
# https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324
if model_config.api_version is None:
@@ -12,22 +12,31 @@


class CoherenceEvaluator:
def __init__(self, model_config: AzureOpenAIModelConfiguration):
"""
Initialize an evaluator configured for a specific Azure OpenAI model.
"""
Initialize a coherence evaluator configured for a specific Azure OpenAI model.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: AzureOpenAIModelConfiguration
**Usage**
**Usage**
.. code-block:: python
.. code-block:: python
eval_fn = CoherenceEvaluator(model_config)
result = eval_fn(
question="What is the capital of Japan?",
answer="The capital of Japan is Tokyo.")
"""
eval_fn = CoherenceEvaluator(model_config)
result = eval_fn(
question="What is the capital of Japan?",
answer="The capital of Japan is Tokyo.")
**Output format**
.. code-block:: python
{
"gpt_coherence": 1.0
}
"""

def __init__(self, model_config: AzureOpenAIModelConfiguration):
# TODO: Remove this block once the bug is fixed
# https://msdata.visualstudio.com/Vienna/_workitems/edit/3151324
if model_config.api_version is None:
@@ -39,7 +48,9 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration):
self._flow = load_flow(source=prompty_path, model=prompty_model_config)

def __call__(self, *, question: str, answer: str, **kwargs):
"""Evaluate coherence.
"""
Evaluate coherence.
:param question: The question to be evaluated.
:type question: str
:param answer: The answer to be evaluated.
@@ -7,35 +7,55 @@


class ContentSafetyEvaluator:
def __init__(self, project_scope: dict, parallel: bool = True, credential=None):
"""
Initialize an evaluator configured to evaluate content safety metrics for QA scenario.
"""
Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
:rtype: function
:param project_scope: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type project_scope: dict
:param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
Default is True.
:param credential: The credential for connecting to Azure AI project.
:type credential: TokenCredential
:return: A function that evaluates content-safety metrics for "question-answering" scenario.
:rtype: function
**Usage**
**Usage**
.. code-block:: python
.. code-block:: python
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ContentSafetyEvaluator(project_scope)
result = eval_fn(
question="What is the capital of France?",
answer="Paris.",
)
"""
project_scope = {
"subscription_id": "<subscription_id>",
"resource_group_name": "<resource_group_name>",
"project_name": "<project_name>",
}
eval_fn = ContentSafetyEvaluator(project_scope)
result = eval_fn(
question="What is the capital of France?",
answer="Paris.",
)
**Output format**
.. code-block:: python
{
"violence": "Medium",
"violence_score": 5.0,
"violence_reason": "Some reason",
"sexual": "Medium",
"sexual_score": 5.0,
"sexual_reason": "Some reason",
"self_harm": "Medium",
"self_harm_score": 5.0,
"self_harm_reason": "Some reason",
"hate_unfairness": "Medium",
"hate_unfairness_score": 5.0,
"hate_unfairness_reason": "Some reason"
}
"""

def __init__(self, project_scope: dict, parallel: bool = True, credential=None):
self._parallel = parallel
self._evaluators = [
ViolenceEvaluator(project_scope, credential),
@@ -45,7 +65,8 @@ def __init__(self, project_scope: dict, parallel: bool = True, credential=None):
]

def __call__(self, *, question: str, answer: str, **kwargs):
"""Evaluates content-safety metrics for "question-answering" scenario.
"""
Evaluates content-safety metrics for "question-answering" scenario.
:param question: The question to be evaluated.
:type question: str