feedback
Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>
B-Step62 committed Feb 29, 2024
1 parent 592f0df commit a62c2c3
Showing 6 changed files with 54 additions and 54 deletions.
2 changes: 1 addition & 1 deletion docs/source/deployment/deploy-model-locally.rst
@@ -104,7 +104,7 @@ the required payload format, you can leverage the dict payload structures below.
     - Pandas DataFrame in the ``records`` orientation. **We do not recommend using this format because it is not guaranteed to preserve column ordering.**
     -
       .. code-block:: python

           {"dataframe_records": pandas_df.to_dict(orient="records")}

   * - ``instances``
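For reference, a minimal sketch of posting this payload to a locally served model, assuming a server started with ``mlflow models serve`` on the default port (the URL and DataFrame are illustrative, not part of this commit):

.. code-block:: python

    import pandas as pd
    import requests

    pandas_df = pd.DataFrame({"question": ["What is MLflow?", "What is Spark?"]})

    # Send the records-oriented payload to the local scoring endpoint.
    response = requests.post(
        "http://127.0.0.1:5000/invocations",
        json={"dataframe_records": pandas_df.to_dict(orient="records")},
    )
    print(response.json())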
37 changes: 17 additions & 20 deletions docs/source/llms/llm-evaluate/index.rst
@@ -509,7 +509,7 @@ Please don't forget to set the target deployment client by using :py:func:`mlflo

.. hint::

-  When you want to use an external endpoint **not** hosted by an MLflow Deployments Server or Databricks, you can create a custom Python function following the :ref:`Evaluating with a Custom Function <llm-eval-custom-function>` guide and use it as the ``model`` argument.
+  When you want to use an endpoint **not** hosted by an MLflow Deployments Server or Databricks, you can create a custom Python function following the :ref:`Evaluating with a Custom Function <llm-eval-custom-function>` guide and use it as the ``model`` argument.

Supported Input Data Formats
****************************
@@ -555,38 +555,35 @@ The input data can be either of the following format when using an URI of the ML
        ]
    }
-   - In this format, the dictionary should have the correct request format for your model endpoint.
+   - In this format, the dictionary should have the correct request format for your model endpoint. Please refer to the `MLflow Deployments documentation <../deployments/index.html#standard-query-parameters>`_ for more information about the request format for different model endpoint types.

- * - A (nested) list of input strings.
+ * - A list of input strings.
-
.. code-block:: python
    [
-       ["What is MLflow?"],
-       ["What is Spark?"],
+       "What is MLflow?",
+       "What is Spark?",
    ]
-   - The :py:func:`mlflow.evaluate()` will also accepts a list input. One notable requirement is that the each
-     list element i.e. input string needs to be wrapped in another list, so they can be passed as a single prediction request to the model endpoint.
+   - The :py:func:`mlflow.evaluate()` also accepts a list input.

- * - A (nested) list of input strings.
+ * - A list of request payload (dictionary).
-
.. code-block:: python
    [
-       [
-           {
-               "messages": [
-                   {"role": "system", "content": "Please answer."},
-                   {"role": "user", "content": "What is MLflow?"},
-               ],
-               "max_tokens": 100,
-           },
-           # ... more dictionary records
-       ]
+       {
+           "messages": [
+               {"role": "system", "content": "Please answer."},
+               {"role": "user", "content": "What is MLflow?"},
+           ],
+           "max_tokens": 100,
+       },
+       # ... more dictionary records
    ]
-   - Similar requirements as the above list format apply here as well.
+   - Similarly to Pandas DataFrame input, the dictionary should have the correct request format for your model endpoint.



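To make the list formats above concrete, here is a minimal sketch of evaluating a chat endpoint with a flat list of strings; the deployments target, endpoint name, and model type are illustrative assumptions, not part of this commit:

.. code-block:: python

    import mlflow

    # Illustrative target; point this at your own MLflow Deployments Server.
    mlflow.deployments.set_deployments_target("http://localhost:5000")

    # A flat list of strings: each element is sent as one prediction request.
    results = mlflow.evaluate(
        model="endpoints:/chat",
        data=["What is MLflow?", "What is Spark?"],
        model_type="question-answering",
    )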
@@ -597,7 +594,7 @@ You can pass additional inference parameters such as ``max_tokens``, ``temperatu

.. note::

-  When your input is a dictionary format tha represents request payload, it can also include the parameters like ``max_tokens``. If there are overlapping parameters in both the ``inference_params`` and the input data, the values in the ``inference_params`` will take precedence.
+  When your input is a dictionary format that represents request payload, it can also include the parameters like ``max_tokens``. If there are overlapping parameters in both the ``inference_params`` and the input data, the values in the ``inference_params`` will take precedence.

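A short sketch of this precedence rule (endpoint name and values are illustrative): ``max_tokens`` appears in both the payload and ``inference_params``, and the ``inference_params`` value wins:

.. code-block:: python

    results = mlflow.evaluate(
        model="endpoints:/chat",
        data=[
            {
                "messages": [{"role": "user", "content": "What is MLflow?"}],
                "max_tokens": 50,  # overridden by inference_params below
            }
        ],
        inference_params={"max_tokens": 100, "temperature": 0.5},
        model_type="question-answering",
    )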
Examples
********
4 changes: 2 additions & 2 deletions mlflow/metrics/genai/genai_metric.py
@@ -275,8 +275,8 @@ def eval_fn(
    if not isinstance(eval_model, str):
        raise MlflowException(
            message="The model argument must be a string URI referring to an openai model "
-           "(openai:/gpt-3.5-turbo) or MLflow deployment endpoint (endpoint:/my-route), "
-           f"passed {eval_model} instead",
+           "(openai:/gpt-3.5-turbo) or an MLflow Deployment endpoint "
+           f"(endpoints:/my-endpoint), passed {eval_model} instead",
            error_code=INVALID_PARAMETER_VALUE,
        )

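As a hedged illustration of the two URI forms this message accepts, using an assumed built-in GenAI metric (the metric choice and endpoint name are illustrative, not from this commit):

.. code-block:: python

    from mlflow.metrics.genai import answer_similarity

    # OpenAI model URI
    metric = answer_similarity(model="openai:/gpt-3.5-turbo")

    # MLflow Deployment endpoint URI (endpoint name is hypothetical)
    metric = answer_similarity(model="endpoints:/my-endpoint")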
2 changes: 1 addition & 1 deletion mlflow/metrics/genai/model_utils.py
@@ -120,7 +120,7 @@ def _call_deployments_api(deployment_uri, payload, eval_parameters, wrap_payload
        deployment_uri: The URI of the deployment endpoint.
        payload: The input payload to send to the endpoint.
        eval_parameters: The evaluation parameters to send to the endpoint.
-       construct_payload: Whether to wrap the payload in a expected key by the endpoint,
+       wrap_payload: Whether to wrap the payload in a expected key by the endpoint,
            e.g. "prompt" for completions or "messages" for chat. If False, the specified
            payload is directly sent to the endpoint combined with the eval_parameters.
    Returns:
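A hedged sketch of the wrapping behavior the docstring describes — a hypothetical helper for illustration, not the function's actual implementation:

.. code-block:: python

    def wrap_payload_example(payload: str, endpoint_type: str) -> dict:
        # Hypothetical: nest a bare prompt under the key the endpoint expects.
        if endpoint_type == "llm/v1/completions":
            return {"prompt": payload}
        if endpoint_type == "llm/v1/chat":
            return {"messages": [{"role": "user", "content": payload}]}
        return {"inputs": payload}

    wrap_payload_example("What is MLflow?", "llm/v1/completions")
    # -> {"prompt": "What is MLflow?"}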
8 changes: 7 additions & 1 deletion mlflow/models/evaluation/base.py
@@ -429,7 +429,7 @@ def _is_empty_list_or_array(data):
    if isinstance(data, list):
        return len(data) == 0
    elif isinstance(data, np.ndarray):
-       return data.size == 0 or data.ndim == 0
+       return data.size == 0
    return False


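A sketch of the revised behavior, assuming the helper is importable; a 0-d NumPy scalar has ``size == 1``, so it is no longer reported as empty after this change:

.. code-block:: python

    import numpy as np

    _is_empty_list_or_array([])            # True: empty list
    _is_empty_list_or_array(np.array([]))  # True: size == 0
    _is_empty_list_or_array(np.array(1))   # False now: 0-d array has size 1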
@@ -459,6 +459,8 @@ def _hash_array_of_dict_as_bytes(data):
            result += _hash_array_of_dict_as_bytes(elm)
        elif isinstance(elm, dict):
            result += _hash_dict_as_bytes(elm)
+       else:
+           result += _hash_data_as_bytes(elm)
    return result


@@ -1181,6 +1183,10 @@ def _convert_data_to_mlflow_dataset(data, targets=None, predictions=None):
    from pyspark.sql import DataFrame as SparkDataFrame

    if isinstance(data, list):
+       # If the list is flat, we assume each element is an independent sample.
+       if not isinstance(data[0], (list, np.ndarray)):
+           data = [[elm] for elm in data]
+
        return mlflow.data.from_numpy(
            np.array(data), targets=np.array(targets) if targets else None
        )
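For intuition, a small sketch (illustrative data) of why the wrapping matters: it yields the 2-D shape that ``from_numpy`` treats as one row per sample and one column per feature:

.. code-block:: python

    import numpy as np

    np.array(["What is MLflow?", "What is Spark?"]).shape      # (2,)
    np.array([["What is MLflow?"], ["What is Spark?"]]).shape  # (2, 1)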
55 changes: 26 additions & 29 deletions tests/evaluate/test_evaluation.py
@@ -1646,31 +1646,29 @@ def test_is_model_deployment_endpoint_uri(model, is_endpoint_uri):
        ),
        # List of string
        (
-           [[q] for q in _TEST_QUERY_LIST],
+           _TEST_QUERY_LIST,
            None,
            _TEST_GT_LIST,
        ),
        # List of string with feature_names
        (
-           [[q] for q in _TEST_QUERY_LIST],
+           _TEST_QUERY_LIST,
            ["question"],
            _TEST_GT_LIST,
        ),
        # List of string with feature_names and w/o targets
        (
-           [[q] for q in _TEST_QUERY_LIST],
+           _TEST_QUERY_LIST,
            ["question"],
            None,
        ),
        # List of dictionary with feature_names
        (
            [
-               [
-                   {
-                       "messages": [{"content": q, "role": "user"}],
-                       "max_tokens": 10,
-                   }
-               ]
+               {
+                   "messages": [{"content": q, "role": "user"}],
+                   "max_tokens": 10,
+               }
                for q in _TEST_QUERY_LIST
            ],
            None,
@@ -1693,6 +1691,7 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
        inference_params={"max_tokens": 10, "temperature": 0.5},
    )

+   # Validate the endpoint is called with correct payloads
    call_args_list = mock_deploy_client.return_value.predict.call_args_list
    expected_calls = [
        mock.call(
@@ -1713,11 +1712,17 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
        ),
    ]
    assert all(call in call_args_list for call in expected_calls)

+   # Validate the evaluation metrics
    expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"}
    if targets:
        expected_metrics_subset.add("exact_match/v1")
    assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
+
+   # Validate the model output is passed to the evaluator in the correct format (string)
+   eval_results_table = eval_result.tables["eval_results_table"]
+   assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))

_DUMMY_COMPLETION_RESPONSE = {
    "id": "1",
@@ -1736,26 +1741,11 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
@pytest.mark.parametrize(
    ("input_data", "feature_names"),
    [
-       (
-           pd.DataFrame({"inputs": _TEST_QUERY_LIST}),
-           None,
-       ),
-       (
-           pd.DataFrame({"question": _TEST_QUERY_LIST}),
-           ["question"],
-       ),
-       (
-           pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}),
-           None,
-       ),
-       (
-           [[q] for q in _TEST_QUERY_LIST],
-           None,
-       ),
-       (
-           [[{"prompt": q}] for q in _TEST_QUERY_LIST],
-           None,
-       ),
+       (pd.DataFrame({"inputs": _TEST_QUERY_LIST}), None),
+       (pd.DataFrame({"question": _TEST_QUERY_LIST}), ["question"]),
+       (pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}), None),
+       (_TEST_QUERY_LIST, None),
+       ([{"prompt": q} for q in _TEST_QUERY_LIST], None),
    ],
)
@mock.patch("mlflow.deployments.get_deploy_client")
@@ -1772,19 +1762,26 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
        feature_names=feature_names,
    )

+   # Validate the endpoint is called with correct payloads
    call_args_list = mock_deploy_client.return_value.predict.call_args_list
    expected_calls = [
        mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
        mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}),
    ]
    assert all(call in call_args_list for call in expected_calls)

+   # Validate the evaluation metrics
    expected_metrics_subset = {
        "toxicity/v1/ratio",
        "ari_grade_level/v1/mean",
        "flesch_kincaid_grade_level/v1/mean",
    }
    assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
+
+   # Validate the model output is passed to the evaluator in the correct format (string)
+   eval_results_table = eval_result.tables["eval_results_table"]
+   assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))

@pytest.mark.parametrize(
("input_data", "error_message"),
