diff --git a/docs/source/deployment/deploy-model-locally.rst b/docs/source/deployment/deploy-model-locally.rst
index 4e2dc55b82c58..2e56eda17974d 100644
--- a/docs/source/deployment/deploy-model-locally.rst
+++ b/docs/source/deployment/deploy-model-locally.rst
@@ -104,7 +104,7 @@ the required payload format, you can leverage the dict payload structures below.
 
      - Pandas DataFrame in the ``records`` orientation. **We do not recommend using this format because it is not guaranteed to preserve column ordering.**
      - .. code-block:: python
-
+
          {"dataframe_records": pandas_df.to_dict(orient="records")}
 
    * - ``instances``
diff --git a/docs/source/llms/llm-evaluate/index.rst b/docs/source/llms/llm-evaluate/index.rst
index c2ed14227dc5b..9969645b53810 100644
--- a/docs/source/llms/llm-evaluate/index.rst
+++ b/docs/source/llms/llm-evaluate/index.rst
@@ -509,7 +509,7 @@ Please don't forget to set the target deployment client by using :py:func:`mlflo
 
 .. hint::
 
-    When you want to use an external endpoint **not** hosted by an MLflow Deployments Server or Databricks, you can create a custom Python function following the :ref:`Evaluating with a Custom Function ` guide and use it as the ``model`` argument.
+    When you want to use an endpoint **not** hosted by an MLflow Deployments Server or Databricks, you can create a custom Python function following the :ref:`Evaluating with a Custom Function ` guide and use it as the ``model`` argument.
 
 Supported Input Data Formats
 ****************************
@@ -555,38 +555,35 @@ The input data can be either of the following format when using an URI of the ML
             ]
         }
 
-     - In this format, the dictionary should have the correct request format for your model endpoint.
+     - In this format, the dictionary should have the correct request format for your model endpoint. Please refer to the `MLflow Deployments documentation <../deployments/index.html#standard-query-parameters>`_ for more information about the request format for different model endpoint types.
 
-   * - A (nested) list of input strings.
+   * - A list of input strings.
 
     - .. code-block:: python
 
        [
-           ["What is MLflow?"],
-           ["What is Spark?"],
+           "What is MLflow?",
+           "What is Spark?",
        ]
 
-     - The :py:func:`mlflow.evaluate()` will also accepts a list input. One notable requirement is that the each
-       list element i.e. input string needs to be wrapped in another list, so they can be passed as a single prediction request to the model endpoint.
+     - The :py:func:`mlflow.evaluate()` also accepts a list input. Each element is sent to the model endpoint as a single prediction request.
 
-   * - A (nested) list of input strings.
+   * - A list of request payloads (dictionaries).
 
     - .. code-block:: python
 
        [
-           [
-               {
-                   "messages": [
-                       {"role": "system", "content": "Please answer."},
-                       {"role": "user", "content": "What is MLflow?"},
-                   ],
-                   "max_tokens": 100,
-               },
-               # ... more dictionary records
-           ]
+           {
+               "messages": [
+                   {"role": "system", "content": "Please answer."},
+                   {"role": "user", "content": "What is MLflow?"},
+               ],
+               "max_tokens": 100,
+           },
+           # ... more dictionary records
        ]
 
-     - Similar requirements as the above list format apply here as well.
+     - As with the Pandas DataFrame input, each dictionary should have the correct request format for your model endpoint.
 
 
@@ -597,7 +594,7 @@ You can pass additional inference parameters such as ``max_tokens``, ``temperatu
 
 .. note::
 
-    When your input is a dictionary format tha represents request payload, it can also include the parameters like ``max_tokens``. If there are overlapping parameters in both the ``inference_params`` and the input data, the values in the ``inference_params`` will take precedence.
+    When your input is a dictionary that represents the request payload, it can also include parameters like ``max_tokens``. If a parameter appears in both ``inference_params`` and the input data, the value in ``inference_params`` takes precedence.
 
 Examples
 ********
diff --git a/mlflow/metrics/genai/genai_metric.py b/mlflow/metrics/genai/genai_metric.py
index 0e3d31d12b0ba..0c8c79e4ea16a 100644
--- a/mlflow/metrics/genai/genai_metric.py
+++ b/mlflow/metrics/genai/genai_metric.py
@@ -275,8 +275,8 @@ def eval_fn(
         if not isinstance(eval_model, str):
             raise MlflowException(
                 message="The model argument must be a string URI referring to an openai model "
-                "(openai:/gpt-3.5-turbo) or MLflow deployment endpoint (endpoint:/my-route), "
-                f"passed {eval_model} instead",
+                "(openai:/gpt-3.5-turbo) or an MLflow Deployments endpoint "
+                f"(endpoints:/my-endpoint), passed {eval_model} instead",
                 error_code=INVALID_PARAMETER_VALUE,
             )
 
diff --git a/mlflow/metrics/genai/model_utils.py b/mlflow/metrics/genai/model_utils.py
index 350e96daf4201..560464b3933e1 100644
--- a/mlflow/metrics/genai/model_utils.py
+++ b/mlflow/metrics/genai/model_utils.py
@@ -120,7 +120,7 @@ def _call_deployments_api(deployment_uri, payload, eval_parameters, wrap_payload
         deployment_uri: The URI of the deployment endpoint.
         payload: The input payload to send to the endpoint.
         eval_parameters: The evaluation parameters to send to the endpoint.
-        construct_payload: Whether to wrap the payload in a expected key by the endpoint,
+        wrap_payload: Whether to wrap the payload in the key expected by the endpoint,
             e.g. "prompt" for completions or "messages" for chat. If False, the specified
             payload is directly sent to the endpoint combined with the eval_parameters.
     Returns:
diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py
index b74882da49e60..50a9834494b25 100644
--- a/mlflow/models/evaluation/base.py
+++ b/mlflow/models/evaluation/base.py
@@ -429,7 +429,7 @@ def _is_empty_list_or_array(data):
     if isinstance(data, list):
         return len(data) == 0
     elif isinstance(data, np.ndarray):
-        return data.size == 0 or data.ndim == 0
+        return data.size == 0
     return False
 
 
@@ -459,6 +459,8 @@ def _hash_array_of_dict_as_bytes(data):
             result += _hash_array_of_dict_as_bytes(elm)
         elif isinstance(elm, dict):
             result += _hash_dict_as_bytes(elm)
+        else:
+            result += _hash_data_as_bytes(elm)
     return result
 
 
@@ -1181,6 +1183,10 @@ def _convert_data_to_mlflow_dataset(data, targets=None, predictions=None):
     from pyspark.sql import DataFrame as SparkDataFrame
 
     if isinstance(data, list):
+        # If the list is flat, we assume each element is an independent sample.
+        if not isinstance(data[0], (list, np.ndarray)):
+            data = [[elm] for elm in data]
+
         return mlflow.data.from_numpy(
             np.array(data), targets=np.array(targets) if targets else None
         )
diff --git a/tests/evaluate/test_evaluation.py b/tests/evaluate/test_evaluation.py
index 2f690fae51362..7cce9de9068dd 100644
--- a/tests/evaluate/test_evaluation.py
+++ b/tests/evaluate/test_evaluation.py
@@ -1646,31 +1646,29 @@ def test_is_model_deployment_endpoint_uri(model, is_endpoint_uri):
         ),
         # List of string
         (
-            [[q] for q in _TEST_QUERY_LIST],
+            _TEST_QUERY_LIST,
             None,
             _TEST_GT_LIST,
         ),
         # List of string with feature_names
         (
-            [[q] for q in _TEST_QUERY_LIST],
+            _TEST_QUERY_LIST,
             ["question"],
             _TEST_GT_LIST,
         ),
         # List of string with feature_names and w/o targets
         (
-            [[q] for q in _TEST_QUERY_LIST],
+            _TEST_QUERY_LIST,
             ["question"],
             None,
         ),
         # List of dictionary with feature_names
         (
             [
-                [
-                    {
-                        "messages": [{"content": q, "role": "user"}],
-                        "max_tokens": 10,
-                    }
-                ]
+                {
+                    "messages": [{"content": q, "role": "user"}],
+                    "max_tokens": 10,
+                }
                 for q in _TEST_QUERY_LIST
             ],
             None,
@@ -1693,6 +1691,7 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
         inference_params={"max_tokens": 10, "temperature": 0.5},
     )
 
+    # Validate the endpoint is called with correct payloads
     call_args_list = mock_deploy_client.return_value.predict.call_args_list
     expected_calls = [
         mock.call(
@@ -1713,11 +1712,17 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
         ),
     ]
     assert all(call in call_args_list for call in expected_calls)
+
+    # Validate the evaluation metrics
     expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"}
     if targets:
         expected_metrics_subset.add("exact_match/v1")
     assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
 
+    # Validate the model output is passed to the evaluator in the correct format (string)
+    eval_results_table = eval_result.tables["eval_results_table"]
+    assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
+
 
 _DUMMY_COMPLETION_RESPONSE = {
     "id": "1",
@@ -1736,26 +1741,11 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
 @pytest.mark.parametrize(
     ("input_data", "feature_names"),
     [
-        (
-            pd.DataFrame({"inputs": _TEST_QUERY_LIST}),
-            None,
-        ),
-        (
-            pd.DataFrame({"question": _TEST_QUERY_LIST}),
-            ["question"],
-        ),
-        (
-            pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}),
-            None,
-        ),
-        (
-            [[q] for q in _TEST_QUERY_LIST],
-            None,
-        ),
-        (
-            [[{"prompt": q}] for q in _TEST_QUERY_LIST],
-            None,
-        ),
+        (pd.DataFrame({"inputs": _TEST_QUERY_LIST}), None),
+        (pd.DataFrame({"question": _TEST_QUERY_LIST}), ["question"]),
+        (pd.DataFrame({"inputs": [{"prompt": q} for q in _TEST_QUERY_LIST]}), None),
+        (_TEST_QUERY_LIST, None),
+        ([{"prompt": q} for q in _TEST_QUERY_LIST], None),
     ],
 )
 @mock.patch("mlflow.deployments.get_deploy_client")
@@ -1772,12 +1762,15 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
         feature_names=feature_names,
     )
 
+    # Validate the endpoint is called with correct payloads
     call_args_list = mock_deploy_client.return_value.predict.call_args_list
    expected_calls = [
         mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
         mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}),
     ]
     assert all(call in call_args_list for call in expected_calls)
+
+    # Validate the evaluation metrics
     expected_metrics_subset = {
         "toxicity/v1/ratio",
"ari_grade_level/v1/mean", @@ -1785,6 +1778,10 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f } assert expected_metrics_subset.issubset(set(eval_result.metrics.keys())) + # Validate the model output is passed to the evaluator in the correct format (string) + eval_results_table = eval_result.tables["eval_results_table"] + assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2)) + @pytest.mark.parametrize( ("input_data", "error_message"),