Google docstring conversion batch 7 #10888

Merged
2 changes: 1 addition & 1 deletion mlflow/langchain/__init__.py
@@ -319,7 +319,7 @@ def log_model(
Args:
lc_model: A LangChain model, which could be a
`Chain <https://python.langchain.com/docs/modules/chains/>`_,
`Agent <https://python.langchain.com/docs/modules/agents/>`, or
`Agent <https://python.langchain.com/docs/modules/agents/>`_, or
`retriever <https://python.langchain.com/docs/modules/data_connection/retrievers/>`_.
artifact_path: Run-relative artifact path.
conda_env: {{ conda_env }}
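For context on the API touched above, a minimal sketch of logging a LangChain chain with ``mlflow.langchain.log_model`` follows; the toy chain, the ``FakeListLLM`` stand-in, and the import paths are illustrative assumptions that vary across LangChain versions.

```python
# Minimal sketch (not part of this PR): logging a LangChain chain with
# mlflow.langchain.log_model. The chain below is a toy; FakeListLLM and the
# import paths are illustrative and differ across LangChain versions.
import mlflow
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import FakeListLLM

llm = FakeListLLM(responses=["MLflow is an open source MLOps platform."])
prompt = PromptTemplate(input_variables=["topic"], template="Tell me about {topic}.")
chain = LLMChain(llm=llm, prompt=prompt)

with mlflow.start_run():
    # artifact_path is the run-relative artifact path described in the docstring.
    model_info = mlflow.langchain.log_model(lc_model=chain, artifact_path="langchain_model")
```

The returned ``model_info.model_uri`` can then be passed to ``mlflow.langchain.load_model`` or ``mlflow.pyfunc.load_model``.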
20 changes: 13 additions & 7 deletions mlflow/langchain/retriever_chain.py
@@ -30,7 +30,8 @@ class _RetrieverChain(Chain):

See :ref:`log-retriever-chain` for how to log the ``_RetrieverChain``.

:param retriever: The retriever to wrap.
Args:
retriever: The retriever to wrap.
"""

input_key: str = "query"
@@ -65,10 +66,12 @@ def _call(
"""Run _get_docs on input query.
Returns the retrieved documents under the key 'source_documents'.
Example:

.. code-block:: python
chain = _RetrieverChain(retriever=...)
res = chain({'query': 'This is my query'})
docs = res['source_documents']

chain = _RetrieverChain(retriever=...)
res = chain({"query": "This is my query"})
docs = res["source_documents"]
"""
question = inputs[self.input_key]
docs = self._get_docs(question)
@@ -86,11 +89,14 @@ async def _acall(
) -> Dict[str, Any]:
"""Run _get_docs on input query.
Returns the retrieved documents under the key 'source_documents'.

Example:

.. code-block:: python
chain = _RetrieverChain(retriever=...)
res = chain({'query': 'This is my query'})
docs = res['source_documents']

chain = _RetrieverChain(retriever=...)
res = chain({"query": "This is my query"})
docs = res["source_documents"]
"""
question = inputs[self.input_key]
docs = await self._aget_docs(question)
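To make the ``retriever=...`` placeholder in the examples above concrete, here is a hedged sketch using an in-memory FAISS retriever; ``FakeEmbeddings``, the sample text, and the import paths are illustrative assumptions.

```python
# Minimal sketch (not part of this PR): the docstring's `retriever=...` filled in
# with an in-memory FAISS retriever. FakeEmbeddings, the sample text, and the
# import paths are illustrative and vary across LangChain versions.
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS

from mlflow.langchain.retriever_chain import _RetrieverChain

db = FAISS.from_texts(["MLflow tracks experiments and models."], FakeEmbeddings(size=16))
chain = _RetrieverChain(retriever=db.as_retriever())
res = chain({"query": "What does MLflow track?"})
docs = res["source_documents"]  # the retrieved documents, as described above
```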
2 changes: 1 addition & 1 deletion mlflow/langchain/runnables.py
@@ -297,7 +297,7 @@ def _save_runnable_with_steps(steps, file_path: Union[Path, str], loader_fn=None

def _save_runnable_branch(model, file_path, loader_fn, persist_dir):
"""
save runnable branch in to path.
Save runnable branch into the given path.
"""
save_path = Path(file_path) if isinstance(file_path, str) else file_path
save_path.mkdir(parents=True, exist_ok=True)
234 changes: 125 additions & 109 deletions mlflow/lightgbm/__init__.py

Large diffs are not rendered by default.

300 changes: 155 additions & 145 deletions mlflow/onnx/__init__.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions mlflow/pmdarima/__init__.py
@@ -415,11 +415,13 @@ def load_model(model_uri, dst_path=None):

Args:
model_uri: The location, in URI format, of the MLflow model. For example:

- ``/Users/me/path/to/local/model``
- ``relative/path/to/local/model``
- ``s3://my_bucket/path/to/model``
- ``runs:/<mlflow_run_id>/run-relative/path/to/model``
- ``mlflow-artifacts:/path/to/model``

For more information about supported URI schemes, see
`Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
artifact-locations>`_.
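A minimal usage sketch of the ``load_model`` API documented above; the run ID stays a placeholder and the 30-period horizon is arbitrary.

```python
# Minimal sketch (not part of this PR): loading a logged pmdarima model by URI
# and forecasting with pmdarima's native predict API. The run ID is a placeholder.
import mlflow.pmdarima

model = mlflow.pmdarima.load_model("runs:/<mlflow_run_id>/model")
forecast = model.predict(n_periods=30)  # next 30 periods beyond the training data
```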
2 changes: 2 additions & 0 deletions mlflow/prophet/__init__.py
@@ -321,10 +321,12 @@ def load_model(model_uri, dst_path=None):

Args:
model_uri: The location, in URI format, of the MLflow model. For example:

- ``/Users/me/path/to/local/model``
- ``relative/path/to/local/model``
- ``s3://my_bucket/path/to/model``
- ``runs:/<mlflow_run_id>/run-relative/path/to/model``

For more information about supported URI schemes, see
`Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
artifact-locations>`_.
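Likewise for Prophet, a minimal sketch of loading the model by URI and forecasting with Prophet's own API; the run ID remains a placeholder.

```python
# Minimal sketch (not part of this PR): loading a logged Prophet model by URI and
# forecasting with Prophet's native API. The run ID is a placeholder.
import mlflow.prophet

model = mlflow.prophet.load_model("runs:/<mlflow_run_id>/model")
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)
```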
105 changes: 54 additions & 51 deletions mlflow/pyspark/ml/__init__.py
@@ -184,8 +184,9 @@ def _should_log_model(spark_model):

def _get_estimator_info_tags(estimator):
"""
:return: A dictionary of MLflow run tag keys and values
describing the specified estimator.
Returns:
A dictionary of MLflow run tag keys and values
describing the specified estimator.
"""
return {
"estimator_name": estimator.__class__.__name__,
@@ -870,55 +871,57 @@ def autolog(
.. _TrainValidationSplit:
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.tuning.TrainValidationSplit.html#pyspark.ml.tuning.TrainValidationSplit

:param log_models: If ``True``, if trained models are in allowlist, they are logged as MLflow
model artifacts. If ``False``, trained models are not logged.
Note: the built-in allowlist excludes some models (e.g. ALS models) which
can be large. To specify a custom allowlist, create a file containing a
newline-delimited list of fully-qualified estimator classnames, and set
the "spark.mlflow.pysparkml.autolog.logModelAllowlistFile" Spark config
to the path of your allowlist file.
:param log_datasets: If ``True``, dataset information is logged to MLflow Tracking.
If ``False``, dataset information is not logged.
:param disable: If ``True``, disables the scikit-learn autologging integration. If ``False``,
enables the pyspark ML autologging integration.
:param exclusive: If ``True``, autologged content is not logged to user-created fluent runs.
If ``False``, autologged content is logged to the active fluent run,
which may be user-created.
:param disable_for_unsupported_versions: If ``True``, disable autologging for versions of
pyspark that have not been tested against this version of the MLflow
client or are incompatible.
:param silent: If ``True``, suppress all event logs and warnings from MLflow during pyspark ML
autologging. If ``False``, show all events and warnings during pyspark ML
autologging.
:param log_post_training_metrics: If ``True``, post training metrics are logged. Defaults to
``True``. See the `post training metrics`_ section for more
details.
:param registered_model_name: If given, each time a model is trained, it is registered as a
new model version of the registered model with this name.
The registered model is created if it does not already exist.
:param log_input_examples: If ``True``, input examples from training datasets are collected and
logged along with pyspark ml model artifacts during training. If
``False``, input examples are not logged.
:param log_model_signatures: If ``True``,
:py:class:`ModelSignatures <mlflow.models.ModelSignature>`
describing model inputs and outputs are collected and logged along
with spark ml pipeline/estimator artifacts during training.
If ``False`` signatures are not logged.

.. warning::

Currently, only scalar Spark data types are supported. If
model inputs/outputs contain non-scalar Spark data types such
as ``pyspark.ml.linalg.Vector``, signatures are not logged.
:param log_model_allowlist: If given, it overrides the default log model allowlist in mlflow.
This takes precedence over the spark configuration of
"spark.mlflow.pysparkml.autolog.logModelAllowlistFile".

**The default log model allowlist in mlflow**
.. literalinclude:: ../../../mlflow/pyspark/ml/log_model_allowlist.txt
:language: text

:param extra_tags: A dictionary of extra tags to set on each managed run created by autologging.
Args:
log_models: If ``True``, trained models that are in the allowlist are logged as MLflow
model artifacts. If ``False``, trained models are not logged.
Note: the built-in allowlist excludes some models (e.g. ALS models) which
can be large. To specify a custom allowlist, create a file containing a
newline-delimited list of fully-qualified estimator classnames, and set
the "spark.mlflow.pysparkml.autolog.logModelAllowlistFile" Spark config
to the path of your allowlist file.
log_datasets: If ``True``, dataset information is logged to MLflow Tracking.
If ``False``, dataset information is not logged.
disable: If ``True``, disables the pyspark ML autologging integration. If ``False``,
enables the pyspark ML autologging integration.
exclusive: If ``True``, autologged content is not logged to user-created fluent runs.
If ``False``, autologged content is logged to the active fluent run,
which may be user-created.
disable_for_unsupported_versions: If ``True``, disable autologging for versions of
pyspark that have not been tested against this version of the MLflow
client or are incompatible.
silent: If ``True``, suppress all event logs and warnings from MLflow during pyspark ML
autologging. If ``False``, show all events and warnings during pyspark ML
autologging.
log_post_training_metrics: If ``True``, post training metrics are logged. Defaults to
``True``. See the `post training metrics`_ section for more
details.
registered_model_name: If given, each time a model is trained, it is registered as a
new model version of the registered model with this name.
The registered model is created if it does not already exist.
log_input_examples: If ``True``, input examples from training datasets are collected and
logged along with pyspark ml model artifacts during training. If
``False``, input examples are not logged.
log_model_signatures: If ``True``,
:py:class:`ModelSignatures <mlflow.models.ModelSignature>`
describing model inputs and outputs are collected and logged along
with spark ml pipeline/estimator artifacts during training.
If ``False`` signatures are not logged.

.. warning::

Currently, only scalar Spark data types are supported. If
model inputs/outputs contain non-scalar Spark data types such
as ``pyspark.ml.linalg.Vector``, signatures are not logged.

log_model_allowlist: If given, it overrides the default log model allowlist in mlflow.
This takes precedence over the spark configuration of
"spark.mlflow.pysparkml.autolog.logModelAllowlistFile".

**The default log model allowlist in mlflow**
.. literalinclude:: ../../../mlflow/pyspark/ml/log_model_allowlist.txt
:language: text

extra_tags: A dictionary of extra tags to set on each managed run created by autologging.
"""
from pyspark.ml.base import Estimator, Model
from pyspark.ml.evaluation import Evaluator
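A minimal sketch of enabling pyspark ML autologging with a few of the parameters documented above; the values shown are illustrative, not recommendations.

```python
# Minimal sketch (not part of this PR): turning on pyspark ML autologging with a
# few of the parameters documented above. Values are illustrative.
import mlflow.pyspark.ml

mlflow.pyspark.ml.autolog(
    log_models=True,
    log_datasets=True,
    log_input_examples=False,
    log_model_signatures=True,
    log_post_training_metrics=True,
)
# After this call, fitting an allowlisted estimator (e.g. pipeline.fit(train_df))
# automatically logs params, metrics, and the model to the active MLflow run.
```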
37 changes: 24 additions & 13 deletions mlflow/pyspark/ml/_autolog.py
@@ -20,8 +20,11 @@ def cast_spark_df_with_vector_to_array(input_spark_df):
Finds columns of vector type in a spark dataframe and
casts them to array<double> type.

:param input_spark_df:
:return: a spark dataframe with vector columns transformed to array<double> type
Args:
input_spark_df:

Returns:
A spark dataframe with vector columns transformed to array<double> type
"""
vector_type_columns = [
_field.name for _field in input_spark_df.schema if isinstance(_field.dataType, VectorUDT)
@@ -37,20 +40,25 @@ def _do_pipeline_transform(df: DataFrame, transformer: Union[Transformer, Pipeli
"""
A util method that runs transform on a pipeline model/transformer

:param df:a spark dataframe
:return: output transformed dataframe using pipeline model/transformer
Args:
df: a spark dataframe

Returns:
output transformed dataframe using pipeline model/transformer
"""
return transformer.transform(df)


def _get_struct_type_by_cols(input_fields: Set[str], df_schema: t.StructType) -> t.StructType:
"""
Args:
input_fields: A set of input columns to be
intersected with the input dataset's columns.
df_schema: A Spark dataframe schema to compare input_fields

:param input_fields: A set of input columns to be
intersected with the input dataset's columns.
:param df_schema: A Spark dataframe schema to compare input_fields
:return:A StructType from the intersection of given columns and
the columns present in the training dataset
Returns:
A StructType from the intersection of given columns and
the columns present in the training dataset
"""
if len(input_fields) > 0:
return t.StructType([_field for _field in df_schema.fields if _field.name in input_fields])
@@ -67,10 +75,13 @@ def get_feature_cols(
if `input_fields` is set to include non-feature columns those
will be included in the return set of column names.

:param df: An input spark dataframe.
:param transformer: A pipeline/transformer to get the required feature columns
:return: A set of all the feature columns that are required
for the pipeline/transformer plus any initial columns passed in.
Args:
df: An input spark dataframe.
transformer: A pipeline/transformer to get the required feature columns

Returns:
A set of all the feature columns that are required
for the pipeline/transformer plus any initial columns passed in.
"""
feature_cols = set()
df_subset = df.limit(1).cache()
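As background for the vector-to-array helper documented at the top of this file, here is a hedged sketch of the equivalent cast written directly with ``pyspark.ml.functions.vector_to_array`` (Spark 3.0+); the Spark session, data, and column names are illustrative.

```python
# Minimal sketch (not part of this PR): the cast that
# cast_spark_df_with_vector_to_array performs, written directly with
# pyspark.ml.functions.vector_to_array (Spark 3.0+). Data and names are illustrative.
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]), 0.0)], ["features", "label"])

# Cast the vector-typed column to array<double>; non-vector columns are untouched.
df_cast = df.withColumn("features", vector_to_array(col("features")))
df_cast.printSchema()  # features: array<double>, label: double
```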