Skip to content

Commit

Permalink
update docs
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Jan 12, 2024
1 parent ffee6e0 commit da71bf0
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/flow/modin/experimental/pandas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Experimental API Reference
.. autofunction:: read_custom_text
.. autofunction:: read_pickle_distributed
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_distributed
.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob
2 changes: 2 additions & 0 deletions docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,8 @@ default to pandas.
| | | | ``path`` parameter specifies a directory where one |
| | | | file is written per row partition of the Modin |
| | | | dataframe. |
| | | | Experimental implementation: |
| | | | DataFrame.modin.to_parquet_glob |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``to_period`` | `to_period`_ | D | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
6 changes: 4 additions & 2 deletions docs/usage_guide/advanced_usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ Modin also supports these experimental APIs on top of pandas that are under acti
- :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection
- :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file
- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple pickle files in a directory
- :py:func:`~modin.experimental.pandas.read_parquet_glob` -- read multiple parquet files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple pickle files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_parquet_glob` -- write to multiple parquet files in a directory

DataFrame partitioning API
--------------------------
Expand Down
4 changes: 2 additions & 2 deletions modin/core/execution/dispatching/factories/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,12 @@ def to_pickle_distributed(cls, *args, **kwargs):
return cls.get_factory()._to_pickle_distributed(*args, **kwargs)

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
@_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
def read_parquet_glob(cls, *args, **kwargs):
return cls.get_factory()._read_parquet_glob(*args, **kwargs)

Check warning on line 302 in modin/core/execution/dispatching/factories/dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/dispatcher.py#L302

Added line #L302 was not covered by tests

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
@_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
def to_parquet_glob(cls, *args, **kwargs):
return cls.get_factory()._to_parquet_glob(*args, **kwargs)

Check warning on line 307 in modin/core/execution/dispatching/factories/dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/dispatcher.py#L307

Added line #L307 was not covered by tests

Expand Down
23 changes: 17 additions & 6 deletions modin/core/execution/dispatching/factories/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,20 +517,31 @@ def _to_pickle_distributed(cls, *args, **kwargs):
return cls.io_cls.to_pickle_distributed(*args, **kwargs)

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
def _read_parquet_glob(cls, *args, **kwargs):
# TODO: add docstring
@doc(
_doc_io_method_raw_template,
source="Parquet files",
params=_doc_io_method_kwargs_params,
)
def _read_parquet_glob(cls, **kwargs):
current_execution = get_current_execution()
if current_execution not in supported_executions:

Check warning on line 527 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L526-L527

Added lines #L526 - L527 were not covered by tests
raise NotImplementedError(
f"`_read_parquet_glob()` is not implemented for {current_execution} execution."
)
return cls.io_cls.read_parquet_glob(*args, **kwargs)
return cls.io_cls.read_parquet_glob(**kwargs)

Check warning on line 531 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L531

Added line #L531 was not covered by tests

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
def _to_parquet_glob(cls, *args, **kwargs):
# TODO: add docstring
"""
Write query compiler content to several parquet files.
Parameters
----------
*args : args
Arguments to pass to the writer method.
**kwargs : kwargs
Arguments to pass to the writer method.
"""
current_execution = get_current_execution()
if current_execution not in supported_executions:

Check warning on line 546 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L545-L546

Added lines #L545 - L546 were not covered by tests
raise NotImplementedError(
Expand Down
3 changes: 2 additions & 1 deletion modin/experimental/core/io/glob/glob_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def write(cls, qc, **kwargs):
cls.base_write(qc, filepath_or_buffer, **kwargs)

Check warning on line 122 in modin/experimental/core/io/glob/glob_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/core/io/glob/glob_dispatcher.py#L122

Added line #L122 was not covered by tests
return

# just to try
# Be careful, this is a kind of limitation, but at the time of the first implementation,
# getting a name in this way is quite convenient.
write_func_name = cls.base_write.__name__

Check warning on line 127 in modin/experimental/core/io/glob/glob_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/core/io/glob/glob_dispatcher.py#L127

Added line #L127 was not covered by tests

def func(df, **kw): # pragma: no cover
Expand Down
39 changes: 32 additions & 7 deletions modin/experimental/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def to_pickle_distributed(
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
):
) -> None:
"""
Pickle (serialize) object to file.
Expand All @@ -363,7 +363,7 @@ def to_pickle_distributed(
Parameters
----------
filepath_or_buffer : str, path object or file-like object
filepath_or_buffer : str
File path where the pickled object will be stored.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer'
A string representing the compression to use in the output file. By
Expand Down Expand Up @@ -412,8 +412,23 @@ def read_parquet_glob(
filesystem=None,
filters=None,
**kwargs,
):
# TODO: add docstring
) -> DataFrame: # noqa: PR01
"""
Load a parquet object from the file path, returning a DataFrame.
This experimental feature provides parallel reading from multiple parquet files which are
defined by glob pattern. The files must contain parts of one dataframe, which can be
obtained, for example, by `DataFrame.modin.to_parquet_glob` function.
Returns
-------
DataFrame
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.read_parquet`.
"""
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

Check warning on line 432 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L432

Added line #L432 was not covered by tests

return DataFrame(

Check warning on line 434 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L434

Added line #L434 was not covered by tests
Expand All @@ -434,15 +449,25 @@ def read_parquet_glob(
@expanduser_path_arg("path")
def to_parquet_glob(

Check warning on line 450 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L449-L450

Added lines #L449 - L450 were not covered by tests
self,
path=None,
path,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
storage_options: StorageOptions = None,
**kwargs,
):
# TODO: add docstring
) -> None:
"""
Write a DataFrame to the binary parquet format.
This experimental feature provides parallel writing into multiple parquet files which are
defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used.
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`.
"""
obj = self
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

Check warning on line 472 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L471-L472

Added lines #L471 - L472 were not covered by tests

Expand Down
30 changes: 25 additions & 5 deletions modin/pandas/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def to_pickle_distributed(
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
):
) -> None:
"""
Pickle (serialize) object to file.
Expand All @@ -224,7 +224,7 @@ def to_pickle_distributed(
Parameters
----------
filepath_or_buffer : str, path object or file-like object
filepath_or_buffer : str
File path where the pickled object will be stored.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer'
A string representing the compression to use in the output file. By
Expand Down Expand Up @@ -260,17 +260,37 @@ def to_pickle_distributed(

def to_parquet_glob(
self,
path=None,
path,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
storage_options: StorageOptions = None,
**kwargs,
):
# TODO: add docstring
) -> None: # noqa: PR01
"""
Write a DataFrame to the binary parquet format.
This experimental feature provides parallel writing into multiple parquet files which are
defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used.
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`.
"""
from modin.experimental.pandas.io import to_parquet_glob

Check warning on line 287 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L287

Added line #L287 was not covered by tests

if path is None:

Check warning on line 289 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L289

Added line #L289 was not covered by tests
raise NotImplementedError(
"`to_parquet_glob` doesn't support path=None, use `to_parquet` in that case."
)

to_parquet_glob(

Check warning on line 294 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L294

Added line #L294 was not covered by tests
self._data,
path=path,
Expand Down

0 comments on commit da71bf0

Please sign in to comment.