Skip to content

Commit

Permalink
update docs
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Jan 12, 2024
1 parent ffee6e0 commit da71bf0
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 23 deletions.
1 change: 1 addition & 0 deletions docs/flow/modin/experimental/pandas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Experimental API Reference
.. autofunction:: read_custom_text
.. autofunction:: read_pickle_distributed
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_distributed
.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob
2 changes: 2 additions & 0 deletions docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,8 @@ default to pandas.
| | | | ``path`` parameter specifies a directory where one |
| | | | file is written per row partition of the Modin |
| | | | dataframe. |
| | | | Experimental implementation: |
| | | | DataFrame.modin.to_parquet_glob |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``to_period`` | `to_period`_ | D | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
6 changes: 4 additions & 2 deletions docs/usage_guide/advanced_usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ Modin also supports these experimental APIs on top of pandas that are under acti
- :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection
- :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file
- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple pickle files in a directory
- :py:func:`~modin.experimental.pandas.read_parquet_glob` -- read multiple parquet files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple pickle files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_parquet_glob` -- write to multiple parquet files in a directory

DataFrame partitioning API
--------------------------
Expand Down
4 changes: 2 additions & 2 deletions modin/core/execution/dispatching/factories/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,12 +297,12 @@ def to_pickle_distributed(cls, *args, **kwargs):
return cls.get_factory()._to_pickle_distributed(*args, **kwargs)

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
@_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
def read_parquet_glob(cls, *args, **kwargs):
return cls.get_factory()._read_parquet_glob(*args, **kwargs)

Check warning on line 302 in modin/core/execution/dispatching/factories/dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/dispatcher.py#L302

Added line #L302 was not covered by tests

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
@_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
def to_parquet_glob(cls, *args, **kwargs):
return cls.get_factory()._to_parquet_glob(*args, **kwargs)

Check warning on line 307 in modin/core/execution/dispatching/factories/dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/dispatcher.py#L307

Added line #L307 was not covered by tests

Expand Down
23 changes: 17 additions & 6 deletions modin/core/execution/dispatching/factories/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,20 +517,31 @@ def _to_pickle_distributed(cls, *args, **kwargs):
return cls.io_cls.to_pickle_distributed(*args, **kwargs)

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
def _read_parquet_glob(cls, *args, **kwargs):
# TODO: add docstring
@doc(
_doc_io_method_raw_template,
source="Parquet files",
params=_doc_io_method_kwargs_params,
)
def _read_parquet_glob(cls, **kwargs):
current_execution = get_current_execution()
if current_execution not in supported_executions:

Check warning on line 527 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L526-L527

Added lines #L526 - L527 were not covered by tests
raise NotImplementedError(
f"`_read_parquet_glob()` is not implemented for {current_execution} execution."
)
return cls.io_cls.read_parquet_glob(*args, **kwargs)
return cls.io_cls.read_parquet_glob(**kwargs)

Check warning on line 531 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L531

Added line #L531 was not covered by tests

@classmethod
# @_inherit_docstrings(factories.PandasOnRayFactory._to_parquet_glob)
def _to_parquet_glob(cls, *args, **kwargs):
# TODO: add docstring
"""
Write query compiler content to several parquet files.
Parameters
----------
*args : args
Arguments to pass to the writer method.
**kwargs : kwargs
Arguments to pass to the writer method.
"""
current_execution = get_current_execution()
if current_execution not in supported_executions:

Check warning on line 546 in modin/core/execution/dispatching/factories/factories.py

View check run for this annotation

Codecov / codecov/patch

modin/core/execution/dispatching/factories/factories.py#L545-L546

Added lines #L545 - L546 were not covered by tests
raise NotImplementedError(
Expand Down
3 changes: 2 additions & 1 deletion modin/experimental/core/io/glob/glob_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def write(cls, qc, **kwargs):
cls.base_write(qc, filepath_or_buffer, **kwargs)

Check warning on line 122 in modin/experimental/core/io/glob/glob_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/core/io/glob/glob_dispatcher.py#L122

Added line #L122 was not covered by tests
return

# just to try
# Be careful, this is a kind of limitation, but at the time of the first implementation,
# getting a name in this way is quite convenient.
write_func_name = cls.base_write.__name__

Check warning on line 127 in modin/experimental/core/io/glob/glob_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/core/io/glob/glob_dispatcher.py#L127

Added line #L127 was not covered by tests

def func(df, **kw): # pragma: no cover
Expand Down
39 changes: 32 additions & 7 deletions modin/experimental/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def to_pickle_distributed(
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
):
) -> None:
"""
Pickle (serialize) object to file.
Expand All @@ -363,7 +363,7 @@ def to_pickle_distributed(
Parameters
----------
filepath_or_buffer : str, path object or file-like object
filepath_or_buffer : str
File path where the pickled object will be stored.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer'
A string representing the compression to use in the output file. By
Expand Down Expand Up @@ -412,8 +412,23 @@ def read_parquet_glob(
filesystem=None,
filters=None,
**kwargs,
):
# TODO: add docstring
) -> DataFrame: # noqa: PR01
"""
Load a parquet object from the file path, returning a DataFrame.
This experimental feature provides parallel reading from multiple parquet files which are
defined by glob pattern. The files must contain parts of one dataframe, which can be
obtained, for example, by `DataFrame.modin.to_parquet_glob` function.
Returns
-------
DataFrame
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.read_parquet`.
"""
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

Check warning on line 432 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L432

Added line #L432 was not covered by tests

return DataFrame(

Check warning on line 434 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L434

Added line #L434 was not covered by tests
Expand All @@ -434,15 +449,25 @@ def read_parquet_glob(
@expanduser_path_arg("path")
def to_parquet_glob(

Check warning on line 450 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L449-L450

Added lines #L449 - L450 were not covered by tests
self,
path=None,
path,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
storage_options: StorageOptions = None,
**kwargs,
):
# TODO: add docstring
) -> None:
"""
Write a DataFrame to the binary parquet format.
This experimental feature provides parallel writing into multiple parquet files which are
defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used.
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`.
"""
obj = self
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

Check warning on line 472 in modin/experimental/pandas/io.py

View check run for this annotation

Codecov / codecov/patch

modin/experimental/pandas/io.py#L471-L472

Added lines #L471 - L472 were not covered by tests

Expand Down
30 changes: 25 additions & 5 deletions modin/pandas/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def to_pickle_distributed(
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
):
) -> None:
"""
Pickle (serialize) object to file.
Expand All @@ -224,7 +224,7 @@ def to_pickle_distributed(
Parameters
----------
filepath_or_buffer : str, path object or file-like object
filepath_or_buffer : str
File path where the pickled object will be stored.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default: 'infer'
A string representing the compression to use in the output file. By
Expand Down Expand Up @@ -260,17 +260,37 @@ def to_pickle_distributed(

def to_parquet_glob(
self,
path=None,
path,
engine="auto",
compression="snappy",
index=None,
partition_cols=None,
storage_options: StorageOptions = None,
**kwargs,
):
# TODO: add docstring
) -> None: # noqa: PR01
"""
Write a DataFrame to the binary parquet format.
This experimental feature provides parallel writing into multiple parquet files which are
defined by glob pattern, otherwise (without glob pattern) default pandas implementation is used.
Notes
-----
* Only string type supported for `path` argument.
* The rest of the arguments are the same as for `pandas.DataFrame.to_parquet`.
"""
from modin.experimental.pandas.io import to_parquet_glob

Check warning on line 287 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L287

Added line #L287 was not covered by tests

if path is None:

Check warning on line 289 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L289

Added line #L289 was not covered by tests
raise NotImplementedError(
"`to_parquet_glob` doesn't support path=None, use `to_parquet` in that case."
)

to_parquet_glob(

Check warning on line 294 in modin/pandas/accessor.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/accessor.py#L294

Added line #L294 was not covered by tests
self._data,
path=path,
Expand Down

0 comments on commit da71bf0

Please sign in to comment.