REFACTOR-#6856: Rename read_pickle_distributed/to_pickle_distributed to read_pickle_glob/to_pickle_glob #6957

Merged
1 commit merged on Feb 23, 2024
4 changes: 2 additions & 2 deletions docs/flow/modin/experimental/pandas.rst
@@ -12,11 +12,11 @@ Experimental API Reference
.. autofunction:: read_sql
.. autofunction:: read_csv_glob
.. autofunction:: read_custom_text
.. autofunction:: read_pickle_distributed
.. autofunction:: read_pickle_glob
.. autofunction:: read_parquet_glob
.. autofunction:: read_json_glob
.. autofunction:: read_xml_glob
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_distributed
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_glob
.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob
.. automethod:: modin.pandas.DataFrame.modin::to_json_glob
.. automethod:: modin.pandas.DataFrame.modin::to_xml_glob
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
@@ -433,7 +433,7 @@ default to pandas.
| ``to_period`` | `to_period`_ | D | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``to_pickle`` | `to_pickle`_ | D | Experimental implementation: |
| | | | DataFrame.modin.to_pickle_distributed |
| | | | DataFrame.modin.to_pickle_glob |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``to_records`` | `to_records`_ | D | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
2 changes: 1 addition & 1 deletion docs/supported_apis/io_supported.rst
@@ -68,7 +68,7 @@ default to pandas.
| `read_sas`_ | D | |
+-------------------+---------------------------------+--------------------------------------------------------+
| `read_pickle`_ | D | Experimental implementation: |
| | | read_pickle_distributed |
| | | read_pickle_glob |
+-------------------+---------------------------------+--------------------------------------------------------+
| `read_sql`_ | Y | |
+-------------------+---------------------------------+--------------------------------------------------------+
4 changes: 2 additions & 2 deletions docs/usage_guide/advanced_usage/index.rst
@@ -41,11 +41,11 @@ Modin also supports these experimental APIs on top of pandas that are under acti
- :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection
- :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file
- :py:func:`~modin.experimental.pandas.read_pickle_distributed` -- read multiple pickle files in a directory
- :py:func:`~modin.experimental.pandas.read_pickle_glob` -- read multiple pickle files in a directory
- :py:func:`~modin.experimental.pandas.read_parquet_glob` -- read multiple parquet files in a directory
- :py:func:`~modin.experimental.pandas.read_json_glob` -- read multiple json files in a directory
- :py:func:`~modin.experimental.pandas.read_xml_glob` -- read multiple xml files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_distributed` -- write to multiple pickle files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_pickle_glob` -- write to multiple pickle files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_parquet_glob` -- write to multiple parquet files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_json_glob` -- write to multiple json files in a directory
- :py:meth:`~modin.pandas.DataFrame.modin.to_xml_glob` -- write to multiple xml files in a directory
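For context, a minimal round-trip with the renamed pickle glob APIs might look like the sketch below. The output directory and column names are illustrative, and a configured execution engine (e.g. Ray) is assumed.

import os

import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(8), "b": range(8)})

out_dir = "/tmp/pickle_glob_demo"  # illustrative location
os.makedirs(out_dir, exist_ok=True)

# Write the DataFrame to multiple pickle files matching the glob pattern.
df.modin.to_pickle_glob(os.path.join(out_dir, "part*.pkl"))

# Read the files back into a single DataFrame.
restored = pd.read_pickle_glob(os.path.join(out_dir, "part*.pkl"))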
@@ -111,10 +111,10 @@ def __make_write(*classes, build_args=build_args):
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": BaseIO.to_xml},
)
read_pickle_distributed = __make_read(
read_pickle_glob = __make_read(
ExperimentalPandasPickleParser, ExperimentalGlobDispatcher
)
to_pickle_distributed = __make_write(
to_pickle_glob = __make_write(
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": BaseIO.to_pickle},
)
12 changes: 6 additions & 6 deletions modin/core/execution/dispatching/factories/dispatcher.py
@@ -197,9 +197,9 @@ def read_csv_glob(cls, **kwargs):
return cls.get_factory()._read_csv_glob(**kwargs)

@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._read_pickle_distributed)
def read_pickle_distributed(cls, **kwargs):
return cls.get_factory()._read_pickle_distributed(**kwargs)
@_inherit_docstrings(factories.PandasOnRayFactory._read_pickle_glob)
def read_pickle_glob(cls, **kwargs):
return cls.get_factory()._read_pickle_glob(**kwargs)

@classmethod
@_inherit_docstrings(factories.BaseFactory._read_json)
@@ -292,9 +292,9 @@ def to_pickle(cls, *args, **kwargs):
return cls.get_factory()._to_pickle(*args, **kwargs)

@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._to_pickle_distributed)
def to_pickle_distributed(cls, *args, **kwargs):
return cls.get_factory()._to_pickle_distributed(*args, **kwargs)
@_inherit_docstrings(factories.PandasOnRayFactory._to_pickle_glob)
def to_pickle_glob(cls, *args, **kwargs):
return cls.get_factory()._to_pickle_glob(*args, **kwargs)

@classmethod
@_inherit_docstrings(factories.PandasOnRayFactory._read_parquet_glob)
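The dispatcher changes above follow Modin's factory-dispatch pattern: a stable classmethod name on the dispatcher forwards to whichever factory backs the configured execution. A simplified sketch of that pattern — with hypothetical stand-in classes, not Modin's actual FactoryDispatcher or factories:

class _PickleGlobFactory:
    @classmethod
    def _read_pickle_glob(cls, **kwargs):
        # The real factory delegates to an engine-specific IO class here.
        return f"read with {kwargs}"


class _Dispatcher:
    @classmethod
    def get_factory(cls):
        # Modin selects the factory from the configured engine; hard-coded here.
        return _PickleGlobFactory

    @classmethod
    def read_pickle_glob(cls, **kwargs):
        # Stable public name that simply forwards to the active factory.
        return cls.get_factory()._read_pickle_glob(**kwargs)


print(_Dispatcher.read_pickle_glob(filepath_or_buffer="part*.pkl"))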
12 changes: 6 additions & 6 deletions modin/core/execution/dispatching/factories/factories.py
@@ -476,13 +476,13 @@ def _read_csv_glob(cls, **kwargs):
source="Pickle files",
params=_doc_io_method_kwargs_params,
)
def _read_pickle_distributed(cls, **kwargs):
def _read_pickle_glob(cls, **kwargs):
current_execution = get_current_execution()
if current_execution not in supported_executions:
raise NotImplementedError(
f"`_read_pickle_distributed()` is not implemented for {current_execution} execution."
f"`_read_pickle_glob()` is not implemented for {current_execution} execution."
)
return cls.io_cls.read_pickle_distributed(**kwargs)
return cls.io_cls.read_pickle_glob(**kwargs)

@classmethod
@doc(
@@ -526,7 +526,7 @@ def _read_custom_text(cls, **kwargs):
return cls.io_cls.read_custom_text(**kwargs)

@classmethod
def _to_pickle_distributed(cls, *args, **kwargs):
def _to_pickle_glob(cls, *args, **kwargs):
"""
Distributed pickle query compiler object.

@@ -540,9 +540,9 @@ def _to_pickle_distributed(cls, *args, **kwargs):
current_execution = get_current_execution()
if current_execution not in supported_executions:
raise NotImplementedError(
f"`_to_pickle_distributed()` is not implemented for {current_execution} execution."
f"`_to_pickle_glob()` is not implemented for {current_execution} execution."
)
return cls.io_cls.to_pickle_distributed(*args, **kwargs)
return cls.io_cls.to_pickle_glob(*args, **kwargs)

@classmethod
@doc(
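The renamed factory methods keep the existing guard: on an execution outside `supported_executions`, the glob entry points raise instead of silently defaulting to pandas. A hypothetical call under an unsupported engine would surface roughly as follows (the path is illustrative):

import modin.experimental.pandas as pd

try:
    df = pd.read_pickle_glob("/tmp/pickle_glob_demo/part*.pkl")
except NotImplementedError as err:
    # Per the factory above, the message reads:
    # `_read_pickle_glob()` is not implemented for <current execution> execution.
    print(err)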
@@ -113,10 +113,10 @@ def __make_write(*classes, build_args=build_args):
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": RayIO.to_xml},
)
read_pickle_distributed = __make_read(
read_pickle_glob = __make_read(
ExperimentalPandasPickleParser, ExperimentalGlobDispatcher
)
to_pickle_distributed = __make_write(
to_pickle_glob = __make_write(
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": RayIO.to_pickle},
)
@@ -113,10 +113,10 @@ def __make_write(*classes, build_args=build_args):
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": UnidistIO.to_xml},
)
read_pickle_distributed = __make_read(
read_pickle_glob = __make_read(
ExperimentalPandasPickleParser, ExperimentalGlobDispatcher
)
to_pickle_distributed = __make_write(
to_pickle_glob = __make_write(
ExperimentalGlobDispatcher,
build_args={**build_args, "base_write": UnidistIO.to_pickle},
)
10 changes: 9 additions & 1 deletion modin/experimental/pandas/__init__.py
@@ -33,13 +33,21 @@
"""

from modin.pandas import * # noqa F401, F403
from modin.utils import func_from_deprecated_location

from .io import ( # noqa F401
read_csv_glob,
read_custom_text,
read_json_glob,
read_parquet_glob,
read_pickle_distributed,
read_pickle_glob,
read_sql,
read_xml_glob,
)

read_pickle_distributed = func_from_deprecated_location(
"read_pickle_glob",
"modin.experimental.pandas.io",
"`modin.experimental.pandas.read_pickle_distributed` is deprecated and will be removed in a future version. "
+ "Please use `modin.experimental.pandas.to_pickle_glob` instead.",
)
10 changes: 5 additions & 5 deletions modin/experimental/pandas/io.py
@@ -303,7 +303,7 @@ def _read(**kwargs) -> DataFrame:


@expanduser_path_arg("filepath_or_buffer")
def read_pickle_distributed(
def read_pickle_glob(
filepath_or_buffer,
compression: Optional[str] = "infer",
storage_options: StorageOptions = None,
@@ -313,7 +313,7 @@ def read_pickle_distributed(

This experimental feature provides parallel reading from multiple pickle files which are
defined by glob pattern. The files must contain parts of one dataframe, which can be
obtained, for example, by `DataFrame.modin.to_pickle_distributed` function.
obtained, for example, by `DataFrame.modin.to_pickle_glob` function.

Parameters
----------
@@ -344,11 +344,11 @@

from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

return DataFrame(query_compiler=FactoryDispatcher.read_pickle_distributed(**kwargs))
return DataFrame(query_compiler=FactoryDispatcher.read_pickle_glob(**kwargs))


@expanduser_path_arg("filepath_or_buffer")
def to_pickle_distributed(
def to_pickle_glob(
self,
filepath_or_buffer,
compression: CompressionOptions = "infer",
@@ -392,7 +392,7 @@ def to_pickle_distributed(

if isinstance(self, DataFrame):
obj = self._query_compiler
FactoryDispatcher.to_pickle_distributed(
FactoryDispatcher.to_pickle_glob(
obj,
filepath_or_buffer=filepath_or_buffer,
compression=compression,
12 changes: 7 additions & 5 deletions modin/experimental/pandas/test/test_io_exp.py
@@ -249,7 +249,7 @@ def _pandas_read_csv_glob(path, storage_options):
@pytest.mark.parametrize(
"filename", ["test_default_to_pickle.pkl", "test_to_pickle*.pkl"]
)
def test_distributed_pickling(tmp_path, filename, compression, pathlike):
@pytest.mark.parametrize("read_func", ["read_pickle_glob", "read_pickle_distributed"])
@pytest.mark.parametrize("to_func", ["to_pickle_glob", "to_pickle_distributed"])
def test_distributed_pickling(
tmp_path, filename, compression, pathlike, read_func, to_func
):
data = test_data["int_data"]
df = pd.DataFrame(data)

@@ -264,10 +268,8 @@ def test_distributed_pickling(tmp_path, filename, compression, pathlike):
if filename_param == "test_default_to_pickle.pkl"
else contextlib.nullcontext()
):
df.modin.to_pickle_distributed(
str(tmp_path / filename), compression=compression
)
pickled_df = pd.read_pickle_distributed(
getattr(df.modin, to_func)(str(tmp_path / filename), compression=compression)
pickled_df = getattr(pd, read_func)(
str(tmp_path / filename), compression=compression
)
df_equals(pickled_df, df)
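The test now exercises both the new and the deprecated names through getattr-based parametrization. To run just this test from a source checkout (pytest and a configured engine assumed), something like the following should work:

import pytest

# Select only the distributed/glob pickling cases from the experimental IO tests.
pytest.main(
    ["modin/experimental/pandas/test/test_io_exp.py", "-k", "distributed_pickling"]
)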
23 changes: 20 additions & 3 deletions modin/pandas/accessor.py
@@ -22,6 +22,7 @@
"""

import pickle
import warnings

import pandas
from pandas._typing import CompressionOptions, StorageOptions
@@ -209,7 +210,7 @@ class ExperimentalFunctions:
def __init__(self, data):
self._data = data

def to_pickle_distributed(
def to_pickle_glob(
self,
filepath_or_buffer,
compression: CompressionOptions = "infer",
@@ -248,16 +249,32 @@ def to_pickle_distributed(
this argument with a non-fsspec URL. See the fsspec and backend storage
implementation docs for the set of allowed keys and values.
"""
from modin.experimental.pandas.io import to_pickle_distributed
from modin.experimental.pandas.io import to_pickle_glob

to_pickle_distributed(
to_pickle_glob(
self._data,
filepath_or_buffer=filepath_or_buffer,
compression=compression,
protocol=protocol,
storage_options=storage_options,
)

def to_pickle_distributed(
self,
filepath_or_buffer,
compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
) -> None: # noqa
warnings.warn(
"`DataFrame.modin.to_pickle_distributed` is deprecated and will be removed in a future version. "
+ "Please use `DataFrame.modin.to_pickle_glob` instead.",
category=FutureWarning,
)
return self.to_pickle_glob(
filepath_or_buffer, compression, protocol, storage_options
)
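Together with the module-level alias added in modin/experimental/pandas/__init__.py, this shim keeps the old names importable while steering users to the glob names. A rough sketch of what callers should observe — warning text per the diff above; the exact category for the module-level alias is an assumption, and the path is illustrative:

import os
import warnings

import modin.experimental.pandas as pd

out_dir = "/tmp/pickle_glob_demo"  # illustrative location
os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame({"a": range(4)})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # The old accessor name still works, but warns before delegating to to_pickle_glob.
    df.modin.to_pickle_distributed(os.path.join(out_dir, "part*.pkl"))
    # The old module-level reader is likewise a deprecated alias for read_pickle_glob.
    _ = pd.read_pickle_distributed(os.path.join(out_dir, "part*.pkl"))

assert any(issubclass(w.category, FutureWarning) for w in caught)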

def to_parquet_glob(
self,
path,
Expand Down