Skip to content

Commit

Permalink
DOCS-#3766: update 'read_csv_glob' dispatcher, parser and docstring (#…
Browse files Browse the repository at this point in the history
…3797)

Co-authored-by: Devin Petersohn <devin-petersohn@users.noreply.github.com>
Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
Co-authored-by: Alexey Prutskov <alexey.prutskov@intel.com>
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
4 people committed Jan 6, 2022
1 parent 70cc1d0 commit be10ba9
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 1 deletion.
9 changes: 9 additions & 0 deletions modin/core/io/text/csv_glob_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,15 @@ def _read(cls, filepath_or_buffer, **kwargs):
# Ensures that the file is a string file path. Otherwise, default to pandas.
filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
if isinstance(filepath_or_buffer, str):
# `os.altsep` is None on Linux, so empty separators are filtered out below
is_folder = any(
filepath_or_buffer.endswith(sep) for sep in (os.sep, os.altsep) if sep
)
if "*" not in filepath_or_buffer and not is_folder:
warnings.warn(
"Shell-style wildcard '*' must be in the filename pattern in order to read multiple "
f"files at once. Did you forget it? Passed filename: '{filepath_or_buffer}'"
)
if not cls.file_exists(filepath_or_buffer):
return cls.single_worker_read(filepath_or_buffer, **kwargs)
filepath_or_buffer = cls.get_path(filepath_or_buffer)
Expand Down
4 changes: 4 additions & 0 deletions modin/core/storage_formats/pandas/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,10 @@ def parse(chunks, **kwargs):
num_splits = kwargs.pop("num_splits", None)
index_col = kwargs.get("index_col", None)

# `single_worker_read` passes the filename itself via `chunks`, so check for that case
if isinstance(chunks, str):
return pandas.read_csv(chunks, **kwargs)

# pop `compression` from kwargs because `bio` below is uncompressed
compression = kwargs.pop("compression", "infer")
storage_options = kwargs.pop("storage_options", None) or {}
Expand Down
23 changes: 22 additions & 1 deletion modin/experimental/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _read(**kwargs) -> DataFrame:
General documentation is available in `modin.pandas.read_csv`.
This experimental feature provides parallel reading from multiple csv files which are
defined by glob pattern. Works for local files only!
defined by glob pattern.
Parameters
----------
Expand All @@ -201,6 +201,27 @@ def _read(**kwargs) -> DataFrame:
Returns
-------
modin.DataFrame
Examples
--------
>>> import modin.experimental.pandas as pd
>>> df = pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-1*")
UserWarning: `read_*` implementation has mismatches with pandas:
Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.
VendorID tpep_pickup_datetime ... total_amount congestion_surcharge
0 1.0 2020-10-01 00:09:08 ... 4.30 0.0
1 1.0 2020-10-01 00:09:19 ... 13.30 2.5
2 1.0 2020-10-01 00:30:00 ... 15.36 2.5
3 2.0 2020-10-01 00:56:46 ... -3.80 0.0
4 2.0 2020-10-01 00:56:46 ... 3.80 0.0
... ... ... ... ... ...
4652008 NaN 2020-12-31 23:44:35 ... 43.95 2.5
4652009 NaN 2020-12-31 23:41:36 ... 20.17 2.5
4652010 NaN 2020-12-31 23:01:17 ... 78.98 0.0
4652011 NaN 2020-12-31 23:31:29 ... 39.50 0.0
4652012 NaN 2020-12-31 23:12:48 ... 20.64 0.0
[4652013 rows x 18 columns]
"""
Engine.subscribe(_update_engine)

Expand Down
5 changes: 5 additions & 0 deletions modin/experimental/pandas/test/test_io_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ def test_read_csv_empty_frame(self):

df_equals(modin_df, pandas_df)

def test_read_csv_without_glob(self):
    """Check that a pattern without '*' warns and then raises FileNotFoundError."""
    path_without_wildcard = "s3://nyc-tlc/trip data/yellow_tripdata_2020-"
    expect_warning = pytest.warns(UserWarning, match=r"Shell-style wildcard")
    expect_error = pytest.raises(FileNotFoundError)
    # The warning context is entered first (outermost), the exception context second,
    # mirroring a nested `with` pair.
    with expect_warning, expect_error:
        pd.read_csv_glob(path_without_wildcard)


@pytest.mark.skipif(
Engine.get() != "Ray", reason="Currently only support Ray engine for glob paths."
Expand Down

0 comments on commit be10ba9

Please sign in to comment.