DOCS-#3766: update 'read_csv_glob' dispatcher, parser and docstring (#3797)

Co-authored-by: Devin Petersohn <devin-petersohn@users.noreply.github.com>
Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
Co-authored-by: Alexey Prutskov <alexey.prutskov@intel.com>
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
modin-project · Jan 6, 2022 · be10ba9 · be10ba9
1 parent 70cc1d0
commit be10ba9
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 1 deletion.
diff --git a/modin/core/io/text/csv_glob_dispatcher.py b/modin/core/io/text/csv_glob_dispatcher.py
@@ -54,6 +54,15 @@ def _read(cls, filepath_or_buffer, **kwargs):
         # Ensures that the file is a string file path. Otherwise, default to pandas.
         filepath_or_buffer = cls.get_path_or_buffer(filepath_or_buffer)
         if isinstance(filepath_or_buffer, str):
+            # `os.altsep` is None on Linux, so falsy separators are skipped below
+            is_folder = any(
+                filepath_or_buffer.endswith(sep) for sep in (os.sep, os.altsep) if sep
+            )
+            if "*" not in filepath_or_buffer and not is_folder:
+                warnings.warn(
+                    "Shell-style wildcard '*' must be in the filename pattern in order to read multiple "
+                    f"files at once. Did you forget it? Passed filename: '{filepath_or_buffer}'"
+                )
             if not cls.file_exists(filepath_or_buffer):
                 return cls.single_worker_read(filepath_or_buffer, **kwargs)
             filepath_or_buffer = cls.get_path(filepath_or_buffer)

diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py
@@ -308,6 +308,10 @@ def parse(chunks, **kwargs):
         num_splits = kwargs.pop("num_splits", None)
         index_col = kwargs.get("index_col", None)
 
+        # `single_worker_read` passes the filename itself via `chunks`, so check for that case
+        if isinstance(chunks, str):
+            return pandas.read_csv(chunks, **kwargs)
+
         # pop `compression` from kwargs because `bio` below is uncompressed
         compression = kwargs.pop("compression", "infer")
         storage_options = kwargs.pop("storage_options", None) or {}

diff --git a/modin/experimental/pandas/io.py b/modin/experimental/pandas/io.py
@@ -191,7 +191,7 @@ def _read(**kwargs) -> DataFrame:
     General documentation is available in `modin.pandas.read_csv`.
 
     This experimental feature provides parallel reading from multiple csv files which are
-    defined by glob pattern. Works for local files only!
+    defined by glob pattern.
 
     Parameters
     ----------
@@ -201,6 +201,27 @@ def _read(**kwargs) -> DataFrame:
     Returns
     -------
     modin.DataFrame
+
+    Examples
+    --------
+    >>> import modin.experimental.pandas as pd
+    >>> df = pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-1*")
+    UserWarning: `read_*` implementation has mismatches with pandas:
+    Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.
+            VendorID tpep_pickup_datetime  ... total_amount  congestion_surcharge
+    0             1.0  2020-10-01 00:09:08  ...         4.30                   0.0
+    1             1.0  2020-10-01 00:09:19  ...        13.30                   2.5
+    2             1.0  2020-10-01 00:30:00  ...        15.36                   2.5
+    3             2.0  2020-10-01 00:56:46  ...        -3.80                   0.0
+    4             2.0  2020-10-01 00:56:46  ...         3.80                   0.0
+    ...           ...                  ...  ...          ...                   ...
+    4652008       NaN  2020-12-31 23:44:35  ...        43.95                   2.5
+    4652009       NaN  2020-12-31 23:41:36  ...        20.17                   2.5
+    4652010       NaN  2020-12-31 23:01:17  ...        78.98                   0.0
+    4652011       NaN  2020-12-31 23:31:29  ...        39.50                   0.0
+    4652012       NaN  2020-12-31 23:12:48  ...        20.64                   0.0
+
+    [4652013 rows x 18 columns]
     """
     Engine.subscribe(_update_engine)
 

diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py
@@ -112,6 +112,11 @@ def test_read_csv_empty_frame(self):
 
         df_equals(modin_df, pandas_df)
 
+    def test_read_csv_without_glob(self):
+        with pytest.warns(UserWarning, match=r"Shell-style wildcard"):
+            with pytest.raises(FileNotFoundError):
+                pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-")
+
 
 @pytest.mark.skipif(
     Engine.get() != "Ray", reason="Currently only support Ray engine for glob paths."