Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6906: Update to pandas 2.2.* #6907

Merged
merged 27 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/supported_apis/utilities_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
* DateOffset
* ExcelWriter
* SparseArray
* SparseSeries
* SparseDataFrame

.. _open an issue: https://github.com/modin-project/modin/issues
.. _pull request: https://github.com/modin-project/modin/pulls
Expand Down
29 changes: 14 additions & 15 deletions environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ dependencies:
- pip

# required dependencies
- pandas>=2.1,<2.2
- pandas>=2.2,<2.3
- numpy>=1.22.4
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- packaging>=21.0
- psutil>=5.8.0

Expand All @@ -20,21 +20,21 @@ dependencies:
- grpcio!=1.46.*
- dask>=2.22.0
- distributed>=2.22.0
- xarray>=2022.03.0
- xarray>=2022.12.0
- jinja2>=3.1.2
- scipy>=1.8.1
- s3fs>=2022.05.0
- lxml>=4.8.0
- openpyxl>=3.0.10
- scipy>=1.10.0
- s3fs>=2022.11.0
- lxml>=4.9.2
- openpyxl>=3.1.0
- xlrd>=2.0.1
- matplotlib>=3.6.1
- sqlalchemy>=1.4.0,<1.4.46
- pandas-gbq>=0.15.0
- pytables>=3.7.0
- matplotlib>=3.6.3
- sqlalchemy>=2.0.0
- pandas-gbq>=0.19.0
- pytables>=3.8.0
# pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
- pymssql>=2.1.5,!=2.2.8
- psycopg2>=2.9.3
- fastparquet>=0.8.1
- psycopg2>=2.9.6
- fastparquet>=2022.12.0
- tqdm>=4.60.0
# pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
- numexpr<2.8.5
Expand Down Expand Up @@ -64,8 +64,7 @@ dependencies:
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
- fuzzydata>=0.0.6
- fuzzydata>=0.0.11
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
modin[dask]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
tqdm>=4.60.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- pip
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- jupyterlab
- ipywidgets
- modin-mpi
Expand Down
4 changes: 4 additions & 0 deletions modin/core/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,8 @@ def read_fwf(
widths=None,
infer_nrows=100,
dtype_backend=no_default,
iterator=False,
chunksize=None,
**kwds,
): # noqa: PR01
ErrorMessage.default_to_pandas("`read_fwf`")
Expand All @@ -487,6 +489,8 @@ def read_fwf(
widths=widths,
infer_nrows=infer_nrows,
dtype_backend=dtype_backend,
iterator=iterator,
chunksize=chunksize,
**kwds,
)
if isinstance(pd_obj, pandas.DataFrame):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,10 @@ def get_col_names():
kwargs["filepath_or_buffer"], nrows=0, engine="c"
).columns.tolist()

if dtype := kwargs["dtype"]:
dtype = kwargs["dtype"]
# For details: https://github.com/pandas-dev/pandas/issues/57024
entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
if dtype:
if isinstance(dtype, dict):
column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
else:
Expand All @@ -151,7 +154,9 @@ def get_col_names():
else:
column_types = {}

if parse_dates := kwargs["parse_dates"]:
if parse_dates := (
None if entire_dataframe_dtype else kwargs["parse_dates"]
):
# Either list of column names or list of column indices is supported.
if isinstance(parse_dates, list) and (
all(isinstance(col, str) for col in parse_dates)
Expand Down Expand Up @@ -185,7 +190,7 @@ def get_col_names():
usecols_md = cls._prepare_pyarrow_usecols(kwargs)

po = ParseOptions(
delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we leave the previous code?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lib.no_default is equivalent to a False value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite. I see, we can leave the new change as is.

from pandas._libs import lib

a = lib.no_default

def f(a):
    if a:
        print("A")
    else:
        print("B")

f(a)
# A

Copy link
Collaborator

@YarShev YarShev Feb 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, maybe we should make this condition (and all related ones) more explicit?

delimiter="\\s+" if kwargs["delim_whitespace"] and kwargs["delim_whitespace"] is not lib.no_default else delimiter

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite

Let me clarify: lib.no_default is equivalent to a True value in an if condition, but in pandas, for this particular method, it is treated as a False value.

Although, maybe we should make this condition (and all related ones) more explicit?

It seems to me that this would place an additional burden on the reader of the code.

quote_char=kwargs["quotechar"],
double_quote=kwargs["doublequote"],
escape_char=kwargs["escapechar"],
Expand Down Expand Up @@ -426,7 +431,7 @@ def _read_csv_check_support(
False,
f"read_csv with 'arrow' engine doesn't support {arg} parameter",
)
if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
raise ValueError(
"Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
)
Expand Down Expand Up @@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
if delim_whitespace is True and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
+ "delim_whitespace=True; you can only specify one."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
# TODO: make sure we can ignore this warning
or "Frame contain columns with unsupported data-types"
in message
# Looks like the warning comes from pyarrow, more details:
# https://github.com/pandas-dev/pandas/pull/52419
or "Passing a BlockManager to DataFrame is deprecated"
in message
):
continue
assert (
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/core/io/sql/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import pandas
import pandas._libs.lib as lib
from sqlalchemy import MetaData, Table, create_engine, inspect
from sqlalchemy import MetaData, Table, create_engine, inspect, text

from modin.core.storage_formats.pandas.parsers import _split_result_for_readers

Expand Down Expand Up @@ -167,9 +167,9 @@ def get_query_columns(engine, query):
Dictionary with columns names and python types.
"""
con = engine.connect()
result = con.execute(query).fetchone()
values = list(result)
result = con.execute(text(query))
YarShev marked this conversation as resolved.
Show resolved Hide resolved
cols_names = list(result.keys())
values = list(result.first())
cols = dict()
for i in range(len(cols_names)):
cols[cols_names[i]] = type(values[i]).__name__
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,11 @@ def parser_func(
na_values=None,
keep_default_na=True,
na_filter=True,
verbose=False,
verbose=lib.no_default,
skip_blank_lines=True,
parse_dates=None,
infer_datetime_format=lib.no_default,
keep_date_col=False,
keep_date_col=lib.no_default,
date_parser=lib.no_default,
date_format=None,
dayfirst=False,
Expand All @@ -225,7 +225,7 @@ def parser_func(
dialect=None,
on_bad_lines="error",
doublequote=True,
delim_whitespace=False,
delim_whitespace=lib.no_default,
low_memory=True,
memory_map=False,
float_precision=None,
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pandas
from packaging import version

__pandas_version__ = "2.1"
__pandas_version__ = "2.2"

if (
version.parse(pandas.__version__).release[:2]
Expand Down
56 changes: 41 additions & 15 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def _binary_op(self, op, other, **kwargs):
new_query_compiler = getattr(self._query_compiler, op)(other, **kwargs)
return self._create_or_update_from_compiler(new_query_compiler)

def _default_to_pandas(self, op, *args, **kwargs):
def _default_to_pandas(self, op, *args, reason: str = None, **kwargs):
"""
Convert dataset to pandas type and call a pandas function on it.

Expand All @@ -481,6 +481,7 @@ def _default_to_pandas(self, op, *args, **kwargs):
Name of pandas function.
*args : list
Additional positional arguments to be passed to `op`.
reason : str, optional
**kwargs : dict
Additional keywords arguments to be passed to `op`.

Expand All @@ -495,7 +496,8 @@ def _default_to_pandas(self, op, *args, **kwargs):
type(self).__name__,
op if isinstance(op, str) else op.__name__,
empty_self_str,
)
),
reason=reason,
)

args = try_cast_to_pandas(args)
Expand All @@ -520,15 +522,7 @@ def _default_to_pandas(self, op, *args, **kwargs):
failure_condition=True,
extra_log="{} is an unsupported operation".format(op),
)
# SparseDataFrames cannot be serialized by arrow and cause problems for Modin.
# For now we will use pandas.
if isinstance(result, type(self)) and not isinstance(
result, (pandas.SparseDataFrame, pandas.SparseSeries)
):
return self._create_or_update_from_compiler(
result, inplace=kwargs.get("inplace", False)
)
elif isinstance(result, pandas.DataFrame):
if isinstance(result, pandas.DataFrame):
Dismissed Show dismissed Hide dismissed
from .dataframe import DataFrame

return DataFrame(result)
Expand Down Expand Up @@ -1106,11 +1100,27 @@ def _deprecate_downcast(self, downcast, method_name: str):
return downcast

def bfill(
self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
self,
*,
axis=None,
inplace=False,
limit=None,
limit_area=None,
downcast=lib.no_default,
): # noqa: PR01, RT01, D200
"""
Synonym for `DataFrame.fillna` with ``method='bfill'``.
"""
if limit_area is not None:
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
return self._default_to_pandas(
"bfill",
reason="'limit_area' parameter isn't supported",
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
downcast=downcast,
)
downcast = self._deprecate_downcast(downcast, "bfill")
with warnings.catch_warnings():
warnings.filterwarnings(
Expand Down Expand Up @@ -1599,11 +1609,27 @@ def expanding(
)

def ffill(
self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
self,
*,
axis=None,
inplace=False,
limit=None,
limit_area=None,
downcast=lib.no_default,
): # noqa: PR01, RT01, D200
"""
Synonym for `DataFrame.fillna` with ``method='ffill'``.
"""
if limit_area is not None:
return self._default_to_pandas(
"ffill",
reason="'limit_area' parameter isn't supported",
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
downcast=downcast,
)
downcast = self._deprecate_downcast(downcast, "ffill")
with warnings.catch_warnings():
warnings.filterwarnings(
Expand Down Expand Up @@ -2489,8 +2515,8 @@ def resample(
axis: Axis = lib.no_default,
closed: Optional[str] = None,
label: Optional[str] = None,
convention: str = "start",
kind: Optional[str] = None,
convention: str = lib.no_default,
kind: Optional[str] = lib.no_default,
on: Level = None,
level: Level = None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
Expand Down
26 changes: 23 additions & 3 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,12 +394,14 @@ def apply(
result_type=None,
args=(),
by_row="compat",
engine="python",
engine_kwargs=None,
**kwargs,
): # noqa: PR01, RT01, D200
"""
Apply a function along an axis of the ``DataFrame``.
"""
if by_row != "compat":
if by_row != "compat" or engine != "python" or engine_kwargs:
# TODO: add test
return self._default_to_pandas(
pandas.DataFrame.apply,
Expand All @@ -409,6 +411,8 @@ def apply(
result_type=result_type,
args=args,
by_row=by_row,
engine=engine,
engine_kwargs=engine_kwargs,
**kwargs,
)

Expand Down Expand Up @@ -1446,7 +1450,7 @@ def pivot_table(
margins=False,
dropna=True,
margins_name="All",
observed=False,
observed=lib.no_default,
sort=True,
): # noqa: PR01, RT01, D200
"""
Expand Down Expand Up @@ -1631,7 +1635,23 @@ def _get_axis_resolvers(self, axis: str) -> dict:
d[axis] = dindex
return d

_get_cleaned_column_resolvers = pandas.DataFrame._get_cleaned_column_resolvers
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: # noqa: RT01
"""
Return the special character free column resolvers of a dataframe.

Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in `DataFrame.eval`.

Notes
-----
Copied from pandas.
"""
from pandas.core.computation.parsing import clean_column_name
YarShev marked this conversation as resolved.
Show resolved Hide resolved

return {
clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
}

def query(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200
"""
Expand Down