FEAT-#6906: Update to pandas 2.2.* #6907

Merged · 27 commits · Feb 9, 2024
2 changes: 0 additions & 2 deletions docs/supported_apis/utilities_supported.rst
@@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
* DateOffset
* ExcelWriter
* SparseArray
- * SparseSeries
- * SparseDataFrame

.. _open an issue: https://github.com/modin-project/modin/issues
.. _pull request: https://github.com/modin-project/modin/pulls
28 changes: 14 additions & 14 deletions environment-dev.yml
@@ -5,9 +5,9 @@ dependencies:
  - pip

  # required dependencies
-  - pandas>=2.1,<2.2
+  - pandas>=2.2,<2.3
  - numpy>=1.22.4
-  - fsspec>=2022.05.0
+  - fsspec>=2022.11.0
  - packaging>=21.0
  - psutil>=5.8.0

@@ -20,21 +20,21 @@ dependencies:
  - grpcio!=1.46.*
  - dask>=2.22.0
  - distributed>=2.22.0
-  - xarray>=2022.03.0
+  - xarray>=2022.12.0
  - jinja2>=3.1.2
-  - scipy>=1.8.1
-  - s3fs>=2022.05.0
-  - lxml>=4.8.0
-  - openpyxl>=3.0.10
+  - scipy>=1.10.0
+  - s3fs>=2022.11.0
+  - lxml>=4.9.2
+  - openpyxl>=3.1.0
  - xlrd>=2.0.1
-  - matplotlib>=3.6.1
-  - sqlalchemy>=1.4.0,<1.4.46
-  - pandas-gbq>=0.15.0
-  - pytables>=3.7.0
+  - matplotlib>=3.6.3
+  - sqlalchemy>=2.0.0
+  - pandas-gbq>=0.19.0
+  - pytables>=3.8.0
  # pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
  - pymssql>=2.1.5,!=2.2.8
-  - psycopg2>=2.9.3
-  - fastparquet>=0.8.1
+  - psycopg2>=2.9.6
+  - fastparquet>=2022.12.0
  - tqdm>=4.60.0
  # pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
  - numexpr<2.8.5
@@ -65,7 +65,7 @@ dependencies:
  # no conda package for windows so we install it with pip
  - connectorx>=0.2.6a4
  # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
-  - fuzzydata>=0.0.6
+  # - fuzzydata>=0.0.6
  # Fixes breaking ipywidgets changes, but didn't release yet.
  - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
  # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
@@ -1,4 +1,4 @@
-fsspec>=2022.05.0
+fsspec>=2022.11.0
jupyterlab
ipywidgets
modin[dask]
@@ -1,4 +1,4 @@
-fsspec>=2022.05.0
+fsspec>=2022.11.0
jupyterlab
ipywidgets
tqdm>=4.60.0
@@ -3,7 +3,7 @@ channels:
  - conda-forge
dependencies:
  - pip
-  - fsspec>=2022.05.0
+  - fsspec>=2022.11.0
  - jupyterlab
  - ipywidgets
  - modin-mpi
@@ -142,7 +142,10 @@ def get_col_names():
                kwargs["filepath_or_buffer"], nrows=0, engine="c"
            ).columns.tolist()

-        if dtype := kwargs["dtype"]:
+        dtype = kwargs["dtype"]
+        # For details: https://github.com/pandas-dev/pandas/issues/57024
+        entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
+        if dtype:
            if isinstance(dtype, dict):
                column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
            else:
@@ -151,7 +154,9 @@ def get_col_names():
        else:
            column_types = {}

-        if parse_dates := kwargs["parse_dates"]:
+        if parse_dates := (
+            None if entire_dataframe_dtype else kwargs["parse_dates"]
+        ):
            # Either list of column names or list of column indices is supported.
            if isinstance(parse_dates, list) and (
                all(isinstance(col, str) for col in parse_dates)
@@ -185,7 +190,7 @@ def get_col_names():
        usecols_md = cls._prepare_pyarrow_usecols(kwargs)

        po = ParseOptions(
-            delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
+            delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
Collaborator: Why can't we leave the previous code?

Collaborator (Author): `lib.no_default` is equivalent to a `False` value.

Collaborator: You probably mean the opposite. I see, we can leave the new change as is.

```python
from pandas._libs import lib

a = lib.no_default

def f(a):
    if a:
        print("A")
    else:
        print("B")

f(a)
# A
```

Collaborator (@YarShev, Feb 9, 2024): Although, maybe we should make this condition (and all related ones) more explicit?

```python
delimiter="\\s+" if kwargs["delim_whitespace"] and kwargs["delim_whitespace"] is not lib.no_default else delimiter
```

Collaborator (Author):

> You probably mean the opposite

Let me clarify. `lib.no_default` is equivalent to a `True` value in an `if` condition, but in pandas, for this particular method, it's equivalent to a `False` value.

> Although, maybe we should make this condition (and all related ones) more explicit?

It seems to me that this is an additional burden on the reader of the code.

            quote_char=kwargs["quotechar"],
            double_quote=kwargs["doublequote"],
            escape_char=kwargs["escapechar"],
@@ -426,7 +431,7 @@ def _read_csv_check_support(
                False,
                f"read_csv with 'arrow' engine doesn't support {arg} parameter",
            )
-        if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
+        if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
            raise ValueError(
                "Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
            )
@@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
    if delimiter is None:
        delimiter = sep

-    if delim_whitespace and (delimiter is not lib.no_default):
+    if delim_whitespace is True and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            + "delim_whitespace=True; you can only specify one."
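Throughout this file the bare truthiness checks on `delim_whitespace` become explicit `is True` comparisons because pandas 2.2 changed the parameter's default to the `lib.no_default` sentinel, which is truthy. A minimal sketch of the distinction (the helper function is illustrative, not Modin's code):

```python
from pandas._libs import lib

# The sentinel is truthy, so `if delim_whitespace:` cannot tell
# "argument not passed" apart from an explicit delim_whitespace=True.
assert bool(lib.no_default) is True
assert (lib.no_default is True) is False

def uses_whitespace_delimiter(delim_whitespace):
    # Mirrors the explicit check used above
    return delim_whitespace is True

print(uses_whitespace_delimiter(True))            # True
print(uses_whitespace_delimiter(lib.no_default))  # False
```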
@@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
                    # TODO: make sure we can ignore this warning
                    or "Frame contain columns with unsupported data-types"
                    in message
+                    # Looks like the warning comes from pyarrow, more details:
+                    # https://github.com/pandas-dev/pandas/pull/52419
+                    or "Passing a BlockManager to DataFrame is deprecated"
+                    in message
                ):
                    continue
                assert (
6 changes: 3 additions & 3 deletions modin/experimental/core/io/sql/utils.py
@@ -15,7 +15,7 @@

import pandas
import pandas._libs.lib as lib
-from sqlalchemy import MetaData, Table, create_engine, inspect
+from sqlalchemy import MetaData, Table, create_engine, inspect, text

from modin.core.storage_formats.pandas.parsers import _split_result_for_readers
@@ -167,9 +167,9 @@ def get_query_columns(engine, query):
        Dictionary with columns names and python types.
    """
    con = engine.connect()
-    result = con.execute(query).fetchone()
-    values = list(result)
+    result = con.execute(text(query))
    cols_names = list(result.keys())
+    values = list(result.first())
    cols = dict()
    for i in range(len(cols_names)):
        cols[cols_names[i]] = type(values[i]).__name__
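The rewrite above targets SQLAlchemy 2.0, where `Connection.execute` no longer accepts a raw SQL string (it must be wrapped in `text()`) and where reading `result.keys()` before fetching the first row replaces the old `fetchone()` flow. The column-name/python-type mapping that `get_query_columns` builds can be sketched with the stdlib `sqlite3` module (an analogue for illustration, not Modin's code):

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (id INTEGER, name TEXT)")
con.execute("INSERT INTO t VALUES (1, 'a')")

cur = con.execute("SELECT id, name FROM t")
cols_names = [d[0] for d in cur.description]  # like result.keys() in SQLAlchemy
values = cur.fetchone()                       # like result.first()
cols = {name: type(value).__name__ for name, value in zip(cols_names, values)}
print(cols)  # {'id': 'int', 'name': 'str'}
```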
6 changes: 3 additions & 3 deletions modin/experimental/pandas/io.py
@@ -201,11 +201,11 @@ def parser_func(
    na_values=None,
    keep_default_na=True,
    na_filter=True,
-    verbose=False,
+    verbose=lib.no_default,
    skip_blank_lines=True,
    parse_dates=None,
    infer_datetime_format=lib.no_default,
-    keep_date_col=False,
+    keep_date_col=lib.no_default,
    date_parser=lib.no_default,
    date_format=None,
    dayfirst=False,
@@ -225,7 +225,7 @@ def parser_func(
    dialect=None,
    on_bad_lines="error",
    doublequote=True,
-    delim_whitespace=False,
+    delim_whitespace=lib.no_default,
    low_memory=True,
    memory_map=False,
    float_precision=None,
2 changes: 1 addition & 1 deletion modin/pandas/__init__.py
@@ -16,7 +16,7 @@
import pandas
from packaging import version

-__pandas_version__ = "2.1"
+__pandas_version__ = "2.2"

if (
    version.parse(pandas.__version__).release[:2]
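The compatibility guard compares `version.parse(...).release[:2]`, i.e. only the major/minor components. A quick sketch of what that yields (assuming the `packaging` library is installed):

```python
from packaging import version

# release[:2] keeps just (major, minor), ignoring the patch component,
# so any pandas 2.2.x satisfies the __pandas_version__ = "2.2" check
print(version.parse("2.2.1").release)      # (2, 2, 1)
print(version.parse("2.2.1").release[:2])  # (2, 2)
assert version.parse("2.2.1").release[:2] == version.parse("2.2").release[:2]
```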
48 changes: 35 additions & 13 deletions modin/pandas/base.py
@@ -520,15 +520,7 @@
            failure_condition=True,
            extra_log="{} is an unsupported operation".format(op),
        )
-        # SparseDataFrames cannot be serialized by arrow and cause problems for Modin.
-        # For now we will use pandas.
-        if isinstance(result, type(self)) and not isinstance(
-            result, (pandas.SparseDataFrame, pandas.SparseSeries)
-        ):
-            return self._create_or_update_from_compiler(
-                result, inplace=kwargs.get("inplace", False)
-            )
-        elif isinstance(result, pandas.DataFrame):
+        if isinstance(result, pandas.DataFrame):
            from .dataframe import DataFrame

            return DataFrame(result)
@@ -1106,11 +1098,26 @@
        return downcast

    def bfill(
-        self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
+        self,
+        *,
+        axis=None,
+        inplace=False,
+        limit=None,
+        limit_area=None,
+        downcast=lib.no_default,
    ):  # noqa: PR01, RT01, D200
        """
        Synonym for `DataFrame.fillna` with ``method='bfill'``.
        """
+        if limit_area is not None:
+            return self._default_to_pandas(
+                "bfill",
+                axis=axis,
+                inplace=inplace,
+                limit=limit,
+                limit_area=limit_area,
+                downcast=downcast,
+            )
        downcast = self._deprecate_downcast(downcast, "bfill")
        with warnings.catch_warnings():
            warnings.filterwarnings(
@@ -1599,11 +1606,26 @@
        )

    def ffill(
-        self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
+        self,
+        *,
+        axis=None,
+        inplace=False,
+        limit=None,
+        limit_area=None,
+        downcast=lib.no_default,
    ):  # noqa: PR01, RT01, D200
        """
        Synonym for `DataFrame.fillna` with ``method='ffill'``.
        """
+        if limit_area is not None:
+            return self._default_to_pandas(
+                "ffill",
+                axis=axis,
+                inplace=inplace,
+                limit=limit,
+                limit_area=limit_area,
+                downcast=downcast,
+            )
        downcast = self._deprecate_downcast(downcast, "ffill")
        with warnings.catch_warnings():
            warnings.filterwarnings(
@@ -2489,8 +2511,8 @@
        axis: Axis = lib.no_default,
        closed: Optional[str] = None,
        label: Optional[str] = None,
-        convention: str = "start",
-        kind: Optional[str] = None,
+        convention: str = lib.no_default,
+        kind: Optional[str] = lib.no_default,
        on: Level = None,
        level: Level = None,
        origin: Union[str, TimestampConvertibleTypes] = "start_day",
26 changes: 23 additions & 3 deletions modin/pandas/dataframe.py
@@ -394,12 +394,14 @@ def apply(
        result_type=None,
        args=(),
        by_row="compat",
+        engine="python",
+        engine_kwargs=None,
        **kwargs,
    ):  # noqa: PR01, RT01, D200
        """
        Apply a function along an axis of the ``DataFrame``.
        """
-        if by_row != "compat":
+        if by_row != "compat" or engine != "python":
            # TODO: add test
            return self._default_to_pandas(
                pandas.DataFrame.apply,
@@ -409,6 +411,8 @@ def apply(
                result_type=result_type,
                args=args,
                by_row=by_row,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
                **kwargs,
            )

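pandas 2.2 added `engine`/`engine_kwargs` to `DataFrame.apply` (enabling a numba-backed path), so the wrapper above keeps Modin's native implementation only for the default `engine="python"` and defers anything else to pandas. A plain-pandas sketch of the row-wise call being dispatched (no `engine` argument, so it runs on any pandas 2.x):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# With pandas >= 2.2, passing engine="numba" could JIT-compile this same row-wise apply
row_sums = df.apply(lambda row: row.sum(), axis=1)
print(row_sums.tolist())  # [4, 6]
```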
@@ -1446,7 +1450,7 @@ def pivot_table(
        margins=False,
        dropna=True,
        margins_name="All",
-        observed=False,
+        observed=lib.no_default,
        sort=True,
    ):  # noqa: PR01, RT01, D200
        """
@@ -1631,7 +1635,23 @@ def _get_axis_resolvers(self, axis: str) -> dict:
            d[axis] = dindex
        return d

-    _get_cleaned_column_resolvers = pandas.DataFrame._get_cleaned_column_resolvers
+    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:  # noqa: RT01
+        """
+        Return the special character free column resolvers of a dataframe.
+
+        Column names with special characters are 'cleaned up' so that they can
+        be referred to by backtick quoting.
+        Used in `DataFrame.eval`.
+
+        Notes
+        -----
+        Copied from pandas.
+        """
+        from pandas.core.computation.parsing import clean_column_name
+
+        return {
+            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
+        }

    def query(self, expr, inplace=False, **kwargs):  # noqa: PR01, RT01, D200
        """
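The resolver copied above exists so that `DataFrame.query`/`eval` can reference awkward column names through backtick quoting; plain pandas shows the behavior it enables:

```python
import pandas as pd

df = pd.DataFrame({"col 1": [1, 2, 3], "col-2": [4, 5, 6]})
# Backticks let query() resolve names containing spaces or special characters
out = df.query("`col 1` > 1")
print(out["col 1"].tolist())  # [2, 3]
```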
14 changes: 4 additions & 10 deletions modin/pandas/general.py
@@ -230,7 +230,7 @@ def pivot_table(
    margins=False,
    dropna=True,
    margins_name="All",
-    observed=False,
+    observed=no_default,
    sort=True,
):
    if not isinstance(data, DataFrame):
@@ -247,6 +247,7 @@ def pivot_table(
        margins=margins,
        dropna=dropna,
        margins_name=margins_name,
+        observed=observed,
        sort=sort,
    )

@@ -492,18 +493,11 @@ def concat(
        raise ValueError(
            "Only can inner (intersect) or outer (union) join the other axis"
        )
-    # We have the weird Series and axis check because, when concatenating a
-    # dataframe to a series on axis=0, pandas ignores the name of the series,
-    # and this check aims to mirror that (possibly buggy) functionality
    list_of_objs = [
        (
            obj._query_compiler
            if isinstance(obj, DataFrame)
-            else (
-                DataFrame(obj.rename())._query_compiler
-                if isinstance(obj, (pandas.Series, Series)) and axis == 0
-                else DataFrame(obj)._query_compiler
-            )
+            else DataFrame(obj)._query_compiler
        )
        for obj in list_of_objs
    ]
@@ -627,7 +621,7 @@ def get_dummies(
    """
    if sparse:
        raise NotImplementedError(
-            "SparseDataFrame is not implemented. "
+            "SparseArray is not implemented. "
            + "To contribute to Modin, please visit "
            + "github.com/modin-project/modin."
        )
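For contrast, plain pandas does accept `sparse=True`, backing each dummy column with a `SparseArray` — the structure Modin declines to distribute here:

```python
import pandas as pd

dummies = pd.get_dummies(pd.Series(["a", "b", "a"]), sparse=True)
# Every dummy column is backed by a pandas SparseArray
print(all(isinstance(dt, pd.SparseDtype) for dt in dummies.dtypes))  # True
```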