Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6906: Update to pandas 2.2.* #6907

Merged
merged 27 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/supported_apis/utilities_supported.rst
Expand Up @@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
* DateOffset
* ExcelWriter
* SparseArray
* SparseSeries
* SparseDataFrame

.. _open an issue: https://github.com/modin-project/modin/issues
.. _pull request: https://github.com/modin-project/modin/pulls
Expand Down
29 changes: 14 additions & 15 deletions environment-dev.yml
Expand Up @@ -5,9 +5,9 @@ dependencies:
- pip

# required dependencies
- pandas>=2.1,<2.2
- pandas>=2.2,<2.3
- numpy>=1.22.4
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- packaging>=21.0
- psutil>=5.8.0

Expand All @@ -20,21 +20,21 @@ dependencies:
- grpcio!=1.46.*
- dask>=2.22.0
- distributed>=2.22.0
- xarray>=2022.03.0
- xarray>=2022.12.0
- jinja2>=3.1.2
- scipy>=1.8.1
- s3fs>=2022.05.0
- lxml>=4.8.0
- openpyxl>=3.0.10
- scipy>=1.10.0
- s3fs>=2022.11.0
- lxml>=4.9.2
- openpyxl>=3.1.0
- xlrd>=2.0.1
- matplotlib>=3.6.1
- sqlalchemy>=1.4.0,<1.4.46
- pandas-gbq>=0.15.0
- pytables>=3.7.0
- matplotlib>=3.6.3
- sqlalchemy>=2.0.0
- pandas-gbq>=0.19.0
- pytables>=3.8.0
# pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
- pymssql>=2.1.5,!=2.2.8
- psycopg2>=2.9.3
- fastparquet>=0.8.1
- psycopg2>=2.9.6
- fastparquet>=2022.12.0
- tqdm>=4.60.0
# pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
- numexpr<2.8.5
Expand Down Expand Up @@ -64,8 +64,7 @@ dependencies:
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
- fuzzydata>=0.0.6
- fuzzydata>=0.0.11
# Fixes breaking ipywidgets changes, but a new version hasn't been released yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
Expand Down
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
modin[dask]
Expand Down
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
tqdm>=4.60.0
Expand Down
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- pip
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- jupyterlab
- ipywidgets
- modin-mpi
Expand Down
4 changes: 4 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/__init__.py
Expand Up @@ -19,10 +19,12 @@
from .datetime import DateTimeDefault
from .default import DefaultMethod
from .groupby import GroupByDefault, SeriesGroupByDefault
from .list import ListDefault
from .resample import ResampleDefault
from .rolling import ExpandingDefault, RollingDefault
from .series import SeriesDefault
from .str import StrDefault
from .struct import StructDefault

__all__ = [
"DataFrameDefault",
Expand All @@ -37,4 +39,6 @@
"CatDefault",
"GroupByDefault",
"SeriesGroupByDefault",
"ListDefault",
"StructDefault",
]
35 changes: 35 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/list.py
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-list accessor functions builder class."""

from .series import SeriesDefault


class ListDefault(SeriesDefault):
    """Builder for default-to-pandas methods which are executed under the list accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get the list accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame
            One-column frame; its single column is expected to hold
            pyarrow-backed list values (otherwise accessing ``.list`` raises).

        Returns
        -------
        pandas.core.arrays.arrow.ListAccessor
        """
        # Collapse the one-column frame to a Series, then expose its
        # pyarrow list accessor for the default-to-pandas method call.
        single_column = df.squeeze(axis=1)
        return single_column.list
35 changes: 35 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/struct.py
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-struct accessor functions builder class."""

from .series import SeriesDefault


class StructDefault(SeriesDefault):
    """Builder for default-to-pandas methods which are executed under the struct accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get the struct accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame
            One-column frame; its single column is expected to hold
            pyarrow-backed struct values (otherwise accessing ``.struct`` raises).

        Returns
        -------
        pandas.core.arrays.arrow.StructAccessor
        """
        # Collapse the one-column frame to a Series, then expose its
        # pyarrow struct accessor for the default-to-pandas method call.
        single_column = df.squeeze(axis=1)
        return single_column.struct
4 changes: 4 additions & 0 deletions modin/core/io/io.py
Expand Up @@ -478,6 +478,8 @@ def read_fwf(
widths=None,
infer_nrows=100,
dtype_backend=no_default,
iterator=False,
chunksize=None,
**kwds,
): # noqa: PR01
ErrorMessage.default_to_pandas("`read_fwf`")
Expand All @@ -487,6 +489,8 @@ def read_fwf(
widths=widths,
infer_nrows=infer_nrows,
dtype_backend=dtype_backend,
iterator=iterator,
chunksize=chunksize,
**kwds,
)
if isinstance(pd_obj, pandas.DataFrame):
Expand Down
91 changes: 91 additions & 0 deletions modin/core/storage_formats/base/query_compiler.py
Expand Up @@ -35,11 +35,13 @@
DateTimeDefault,
ExpandingDefault,
GroupByDefault,
ListDefault,
ResampleDefault,
RollingDefault,
SeriesDefault,
SeriesGroupByDefault,
StrDefault,
StructDefault,
)
from modin.error_message import ErrorMessage
from modin.logging import ClassLogger
Expand Down Expand Up @@ -6563,6 +6565,88 @@ def cat_codes(self):

# End of Categories methods

# List accessor's methods

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.flatten")
def list_flatten(self):
    """
    Flatten list values.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.flatten through the list accessor.
    flattener = ListDefault.register(pandas.Series.list.flatten)
    return flattener(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.len")
def list_len(self):
    """
    Return the length of each list in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.len through the list accessor.
    length_getter = ListDefault.register(pandas.Series.list.len)
    return length_getter(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.__getitem__")
def list__getitem__(self, key):  # noqa: PR01
    """
    Index or slice lists in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.__getitem__ through the list accessor.
    item_getter = ListDefault.register(pandas.Series.list.__getitem__)
    return item_getter(self, key=key)

# End of List accessor's methods

# Struct accessor's methods

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.dtypes")
def struct_dtypes(self):
    """
    Return the dtype object of each child field of the struct.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: read Series.struct.dtypes through the struct accessor.
    dtypes_getter = StructDefault.register(pandas.Series.struct.dtypes)
    return dtypes_getter(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.field")
def struct_field(self, name_or_index):  # noqa: PR01
    """
    Extract a child field of a struct as a Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.struct.field through the struct accessor.
    field_getter = StructDefault.register(pandas.Series.struct.field)
    return field_getter(self, name_or_index=name_or_index)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.explode")
def struct_explode(self):
    """
    Extract all child fields of a struct as a DataFrame.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.struct.explode through the struct accessor.
    exploder = StructDefault.register(pandas.Series.struct.explode)
    return exploder(self)

# End of Struct accessor's methods

# DataFrame methods

def invert(self):
Expand Down Expand Up @@ -6617,6 +6701,13 @@ def compare(self, other, align_axis, keep_shape, keep_equal, result_names):
result_names=result_names,
)

@doc_utils.add_refer_to("Series.case_when")
def case_when(self, caselist):  # noqa: PR01, RT01, D200
    """
    Replace values where the conditions are True.
    """
    # Default to pandas: Series.case_when was introduced in pandas 2.2.
    replacer = SeriesDefault.register(pandas.Series.case_when)
    return replacer(self, caselist=caselist)

def repartition(self, axis=None):
"""
Repartitioning QueryCompiler objects to get ideal partitions inside.
Expand Down
Expand Up @@ -142,7 +142,10 @@ def get_col_names():
kwargs["filepath_or_buffer"], nrows=0, engine="c"
).columns.tolist()

if dtype := kwargs["dtype"]:
dtype = kwargs["dtype"]
# For details: https://github.com/pandas-dev/pandas/issues/57024
entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
if dtype:
if isinstance(dtype, dict):
column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
else:
Expand All @@ -151,7 +154,9 @@ def get_col_names():
else:
column_types = {}

if parse_dates := kwargs["parse_dates"]:
if parse_dates := (
None if entire_dataframe_dtype else kwargs["parse_dates"]
):
# Either list of column names or list of column indices is supported.
if isinstance(parse_dates, list) and (
all(isinstance(col, str) for col in parse_dates)
Expand Down Expand Up @@ -185,7 +190,7 @@ def get_col_names():
usecols_md = cls._prepare_pyarrow_usecols(kwargs)

po = ParseOptions(
delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we leave the previous code?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lib.no_default is equivalent to a False value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite. I see, we can leave the new change as is.

from pandas._libs import lib

a = lib.no_default

def f(a):
    if a:
        print("A")
    else:
        print("B")

f(a)
# A

Copy link
Collaborator

@YarShev YarShev Feb 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, maybe we should make this condition (and all related ones) more explicit?

delimiter="\\s+" if kwargs["delim_whitespace"] and kwargs["delim_whitespace"] is not lib.no_default else delimiter

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite

Let me clarify. In an if condition, lib.no_default is truthy (equivalent to True), but for this particular pandas method it is treated as a False value.

Although, maybe we should make this condition (and all related ones) more explicit?

It seems to me that this is an additional burden on the reader of the code.

quote_char=kwargs["quotechar"],
double_quote=kwargs["doublequote"],
escape_char=kwargs["escapechar"],
Expand Down Expand Up @@ -426,7 +431,7 @@ def _read_csv_check_support(
False,
f"read_csv with 'arrow' engine doesn't support {arg} parameter",
)
if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
raise ValueError(
"Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
)
Expand Down Expand Up @@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
if delim_whitespace is True and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
+ "delim_whitespace=True; you can only specify one."
Expand Down
Expand Up @@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
# TODO: make sure we can ignore this warning
or "Frame contain columns with unsupported data-types"
in message
# Looks like the warning comes from pyarrow, more details:
# https://github.com/pandas-dev/pandas/pull/52419
or "Passing a BlockManager to DataFrame is deprecated"
in message
):
continue
assert (
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/core/io/sql/utils.py
Expand Up @@ -15,7 +15,7 @@

import pandas
import pandas._libs.lib as lib
from sqlalchemy import MetaData, Table, create_engine, inspect
from sqlalchemy import MetaData, Table, create_engine, inspect, text

from modin.core.storage_formats.pandas.parsers import _split_result_for_readers

Expand Down Expand Up @@ -167,9 +167,9 @@ def get_query_columns(engine, query):
Dictionary with columns names and python types.
"""
con = engine.connect()
result = con.execute(query).fetchone()
values = list(result)
result = con.execute(text(query))
YarShev marked this conversation as resolved.
Show resolved Hide resolved
cols_names = list(result.keys())
values = list(result.first())
cols = dict()
for i in range(len(cols_names)):
cols[cols_names[i]] = type(values[i]).__name__
Expand Down