Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6906: Update to pandas 2.2.* #6907

Merged
merged 27 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/supported_apis/utilities_supported.rst
Expand Up @@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
* DateOffset
* ExcelWriter
* SparseArray
* SparseSeries
* SparseDataFrame

.. _open an issue: https://github.com/modin-project/modin/issues
.. _pull request: https://github.com/modin-project/modin/pulls
Expand Down
29 changes: 14 additions & 15 deletions environment-dev.yml
Expand Up @@ -5,9 +5,9 @@ dependencies:
- pip

# required dependencies
- pandas>=2.1,<2.2
- pandas>=2.2,<2.3
- numpy>=1.22.4
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- packaging>=21.0
- psutil>=5.8.0

Expand All @@ -20,21 +20,21 @@ dependencies:
- grpcio!=1.46.*
- dask>=2.22.0
- distributed>=2.22.0
- xarray>=2022.03.0
- xarray>=2022.12.0
- jinja2>=3.1.2
- scipy>=1.8.1
- s3fs>=2022.05.0
- lxml>=4.8.0
- openpyxl>=3.0.10
- scipy>=1.10.0
- s3fs>=2022.11.0
- lxml>=4.9.2
- openpyxl>=3.1.0
- xlrd>=2.0.1
- matplotlib>=3.6.1
- sqlalchemy>=1.4.0,<1.4.46
- pandas-gbq>=0.15.0
- pytables>=3.7.0
- matplotlib>=3.6.3
- sqlalchemy>=2.0.0
- pandas-gbq>=0.19.0
- pytables>=3.8.0
# pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
- pymssql>=2.1.5,!=2.2.8
- psycopg2>=2.9.3
- fastparquet>=0.8.1
- psycopg2>=2.9.6
- fastparquet>=2022.12.0
- tqdm>=4.60.0
# pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
- numexpr<2.8.5
Expand Down Expand Up @@ -64,8 +64,7 @@ dependencies:
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
- fuzzydata>=0.0.6
- fuzzydata>=0.0.11
# Fixes breaking ipywidgets changes, but a new version hasn't been released yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
Expand Down
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
modin[dask]
Expand Down
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
tqdm>=4.60.0
Expand Down
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- pip
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- jupyterlab
- ipywidgets
- modin-mpi
Expand Down
4 changes: 4 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/__init__.py
Expand Up @@ -19,10 +19,12 @@
from .datetime import DateTimeDefault
from .default import DefaultMethod
from .groupby import GroupByDefault, SeriesGroupByDefault
from .list import ListDefault
from .resample import ResampleDefault
from .rolling import ExpandingDefault, RollingDefault
from .series import SeriesDefault
from .str import StrDefault
from .struct import StructDefault

__all__ = [
"DataFrameDefault",
Expand All @@ -37,4 +39,6 @@
"CatDefault",
"GroupByDefault",
"SeriesGroupByDefault",
"ListDefault",
"StructDefault",
]
35 changes: 35 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/list.py
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-list accessor functions builder class."""

from .series import SeriesDefault


class ListDefault(SeriesDefault):
    """Builder for default-to-pandas methods which are executed under the list accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get the list accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame
            One-column frame; its single column is expected to hold
            pyarrow-backed list values (otherwise accessing ``.list`` raises).

        Returns
        -------
        pandas.core.arrays.arrow.ListAccessor
        """
        # Collapse the one-column frame to a Series, then expose its
        # pyarrow list accessor for the default-to-pandas method call.
        single_column = df.squeeze(axis=1)
        return single_column.list
35 changes: 35 additions & 0 deletions modin/core/dataframe/algebra/default2pandas/struct.py
@@ -0,0 +1,35 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Module houses default applied-on-struct accessor functions builder class."""

from .series import SeriesDefault


class StructDefault(SeriesDefault):
    """Builder for default-to-pandas methods which are executed under the struct accessor."""

    @classmethod
    def frame_wrapper(cls, df):
        """
        Get the struct accessor of the passed frame.

        Parameters
        ----------
        df : pandas.DataFrame
            One-column frame; its single column is expected to hold
            pyarrow-backed struct values (otherwise accessing ``.struct`` raises).

        Returns
        -------
        pandas.core.arrays.arrow.StructAccessor
        """
        # Collapse the one-column frame to a Series, then expose its
        # pyarrow struct accessor for the default-to-pandas method call.
        single_column = df.squeeze(axis=1)
        return single_column.struct
4 changes: 4 additions & 0 deletions modin/core/io/io.py
Expand Up @@ -478,6 +478,8 @@ def read_fwf(
widths=None,
infer_nrows=100,
dtype_backend=no_default,
iterator=False,
chunksize=None,
**kwds,
): # noqa: PR01
ErrorMessage.default_to_pandas("`read_fwf`")
Expand All @@ -487,6 +489,8 @@ def read_fwf(
widths=widths,
infer_nrows=infer_nrows,
dtype_backend=dtype_backend,
iterator=iterator,
chunksize=chunksize,
**kwds,
)
if isinstance(pd_obj, pandas.DataFrame):
Expand Down
91 changes: 91 additions & 0 deletions modin/core/storage_formats/base/query_compiler.py
Expand Up @@ -35,11 +35,13 @@
DateTimeDefault,
ExpandingDefault,
GroupByDefault,
ListDefault,
ResampleDefault,
RollingDefault,
SeriesDefault,
SeriesGroupByDefault,
StrDefault,
StructDefault,
)
from modin.error_message import ErrorMessage
from modin.logging import ClassLogger
Expand Down Expand Up @@ -6563,6 +6565,88 @@ def cat_codes(self):

# End of Categories methods

# List accessor's methods

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.flatten")
def list_flatten(self):
    """
    Flatten list values.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.flatten through the list accessor.
    flattener = ListDefault.register(pandas.Series.list.flatten)
    return flattener(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.len")
def list_len(self):
    """
    Return the length of each list in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.len through the list accessor.
    length_getter = ListDefault.register(pandas.Series.list.len)
    return length_getter(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.list.__getitem__")
def list__getitem__(self, key):  # noqa: PR01
    """
    Index or slice lists in the Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.list.__getitem__ through the list accessor.
    item_getter = ListDefault.register(pandas.Series.list.__getitem__)
    return item_getter(self, key=key)

# End of List accessor's methods

# Struct accessor's methods

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.dtypes")
def struct_dtypes(self):
    """
    Return the dtype object of each child field of the struct.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: read Series.struct.dtypes through the struct accessor.
    dtypes_getter = StructDefault.register(pandas.Series.struct.dtypes)
    return dtypes_getter(self)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.field")
def struct_field(self, name_or_index):  # noqa: PR01
    """
    Extract a child field of a struct as a Series.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.struct.field through the struct accessor.
    field_getter = StructDefault.register(pandas.Series.struct.field)
    return field_getter(self, name_or_index=name_or_index)

@doc_utils.add_one_column_warning
@doc_utils.add_refer_to("Series.struct.explode")
def struct_explode(self):
    """
    Extract all child fields of a struct as a DataFrame.

    Returns
    -------
    BaseQueryCompiler
    """
    # Default to pandas: run Series.struct.explode through the struct accessor.
    exploder = StructDefault.register(pandas.Series.struct.explode)
    return exploder(self)

# End of Struct accessor's methods

# DataFrame methods

def invert(self):
Expand Down Expand Up @@ -6617,6 +6701,13 @@ def compare(self, other, align_axis, keep_shape, keep_equal, result_names):
result_names=result_names,
)

@doc_utils.add_refer_to("Series.case_when")
def case_when(self, caselist):  # noqa: PR01, RT01, D200
    """
    Replace values where the conditions are True.
    """
    # Default to pandas: Series.case_when was introduced in pandas 2.2.
    replacer = SeriesDefault.register(pandas.Series.case_when)
    return replacer(self, caselist=caselist)

def repartition(self, axis=None):
"""
Repartitioning QueryCompiler objects to get ideal partitions inside.
Expand Down
Expand Up @@ -142,7 +142,10 @@ def get_col_names():
kwargs["filepath_or_buffer"], nrows=0, engine="c"
).columns.tolist()

if dtype := kwargs["dtype"]:
dtype = kwargs["dtype"]
# For details: https://github.com/pandas-dev/pandas/issues/57024
entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
if dtype:
if isinstance(dtype, dict):
column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
else:
Expand All @@ -151,7 +154,9 @@ def get_col_names():
else:
column_types = {}

if parse_dates := kwargs["parse_dates"]:
if parse_dates := (
None if entire_dataframe_dtype else kwargs["parse_dates"]
):
# Either list of column names or list of column indices is supported.
if isinstance(parse_dates, list) and (
all(isinstance(col, str) for col in parse_dates)
Expand Down Expand Up @@ -185,7 +190,7 @@ def get_col_names():
usecols_md = cls._prepare_pyarrow_usecols(kwargs)

po = ParseOptions(
delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we leave the previous code?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lib.no_default is equivalent to a False value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite. I see, we can leave the new change as is.

from pandas._libs import lib

a = lib.no_default

def f(a):
    if a:
        print("A")
    else:
        print("B")

f(a)
# A

Copy link
Collaborator

@YarShev YarShev Feb 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, maybe we should make this condition (and all related ones) more explicit?

delimiter="\\s+" if kwargs["delim_whitespace"] and kwargs["delim_whitespace"] is not lib.no_default else delimiter

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite

Let me clarify. In an if condition, lib.no_default is truthy (equivalent to True), but for this particular pandas method it is treated as a False value.

Although, maybe we should make this condition (and all related ones) more explicit?

It seems to me that this is an additional burden on the reader of the code.

quote_char=kwargs["quotechar"],
double_quote=kwargs["doublequote"],
escape_char=kwargs["escapechar"],
Expand Down Expand Up @@ -426,7 +431,7 @@ def _read_csv_check_support(
False,
f"read_csv with 'arrow' engine doesn't support {arg} parameter",
)
if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
raise ValueError(
"Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
)
Expand Down Expand Up @@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
if delim_whitespace is True and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
+ "delim_whitespace=True; you can only specify one."
Expand Down
Expand Up @@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
# TODO: make sure we can ignore this warning
or "Frame contain columns with unsupported data-types"
in message
# Looks like the warning comes from pyarrow, more details:
# https://github.com/pandas-dev/pandas/pull/52419
or "Passing a BlockManager to DataFrame is deprecated"
in message
):
continue
assert (
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/core/io/sql/utils.py
Expand Up @@ -15,7 +15,7 @@

import pandas
import pandas._libs.lib as lib
from sqlalchemy import MetaData, Table, create_engine, inspect
from sqlalchemy import MetaData, Table, create_engine, inspect, text

from modin.core.storage_formats.pandas.parsers import _split_result_for_readers

Expand Down Expand Up @@ -167,9 +167,9 @@ def get_query_columns(engine, query):
Dictionary with columns names and python types.
"""
con = engine.connect()
result = con.execute(query).fetchone()
values = list(result)
result = con.execute(text(query))
YarShev marked this conversation as resolved.
Show resolved Hide resolved
cols_names = list(result.keys())
values = list(result.first())
cols = dict()
for i in range(len(cols_names)):
cols[cols_names[i]] = type(values[i]).__name__
Expand Down