Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#6906: Update to pandas 2.2.* #6907

Merged
merged 27 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/supported_apis/utilities_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ contributing a distributed version of any of these objects, feel free to open a
* DateOffset
* ExcelWriter
* SparseArray
* SparseSeries
* SparseDataFrame

.. _open an issue: https://github.com/modin-project/modin/issues
.. _pull request: https://github.com/modin-project/modin/pulls
Expand Down
29 changes: 14 additions & 15 deletions environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ dependencies:
- pip

# required dependencies
- pandas>=2.1,<2.2
- pandas>=2.2,<2.3
- numpy>=1.22.4
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- packaging>=21.0
- psutil>=5.8.0

Expand All @@ -20,21 +20,21 @@ dependencies:
- grpcio!=1.46.*
- dask>=2.22.0
- distributed>=2.22.0
- xarray>=2022.03.0
- xarray>=2022.12.0
- jinja2>=3.1.2
- scipy>=1.8.1
- s3fs>=2022.05.0
- lxml>=4.8.0
- openpyxl>=3.0.10
- scipy>=1.10.0
- s3fs>=2022.11.0
- lxml>=4.9.2
- openpyxl>=3.1.0
- xlrd>=2.0.1
- matplotlib>=3.6.1
- sqlalchemy>=1.4.0,<1.4.46
- pandas-gbq>=0.15.0
- pytables>=3.7.0
- matplotlib>=3.6.3
- sqlalchemy>=2.0.0
- pandas-gbq>=0.19.0
- pytables>=3.8.0
# pymssql==2.2.8 broken: https://github.com/modin-project/modin/issues/6429
- pymssql>=2.1.5,!=2.2.8
- psycopg2>=2.9.3
- fastparquet>=0.8.1
- psycopg2>=2.9.6
- fastparquet>=2022.12.0
- tqdm>=4.60.0
# pandas isn't compatible with numexpr=2.8.5: https://github.com/modin-project/modin/issues/6469
- numexpr<2.8.5
Expand Down Expand Up @@ -64,8 +64,7 @@ dependencies:
- asv==0.5.1
# no conda package for windows so we install it with pip
- connectorx>=0.2.6a4
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
- fuzzydata>=0.0.6
- fuzzydata>=0.0.11
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
modin[dask]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
fsspec>=2022.05.0
fsspec>=2022.11.0
jupyterlab
ipywidgets
tqdm>=4.60.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- pip
- fsspec>=2022.05.0
- fsspec>=2022.11.0
- jupyterlab
- ipywidgets
- modin-mpi
Expand Down
4 changes: 4 additions & 0 deletions modin/core/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,8 @@ def read_fwf(
widths=None,
infer_nrows=100,
dtype_backend=no_default,
iterator=False,
chunksize=None,
**kwds,
): # noqa: PR01
ErrorMessage.default_to_pandas("`read_fwf`")
Expand All @@ -487,6 +489,8 @@ def read_fwf(
widths=widths,
infer_nrows=infer_nrows,
dtype_backend=dtype_backend,
iterator=iterator,
chunksize=chunksize,
**kwds,
)
if isinstance(pd_obj, pandas.DataFrame):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,10 @@ def get_col_names():
kwargs["filepath_or_buffer"], nrows=0, engine="c"
).columns.tolist()

if dtype := kwargs["dtype"]:
dtype = kwargs["dtype"]
# For details: https://github.com/pandas-dev/pandas/issues/57024
entire_dataframe_dtype = dtype is not None and not isinstance(dtype, dict)
if dtype:
if isinstance(dtype, dict):
column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
else:
Expand All @@ -151,7 +154,9 @@ def get_col_names():
else:
column_types = {}

if parse_dates := kwargs["parse_dates"]:
if parse_dates := (
None if entire_dataframe_dtype else kwargs["parse_dates"]
):
# Either list of column names or list of column indices is supported.
if isinstance(parse_dates, list) and (
all(isinstance(col, str) for col in parse_dates)
Expand Down Expand Up @@ -185,7 +190,7 @@ def get_col_names():
usecols_md = cls._prepare_pyarrow_usecols(kwargs)

po = ParseOptions(
delimiter="\\s+" if kwargs["delim_whitespace"] else delimiter,
delimiter="\\s+" if kwargs["delim_whitespace"] is True else delimiter,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we leave the previous code?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lib.no_default is equivalent to a False value

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite. I see, we can leave the new change as is.

from pandas._libs import lib

a = lib.no_default

def f(a):
    if a:
        print("A")
    else:
        print("B")

f(a)
# A

Copy link
Collaborator

@YarShev YarShev Feb 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although, maybe we should make this condition (and all related ones) more explicit?

delimiter="\\s+" if kwargs["delim_whitespace"] and kwargs["delim_whitespace"] is not lib.no_default else delimiter

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably mean the opposite

Let me clarify: lib.no_default is equivalent to a True value in an if condition, but in pandas, for this particular method, it is treated as a False value.

Although, maybe we should make this condition (and all related ones) more explicit?

It seems to me that this would place an additional burden on the reader of the code.

quote_char=kwargs["quotechar"],
double_quote=kwargs["doublequote"],
escape_char=kwargs["escapechar"],
Expand Down Expand Up @@ -426,7 +431,7 @@ def _read_csv_check_support(
False,
f"read_csv with 'arrow' engine doesn't support {arg} parameter",
)
if delimiter is not None and read_csv_kwargs["delim_whitespace"]:
if delimiter is not None and read_csv_kwargs["delim_whitespace"] is True:
raise ValueError(
"Specified a delimiter with both sep and delim_whitespace=True; you can only specify one."
)
Expand Down Expand Up @@ -541,7 +546,7 @@ def _validate_read_csv_kwargs(
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
if delim_whitespace is True and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
+ "delim_whitespace=True; you can only specify one."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1440,6 +1440,10 @@ def applier(df, **kwargs):
# TODO: make sure we can ignore this warning
or "Frame contain columns with unsupported data-types"
in message
# Looks like the warning comes from pyarrow, more details:
# https://github.com/pandas-dev/pandas/pull/52419
or "Passing a BlockManager to DataFrame is deprecated"
in message
):
continue
assert (
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/core/io/sql/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import pandas
import pandas._libs.lib as lib
from sqlalchemy import MetaData, Table, create_engine, inspect
from sqlalchemy import MetaData, Table, create_engine, inspect, text

from modin.core.storage_formats.pandas.parsers import _split_result_for_readers

Expand Down Expand Up @@ -167,9 +167,9 @@ def get_query_columns(engine, query):
Dictionary with columns names and python types.
"""
con = engine.connect()
result = con.execute(query).fetchone()
values = list(result)
result = con.execute(text(query))
YarShev marked this conversation as resolved.
Show resolved Hide resolved
cols_names = list(result.keys())
values = list(result.first())
cols = dict()
for i in range(len(cols_names)):
cols[cols_names[i]] = type(values[i]).__name__
Expand Down
6 changes: 3 additions & 3 deletions modin/experimental/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,11 @@ def parser_func(
na_values=None,
keep_default_na=True,
na_filter=True,
verbose=False,
verbose=lib.no_default,
skip_blank_lines=True,
parse_dates=None,
infer_datetime_format=lib.no_default,
keep_date_col=False,
keep_date_col=lib.no_default,
date_parser=lib.no_default,
date_format=None,
dayfirst=False,
Expand All @@ -225,7 +225,7 @@ def parser_func(
dialect=None,
on_bad_lines="error",
doublequote=True,
delim_whitespace=False,
delim_whitespace=lib.no_default,
low_memory=True,
memory_map=False,
float_precision=None,
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import pandas
from packaging import version

__pandas_version__ = "2.1"
__pandas_version__ = "2.2"

if (
version.parse(pandas.__version__).release[:2]
Expand Down
56 changes: 41 additions & 15 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def _binary_op(self, op, other, **kwargs):
new_query_compiler = getattr(self._query_compiler, op)(other, **kwargs)
return self._create_or_update_from_compiler(new_query_compiler)

def _default_to_pandas(self, op, *args, **kwargs):
def _default_to_pandas(self, op, *args, reason: str = None, **kwargs):
"""
Convert dataset to pandas type and call a pandas function on it.

Expand All @@ -481,6 +481,7 @@ def _default_to_pandas(self, op, *args, **kwargs):
Name of pandas function.
*args : list
Additional positional arguments to be passed to `op`.
reason : str, optional
**kwargs : dict
Additional keywords arguments to be passed to `op`.

Expand All @@ -495,7 +496,8 @@ def _default_to_pandas(self, op, *args, **kwargs):
type(self).__name__,
op if isinstance(op, str) else op.__name__,
empty_self_str,
)
),
reason=reason,
)

args = try_cast_to_pandas(args)
Expand All @@ -520,15 +522,7 @@ def _default_to_pandas(self, op, *args, **kwargs):
failure_condition=True,
extra_log="{} is an unsupported operation".format(op),
)
# SparseDataFrames cannot be serialized by arrow and cause problems for Modin.
# For now we will use pandas.
if isinstance(result, type(self)) and not isinstance(
result, (pandas.SparseDataFrame, pandas.SparseSeries)
):
return self._create_or_update_from_compiler(
result, inplace=kwargs.get("inplace", False)
)
elif isinstance(result, pandas.DataFrame):
if isinstance(result, pandas.DataFrame):
Dismissed Show dismissed Hide dismissed
from .dataframe import DataFrame

return DataFrame(result)
Expand Down Expand Up @@ -1106,11 +1100,27 @@ def _deprecate_downcast(self, downcast, method_name: str):
return downcast

def bfill(
self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
self,
*,
axis=None,
inplace=False,
limit=None,
limit_area=None,
downcast=lib.no_default,
): # noqa: PR01, RT01, D200
"""
Synonym for `DataFrame.fillna` with ``method='bfill'``.
"""
if limit_area is not None:
anmyachev marked this conversation as resolved.
Show resolved Hide resolved
return self._default_to_pandas(
"bfill",
reason="'limit_area' parameter isn't supported",
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
downcast=downcast,
)
downcast = self._deprecate_downcast(downcast, "bfill")
with warnings.catch_warnings():
warnings.filterwarnings(
Expand Down Expand Up @@ -1599,11 +1609,27 @@ def expanding(
)

def ffill(
self, *, axis=None, inplace=False, limit=None, downcast=lib.no_default
self,
*,
axis=None,
inplace=False,
limit=None,
limit_area=None,
downcast=lib.no_default,
): # noqa: PR01, RT01, D200
"""
Synonym for `DataFrame.fillna` with ``method='ffill'``.
"""
if limit_area is not None:
return self._default_to_pandas(
"ffill",
reason="'limit_area' parameter isn't supported",
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
downcast=downcast,
)
downcast = self._deprecate_downcast(downcast, "ffill")
with warnings.catch_warnings():
warnings.filterwarnings(
Expand Down Expand Up @@ -2489,8 +2515,8 @@ def resample(
axis: Axis = lib.no_default,
closed: Optional[str] = None,
label: Optional[str] = None,
convention: str = "start",
kind: Optional[str] = None,
convention: str = lib.no_default,
kind: Optional[str] = lib.no_default,
on: Level = None,
level: Level = None,
origin: Union[str, TimestampConvertibleTypes] = "start_day",
Expand Down
26 changes: 23 additions & 3 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,12 +394,14 @@ def apply(
result_type=None,
args=(),
by_row="compat",
engine="python",
engine_kwargs=None,
**kwargs,
): # noqa: PR01, RT01, D200
"""
Apply a function along an axis of the ``DataFrame``.
"""
if by_row != "compat":
if by_row != "compat" or engine != "python" or engine_kwargs:
# TODO: add test
return self._default_to_pandas(
pandas.DataFrame.apply,
Expand All @@ -409,6 +411,8 @@ def apply(
result_type=result_type,
args=args,
by_row=by_row,
engine=engine,
engine_kwargs=engine_kwargs,
**kwargs,
)

Expand Down Expand Up @@ -1446,7 +1450,7 @@ def pivot_table(
margins=False,
dropna=True,
margins_name="All",
observed=False,
observed=lib.no_default,
sort=True,
): # noqa: PR01, RT01, D200
"""
Expand Down Expand Up @@ -1631,7 +1635,23 @@ def _get_axis_resolvers(self, axis: str) -> dict:
d[axis] = dindex
return d

_get_cleaned_column_resolvers = pandas.DataFrame._get_cleaned_column_resolvers
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: # noqa: RT01
"""
Return the special character free column resolvers of a dataframe.

Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in `DataFrame.eval`.

Notes
-----
Copied from pandas.
"""
from pandas.core.computation.parsing import clean_column_name
YarShev marked this conversation as resolved.
Show resolved Hide resolved

return {
clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
}

def query(self, expr, inplace=False, **kwargs): # noqa: PR01, RT01, D200
"""
Expand Down