Skip to content

Commit

Permalink
Backport PR #53001 on branch 2.0.x (BUG: Series.describe treating pya…
Browse files Browse the repository at this point in the history
…rrow timestamps/timedeltas as categorical) (#53031)

* Backport PR #53001: BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical

* clean
  • Loading branch information
lukemanley committed May 2, 2023
1 parent fbbdac5 commit 9135c3a
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Bug fixes
- Bug in :func:`api.interchange.from_dataframe` returning :class:`DataFrame` objects of incorrect size when called on slices (:issue:`52824`)
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
- Bug in :meth:`DataFrame.convert_dtypes` ignoring ``convert_*`` keywords set to ``False`` when ``dtype_backend="pyarrow"`` (:issue:`52872`)
- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
-

Expand Down
15 changes: 9 additions & 6 deletions pandas/core/methods/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@
from pandas.core.dtypes.common import (
is_bool_dtype,
is_complex_dtype,
is_datetime64_any_dtype,
is_extension_array_dtype,
is_numeric_dtype,
is_timedelta64_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype

from pandas.core.arrays.arrow.dtype import ArrowDtype
from pandas.core.arrays.floating import Float64Dtype
Expand Down Expand Up @@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
dtype: DtypeObj | None
if is_extension_array_dtype(series):
if isinstance(series.dtype, ArrowDtype):
import pyarrow as pa
if series.dtype.kind == "m":
# GH53001: describe timedeltas with object dtype
dtype = None
else:
import pyarrow as pa

dtype = ArrowDtype(pa.float64())
dtype = ArrowDtype(pa.float64())
else:
dtype = Float64Dtype()
elif is_numeric_dtype(series) and not is_complex_dtype(series):
Expand Down Expand Up @@ -362,9 +365,9 @@ def select_describe_func(
return describe_categorical_1d
elif is_numeric_dtype(data):
return describe_numeric_1d
elif is_datetime64_any_dtype(data.dtype):
elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
return describe_timestamp_1d
elif is_timedelta64_dtype(data.dtype):
elif data.dtype.kind == "m":
return describe_numeric_1d
else:
return describe_categorical_1d
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2658,6 +2658,36 @@ def test_describe_numeric_data(pa_type):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
def test_describe_timedelta_data(pa_type):
    """Check ``Series.describe`` on a pyarrow-backed timedelta Series.

    The Series holds the integers 1 through 9 interpreted in the unit of
    ``pa_type``. The expected result is an object-dtype Series whose first
    entry is the count and whose remaining entries are timedeltas expressed
    in that same unit.
    """
    # GH53001
    ser = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
    result = ser.describe()

    stat_labels = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    # Every statistic after the count is expected back as a timedelta.
    stat_values = pd.to_timedelta(
        [5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit
    ).tolist()
    expected = pd.Series([9, *stat_values], index=stat_labels, dtype=object)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
def test_describe_datetime_data(pa_type):
    """Check ``Series.describe`` on a pyarrow-backed timestamp Series.

    The Series holds the integers 1 through 9 interpreted with the unit and
    timezone of ``pa_type``. The expected result is an object-dtype Series
    with the count followed by timestamp statistics; no ``std`` entry is
    expected.
    """
    # GH53001
    ser = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
    result = ser.describe()

    stat_labels = ["count", "mean", "min", "25%", "50%", "75%", "max"]
    # Start with the count, then append one Timestamp per remaining statistic.
    expected_values = [9]
    for raw in (5, 1, 3, 5, 7, 9):
        expected_values.append(pd.Timestamp(raw, tz=pa_type.tz, unit=pa_type.unit))
    expected = pd.Series(expected_values, index=stat_labels, dtype=object)

    tm.assert_series_equal(result, expected)


@pytest.mark.xfail(
pa_version_under8p0,
reason="Function 'add_checked' has no kernel matching input types",
Expand Down

0 comments on commit 9135c3a

Please sign in to comment.