Skip to content

Commit

Permalink
Backport PR #51542 on branch 2.0.x (API: ArrowExtensionArray.value_counts returns pyarrow.int64 type) (#51620)
Browse files Browse the repository at this point in the history

Backport PR #51542: API: ArrowExtensionArray.value_counts returns pyarrow.int64 type

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and mroeschke committed Feb 25, 2023
1 parent 42c3ab1 commit e6255d8
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 75 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,7 @@ Other API changes
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)

.. note::

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,9 @@ def value_counts(
result.name = name
result.index.name = index_name
counts = result._values
if not isinstance(counts, np.ndarray):
# e.g. ArrowExtensionArray
counts = np.asarray(counts)

elif isinstance(values, ABCMultiIndex):
# GH49558
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,12 +983,11 @@ def value_counts(self, dropna: bool = True) -> Series:
if pa.types.is_duration(pa_type):
values = values.cast(pa_type)

# No missing values so we can adhere to the interface and return a numpy array.
counts = np.array(counts)
counts = ArrowExtensionArray(counts)

index = Index(type(self)(values))

return Series(counts, index=index, name="count").astype("Int64")
return Series(counts, index=index, name="count")

@classmethod
def _concat_same_type(
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,20 +453,28 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2):


def test_value_counts_na(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "int64[pyarrow]"
else:
exp_dtype = "Int64"
arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype)
result = arr.value_counts(dropna=False)
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype="Int64", name="count")
expected = pd.Series([2, 1, 1], index=arr[[0, 1, 3]], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)

result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=arr[:2], dtype="Int64", name="count")
expected = pd.Series([2, 1], index=arr[:2], dtype=exp_dtype, name="count")
tm.assert_series_equal(result, expected)


def test_value_counts_with_normalize(dtype):
if getattr(dtype, "storage", "") == "pyarrow":
exp_dtype = "double[pyarrow]"
else:
exp_dtype = "Float64"
ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype)
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
expected = pd.Series([2, 1], index=ser[:2], dtype=exp_dtype, name="proportion") / 3
tm.assert_series_equal(result, expected)


Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ def test_value_counts(index_or_series_obj):
expected.index.name = obj.name

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")

# TODO(GH#32514): Order of entries with the same count is inconsistent
# on CI (gh-32449)
Expand Down Expand Up @@ -90,8 +93,11 @@ def test_value_counts_null(null_obj, index_or_series_obj):
result = result.sort_index()

if not isinstance(result.dtype, np.dtype):
# i.e IntegerDtype
expected = expected.astype("Int64")
if getattr(obj.dtype, "storage", "") == "pyarrow":
expected = expected.astype("int64[pyarrow]")
else:
# i.e IntegerDtype
expected = expected.astype("Int64")
tm.assert_series_equal(result, expected)

expected[null_obj] = 3
Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,9 +793,25 @@ def test_diff(self, data, periods, request):
)
super().test_diff(data, periods)

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
super().test_value_counts(all_data, dropna)
def test_value_counts_returns_pyarrow_int64(self, data):
# GH 51462
data = data[:10]
result = data.value_counts()
assert result.dtype == ArrowDtype(pa.int64())

def test_value_counts_with_normalize(self, data, request):
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)

result = ser.value_counts(normalize=True).sort_index()

expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
expected = expected.astype("double[pyarrow]")

self.assert_series_equal(result, expected)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
Expand Down
73 changes: 11 additions & 62 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0
from pandas.errors import PerformanceWarning

import pandas as pd
Expand Down Expand Up @@ -196,70 +195,20 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):


class TestMethods(base.BaseMethodsTests):
def test_argsort(self, data_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort(data_for_sorting)
def test_value_counts_with_normalize(self, data):
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)

def test_argsort_missing(self, data_missing_for_sorting):
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
check_stacklevel=False,
):
super().test_argsort_missing(data_missing_for_sorting)

def test_argmin_argmax(
self, data_for_sorting, data_missing_for_sorting, na_value, request
):
super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)

@pytest.mark.parametrize(
"op_name, skipna, expected",
[
("idxmax", True, 0),
("idxmin", True, 2),
("argmax", True, 0),
("argmin", True, 2),
("idxmax", False, np.nan),
("idxmin", False, np.nan),
("argmax", False, -1),
("argmin", False, -1),
],
)
def test_argreduce_series(
self, data_missing_for_sorting, op_name, skipna, expected, request
):
super().test_argreduce_series(
data_missing_for_sorting, op_name, skipna, expected
)
result = ser.value_counts(normalize=True).sort_index()

@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
all_data = all_data[:10]
if dropna:
other = all_data[~all_data.isna()]
expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
if getattr(data.dtype, "storage", "") == "pyarrow":
expected = expected.astype("double[pyarrow]")
else:
other = all_data
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(all_data.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
with tm.maybe_produces_warning(
PerformanceWarning,
pa_version_under7p0
and getattr(other.dtype, "storage", "") == "pyarrow"
and not (dropna and "data_missing" in request.node.nodeid),
):
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
expected = expected.astype("Float64")

self.assert_series_equal(result, expected)

Expand Down

0 comments on commit e6255d8

Please sign in to comment.