From a2bce66d04ed2addfb9782f0e824c60bc0b1b449 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Apr 2024 05:28:46 -0700 Subject: [PATCH] REF: move MaskedArray subclass attributes to dtypes (#58423) --- pandas/_libs/lib.pyx | 4 +-- pandas/core/arrays/boolean.py | 10 ++---- pandas/core/arrays/floating.py | 10 ++---- pandas/core/arrays/integer.py | 10 ++---- pandas/core/arrays/masked.py | 60 +++++++++++++--------------------- pandas/core/arrays/numeric.py | 2 +- pandas/core/dtypes/dtypes.py | 20 ++++++++++++ 7 files changed, 53 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5b6d83ba8e9ee..4fd68a1593e49 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects, from pandas.core.arrays import IntegerArray # Set these values to 1 to be deterministic, match - # IntegerArray._internal_fill_value + # IntegerDtype._internal_fill_value result[mask] = 1 result = IntegerArray(result, mask) elif result is floats and convert_to_nullable_dtype: from pandas.core.arrays import FloatingArray # Set these values to 1.0 to be deterministic, match - # FloatingArray._internal_fill_value + # FloatingDtype._internal_fill_value result[mask] = 1.0 result = FloatingArray(result, mask) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 813b10eef5e4b..a326925545045 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -68,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype): name: ClassVar[str] = "boolean" + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" @property @@ -293,13 +296,6 @@ class BooleanArray(BaseMaskedArray): Length: 3, dtype: boolean """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = False - # Fill values used for any/all - # Incompatible types in assignment (expression has type "bool", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = True # type: ignore[assignment] - _falsey_value = False # type: ignore[assignment] _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 653e63e9d1e2d..b3fbf0f92c32d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -23,6 +23,8 @@ class FloatingDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = np.nan _default_np_dtype = np.dtype(np.float64) _checker = is_float_dtype @@ -113,14 +115,6 @@ class FloatingArray(NumericArray): _dtype_cls = FloatingDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = np.nan - # Fill values used for any/all - # Incompatible types in assignment (expression has type "float", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1.0 # type: ignore[assignment] - _falsey_value = 0.0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index dc453f3e37c50..21a9b09227663 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -23,6 +23,8 @@ class IntegerDtype(NumericDtype): The attributes name & type are set when these subclasses are created. """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 _default_np_dtype = np.dtype(np.int64) _checker = is_integer_dtype @@ -128,14 +130,6 @@ class IntegerArray(NumericArray): _dtype_cls = IntegerDtype - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = 1 - # Fill values used for any/all - # Incompatible types in assignment (expression has type "int", base class - # "BaseMaskedArray" defined the type as "") - _truthy_value = 1 # type: ignore[assignment] - _falsey_value = 0 # type: ignore[assignment] - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 190888d281ea9..df794183f67d1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,6 +5,7 @@ Any, Callable, Literal, + cast, overload, ) import warnings @@ -16,22 +17,6 @@ missing as libmissing, ) from pandas._libs.tslibs import is_supported_dtype -from pandas._typing import ( - ArrayLike, - AstypeArg, - AxisInt, - DtypeObj, - FillnaOptions, - InterpolateOptions, - NpDtype, - PositionalIndexer, - Scalar, - ScalarIndexer, - Self, - SequenceIndexer, - Shape, - npt, -) from pandas.compat import ( IS64, is_platform_windows, @@ -97,6 +82,20 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + ArrayLike, + AstypeArg, + AxisInt, + DtypeObj, + FillnaOptions, + InterpolateOptions, + NpDtype, + PositionalIndexer, + Scalar, + ScalarIndexer, + Self, + SequenceIndexer, + Shape, + npt, ) from pandas._libs.missing import NAType from pandas.core.arrays import FloatingArray @@ -111,16 +110,10 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): numpy based """ - # The value used to fill '_data' to avoid upcasting - _internal_fill_value: Scalar # our underlying data and mask are each ndarrays _data: np.ndarray _mask: npt.NDArray[np.bool_] - # Fill values used for any/all - _truthy_value = Scalar # bool(_truthy_value) = True - _falsey_value = Scalar # bool(_falsey_value) = False - @classmethod def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self: result = BaseMaskedArray.__new__(cls) @@ -155,8 +148,9 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self: @classmethod @doc(ExtensionArray._empty) def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self: - values = np.empty(shape, dtype=dtype.type) - values.fill(cls._internal_fill_value) + dtype = cast(BaseMaskedDtype, dtype) + values: np.ndarray = np.empty(shape, dtype=dtype.type) + values.fill(dtype._internal_fill_value) mask = np.ones(shape, dtype=bool) result = cls(values, mask) if not isinstance(result, cls) or dtype != result.dtype: @@ -917,7 +911,9 @@ def take( ) -> Self: # we always fill with 1 internally # to avoid upcasting - data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + data_fill_value = ( + self.dtype._internal_fill_value if isna(fill_value) else fill_value + ) result = take( self._data, indexer, @@ -1397,12 +1393,7 @@ def any( nv.validate_any((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._falsey_value) result = values.any() if skipna: return result @@ -1490,12 +1481,7 @@ def all( nv.validate_all((), kwargs) values = self._data.copy() - # error: Argument 3 to "putmask" has incompatible type "object"; - # expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], - # bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self.dtype._truthy_value) result = values.all(axis=axis) if skipna: diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index fe7b32ec9652e..c5e9ed8698ffe 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -221,7 +221,7 @@ def _coerce_to_data_and_mask( # we copy as need to coerce here if mask.any(): values = values.copy() - values[mask] = cls._internal_fill_value + values[mask] = dtype_cls._internal_fill_value if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into a float diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 778b6bd6f3f18..8c64a38bc1be3 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -79,6 +79,7 @@ DtypeObj, IntervalClosedType, Ordered, + Scalar, Self, npt, type_t, @@ -1551,6 +1552,25 @@ class BaseMaskedDtype(ExtensionDtype): base = None type: type + _internal_fill_value: Scalar + + @property + def _truthy_value(self): + # Fill values used for 'any' + if self.kind == "f": + return 1.0 + if self.kind in "iu": + return 1 + return True + + @property + def _falsey_value(self): + # Fill values used for 'all' + if self.kind == "f": + return 0.0 + if self.kind in "iu": + return 0 + return False @property def na_value(self) -> libmissing.NAType: