Skip to content

Commit

Permalink
PERF: dtype checks (#52766)
Browse files (browse the repository at this point in the history)
* PERF: dtype checks

* mypy fixup
  • Loading branch information
jbrockmendel committed Apr 18, 2023
1 parent c85bbc6 commit d91b04c
Show file tree
Hide file tree
Showing 15 changed files with 67 additions and 79 deletions.
3 changes: 1 addition & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from pandas.core.dtypes.common import (
is_bool,
is_extension_array_dtype,
is_integer_dtype,
is_number,
is_numeric_dtype,
Expand Down Expand Up @@ -316,7 +315,7 @@ def _get_ilevel_values(index, level):
if not left.equals(right):
mismatch = left._values != right._values

if is_extension_array_dtype(mismatch):
if not isinstance(mismatch, np.ndarray):
mismatch = cast("ExtensionArray", mismatch).fillna(True)

diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
Expand Down
7 changes: 3 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_signed_integer_dtype,
Expand Down Expand Up @@ -471,7 +470,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:

if (
len(values) > 0
and is_numeric_dtype(values.dtype)
and values.dtype.kind in "iufcb"
and not is_signed_integer_dtype(comps)
):
# GH#46485 Use object to avoid upcast to float64 later
Expand Down Expand Up @@ -1403,7 +1402,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
)

is_timedelta = False
if needs_i8_conversion(arr.dtype):
if arr.dtype.kind in "mM":
dtype = np.int64
arr = arr.view("i8")
na = iNaT
Expand All @@ -1413,7 +1412,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
# We have to cast in order to be able to hold np.nan
dtype = np.object_

elif is_integer_dtype(dtype):
elif dtype.kind in "iu":
# We have to cast in order to be able to hold np.nan

# int8, int16 are incompatible with float64,
Expand Down
9 changes: 4 additions & 5 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
is_bool_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_hashable,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -618,7 +617,7 @@ def _from_inferred_categories(

if known_categories:
# Convert to a specialized type with `dtype` if specified.
if is_any_real_numeric_dtype(dtype.categories):
if is_any_real_numeric_dtype(dtype.categories.dtype):
cats = to_numeric(inferred_categories, errors="coerce")
elif lib.is_np_dtype(dtype.categories.dtype, "M"):
cats = to_datetime(inferred_categories, errors="coerce")
Expand Down Expand Up @@ -701,7 +700,7 @@ def from_codes(
)
raise ValueError(msg)

if is_extension_array_dtype(codes) and is_integer_dtype(codes):
if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
# Avoid the implicit conversion of Int to object
if isna(codes).any():
raise ValueError("codes cannot contain NA values")
Expand Down Expand Up @@ -1598,7 +1597,7 @@ def _internal_get_values(self):
# if we are a datetime and period index, return Index to keep metadata
if needs_i8_conversion(self.categories.dtype):
return self.categories.take(self._codes, fill_value=NaT)
elif is_integer_dtype(self.categories) and -1 in self._codes:
elif is_integer_dtype(self.categories.dtype) and -1 in self._codes:
return self.categories.astype("object").take(self._codes, fill_value=np.nan)
return np.array(self)

Expand Down Expand Up @@ -1809,7 +1808,7 @@ def _values_for_rank(self) -> np.ndarray:
if mask.any():
values = values.astype("float64")
values[mask] = np.nan
elif is_any_real_numeric_dtype(self.categories):
elif is_any_real_numeric_dtype(self.categories.dtype):
values = np.array(self)
else:
# reorder the categories (so rank can use the float codes)
Expand Down
7 changes: 1 addition & 6 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
is_datetime64_any_dtype,
is_dtype_equal,
is_float_dtype,
is_object_dtype,
is_sparse,
is_string_dtype,
pandas_dtype,
Expand Down Expand Up @@ -2038,11 +2037,7 @@ def _sequence_to_dt64ns(
if out_unit is not None:
out_dtype = np.dtype(f"M8[{out_unit}]")

if (
is_object_dtype(data_dtype)
or is_string_dtype(data_dtype)
or is_sparse(data_dtype)
):
if data_dtype == object or is_string_dtype(data_dtype) or is_sparse(data_dtype):
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
Expand Down
32 changes: 17 additions & 15 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,9 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
is_bool,
is_bool_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
Expand Down Expand Up @@ -408,9 +405,11 @@ def to_numpy(
na_value = libmissing.NA
if dtype is None:
dtype = object
else:
dtype = np.dtype(dtype)
if self._hasna:
if (
not is_object_dtype(dtype)
dtype != object
and not is_string_dtype(dtype)
and na_value is libmissing.NA
):
Expand Down Expand Up @@ -545,7 +544,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
else:
inputs2.append(x)

def reconstruct(x):
def reconstruct(x: np.ndarray):
# we don't worry about scalar `x` here, since we
# raise for reduce up above.
from pandas.core.arrays import (
Expand All @@ -554,13 +553,13 @@ def reconstruct(x):
IntegerArray,
)

if is_bool_dtype(x.dtype):
if x.dtype.kind == "b":
m = mask.copy()
return BooleanArray(x, m)
elif is_integer_dtype(x.dtype):
elif x.dtype.kind in "iu":
m = mask.copy()
return IntegerArray(x, m)
elif is_float_dtype(x.dtype):
elif x.dtype.kind == "f":
m = mask.copy()
if x.dtype == np.float16:
# reached in e.g. np.sqrt on BooleanArray
Expand Down Expand Up @@ -763,7 +762,9 @@ def _cmp_method(self, other, op) -> BooleanArray:
mask = self._propagate_mask(mask, other)
return BooleanArray(result, mask, copy=False)

def _maybe_mask_result(self, result, mask):
def _maybe_mask_result(
self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray
):
"""
Parameters
----------
Expand All @@ -778,12 +779,12 @@ def _maybe_mask_result(self, result, mask):
self._maybe_mask_result(mod, mask),
)

if is_float_dtype(result.dtype):
if result.dtype.kind == "f":
from pandas.core.arrays import FloatingArray

return FloatingArray(result, mask, copy=False)

elif is_bool_dtype(result.dtype):
elif result.dtype.kind == "b":
from pandas.core.arrays import BooleanArray

return BooleanArray(result, mask, copy=False)
Expand All @@ -794,13 +795,14 @@ def _maybe_mask_result(self, result, mask):
# e.g. test_numeric_arr_mul_tdscalar_numexpr_path
from pandas.core.arrays import TimedeltaArray

result[mask] = result.dtype.type("NaT")

if not isinstance(result, TimedeltaArray):
result = TimedeltaArray._simple_new(result, dtype=result.dtype)
return TimedeltaArray._simple_new(result, dtype=result.dtype)

result[mask] = result.dtype.type("NaT")
return result

elif is_integer_dtype(result.dtype):
elif result.dtype.kind in "iu":
from pandas.core.arrays import IntegerArray

return IntegerArray(result, mask, copy=False)
Expand Down Expand Up @@ -875,7 +877,7 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
result = isin(self._data, values_arr)

if self._hasna:
values_have_NA = is_object_dtype(values_arr.dtype) and any(
values_have_NA = values_arr.dtype == object and any(
val is self.dtype.na_value for val in values_arr
)

Expand Down
9 changes: 4 additions & 5 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,14 @@
from pandas.core.dtypes.common import (
TD64NS_DTYPE,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
is_timedelta64_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
Expand Down Expand Up @@ -137,7 +136,7 @@ class TimedeltaArray(dtl.TimelikeOps):
_typ = "timedeltaarray"
_internal_fill_value = np.timedelta64("NaT", "ns")
_recognized_scalars = (timedelta, np.timedelta64, Tick)
_is_recognized_dtype = is_timedelta64_dtype
_is_recognized_dtype = lambda x: lib.is_np_dtype(x, "m")
_infer_matches = ("timedelta", "timedelta64")

@property
Expand Down Expand Up @@ -912,7 +911,7 @@ def sequence_to_td64ns(
inferred_freq = data.freq

# Convert whatever we have into timedelta64[ns] dtype
if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
if data.dtype == object or is_string_dtype(data.dtype):
# no need to make a copy, need to convert if string-dtyped
data = _objects_to_td64ns(data, unit=unit, errors=errors)
copy = False
Expand All @@ -925,7 +924,7 @@ def sequence_to_td64ns(
elif is_float_dtype(data.dtype):
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
if is_extension_array_dtype(data.dtype):
if isinstance(data.dtype, ExtensionDtype):
mask = data._mask
data = data._data
else:
Expand Down
13 changes: 5 additions & 8 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@
is_extension_array_dtype,
is_float,
is_integer,
is_integer_dtype,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
Expand Down Expand Up @@ -472,7 +470,7 @@ def maybe_cast_pointwise_result(
else:
result = maybe_cast_to_extension_array(cls, result)

elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result
Expand Down Expand Up @@ -1041,13 +1039,13 @@ def convert_dtypes(
if convert_integer:
target_int_dtype = pandas_dtype_func("Int64")

if is_integer_dtype(input_array.dtype):
if input_array.dtype.kind in "iu":
from pandas.core.arrays.integer import INT_STR_TO_DTYPE

inferred_dtype = INT_STR_TO_DTYPE.get(
input_array.dtype.name, target_int_dtype
)
elif is_numeric_dtype(input_array.dtype):
elif input_array.dtype.kind in "fcb":
# TODO: de-dup with maybe_cast_to_integer_array?
arr = input_array[notna(input_array)]
if (arr.astype(int) == arr).all():
Expand All @@ -1062,9 +1060,8 @@ def convert_dtypes(
inferred_dtype = target_int_dtype

if convert_floating:
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
input_array.dtype
):
if input_array.dtype.kind in "fcb":
# i.e. numeric but not integer
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def classes(*klasses) -> Callable:
return lambda tipo: issubclass(tipo, klasses)


def classes_and_not_datetimelike(*klasses) -> Callable:
def _classes_and_not_datetimelike(*klasses) -> Callable:
"""
Evaluate if the tipo is a subclass of the klasses
and not a datetimelike.
Expand Down Expand Up @@ -654,7 +654,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.integer)
arr_or_dtype, _classes_and_not_datetimelike(np.integer)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu"
)
Expand Down Expand Up @@ -713,7 +713,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)
arr_or_dtype, _classes_and_not_datetimelike(np.signedinteger)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "i"
)
Expand Down Expand Up @@ -763,7 +763,7 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
True
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)
arr_or_dtype, _classes_and_not_datetimelike(np.unsignedinteger)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "u"
)
Expand Down Expand Up @@ -1087,7 +1087,7 @@ def is_numeric_dtype(arr_or_dtype) -> bool:
False
"""
return _is_dtype_type(
arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
arr_or_dtype, _classes_and_not_datetimelike(np.number, np.bool_)
) or _is_dtype(
arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric
)
Expand Down Expand Up @@ -1490,7 +1490,7 @@ def infer_dtype_from_object(dtype) -> type:
except TypeError:
pass

if is_extension_array_dtype(dtype):
if isinstance(dtype, ExtensionDtype):
return dtype.type
elif isinstance(dtype, str):
# TODO(jreback)
Expand Down Expand Up @@ -1644,7 +1644,6 @@ def is_all_strings(value: ArrayLike) -> bool:

__all__ = [
"classes",
"classes_and_not_datetimelike",
"DT64NS_DTYPE",
"ensure_float64",
"ensure_python_int",
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6138,7 +6138,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
elif is_numeric_dtype(self.dtype):
return is_numeric_dtype(dtype)
# TODO: this was written assuming we only get here with object-dtype,
# which is nom longer correct. Can we specialize for EA?
# which is no longer correct. Can we specialize for EA?
return True

@final
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
ensure_platform_int,
is_datetime64_ns_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_integer,
is_numeric_dtype,
is_object_dtype,
Expand Down Expand Up @@ -1125,7 +1124,7 @@ def as_array(
dtype = dtype.subtype
elif isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
elif is_extension_array_dtype(dtype):
elif isinstance(dtype, ExtensionDtype):
dtype = "object"
elif is_dtype_equal(dtype, str):
dtype = "object"
Expand Down

0 comments on commit d91b04c

Please sign in to comment.