PERF: dtype checks (#52766)

* PERF: dtype checks
* mypy fixup
pandas-dev · Apr 18, 2023 · d91b04c
1 parent c85bbc6
commit d91b04c
Show file tree

Hide file tree

Showing 15 changed files with 67 additions and 79 deletions.
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -16,7 +16,6 @@
 
 from pandas.core.dtypes.common import (
     is_bool,
-    is_extension_array_dtype,
     is_integer_dtype,
     is_number,
     is_numeric_dtype,
@@ -316,7 +315,7 @@ def _get_ilevel_values(index, level):
         if not left.equals(right):
             mismatch = left._values != right._values
 
-            if is_extension_array_dtype(mismatch):
+            if not isinstance(mismatch, np.ndarray):
                 mismatch = cast("ExtensionArray", mismatch).fillna(True)
 
             diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -49,7 +49,6 @@
     is_integer,
     is_integer_dtype,
     is_list_like,
-    is_numeric_dtype,
     is_object_dtype,
     is_scalar,
     is_signed_integer_dtype,
@@ -471,7 +470,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
 
         if (
             len(values) > 0
-            and is_numeric_dtype(values.dtype)
+            and values.dtype.kind in "iufcb"
             and not is_signed_integer_dtype(comps)
         ):
             # GH#46485 Use object to avoid upcast to float64 later
@@ -1403,7 +1402,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
             )
 
     is_timedelta = False
-    if needs_i8_conversion(arr.dtype):
+    if arr.dtype.kind in "mM":
         dtype = np.int64
         arr = arr.view("i8")
         na = iNaT
@@ -1413,7 +1412,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
         # We have to cast in order to be able to hold np.nan
         dtype = np.object_
 
-    elif is_integer_dtype(dtype):
+    elif dtype.kind in "iu":
         # We have to cast in order to be able to hold np.nan
 
         # int8, int16 are incompatible with float64,

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -40,7 +40,6 @@
     is_bool_dtype,
     is_dict_like,
     is_dtype_equal,
-    is_extension_array_dtype,
     is_hashable,
     is_integer_dtype,
     is_list_like,
@@ -618,7 +617,7 @@ def _from_inferred_categories(
 
         if known_categories:
             # Convert to a specialized type with `dtype` if specified.
-            if is_any_real_numeric_dtype(dtype.categories):
+            if is_any_real_numeric_dtype(dtype.categories.dtype):
                 cats = to_numeric(inferred_categories, errors="coerce")
             elif lib.is_np_dtype(dtype.categories.dtype, "M"):
                 cats = to_datetime(inferred_categories, errors="coerce")
@@ -701,7 +700,7 @@ def from_codes(
             )
             raise ValueError(msg)
 
-        if is_extension_array_dtype(codes) and is_integer_dtype(codes):
+        if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
             # Avoid the implicit conversion of Int to object
             if isna(codes).any():
                 raise ValueError("codes cannot contain NA values")
@@ -1598,7 +1597,7 @@ def _internal_get_values(self):
         # if we are a datetime and period index, return Index to keep metadata
         if needs_i8_conversion(self.categories.dtype):
             return self.categories.take(self._codes, fill_value=NaT)
-        elif is_integer_dtype(self.categories) and -1 in self._codes:
+        elif is_integer_dtype(self.categories.dtype) and -1 in self._codes:
             return self.categories.astype("object").take(self._codes, fill_value=np.nan)
         return np.array(self)
 
@@ -1809,7 +1808,7 @@ def _values_for_rank(self) -> np.ndarray:
             if mask.any():
                 values = values.astype("float64")
                 values[mask] = np.nan
-        elif is_any_real_numeric_dtype(self.categories):
+        elif is_any_real_numeric_dtype(self.categories.dtype):
             values = np.array(self)
         else:
             #  reorder the categories (so rank can use the float codes)

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -53,7 +53,6 @@
     is_datetime64_any_dtype,
     is_dtype_equal,
     is_float_dtype,
-    is_object_dtype,
     is_sparse,
     is_string_dtype,
     pandas_dtype,
@@ -2038,11 +2037,7 @@ def _sequence_to_dt64ns(
     if out_unit is not None:
         out_dtype = np.dtype(f"M8[{out_unit}]")
 
-    if (
-        is_object_dtype(data_dtype)
-        or is_string_dtype(data_dtype)
-        or is_sparse(data_dtype)
-    ):
+    if data_dtype == object or is_string_dtype(data_dtype) or is_sparse(data_dtype):
         # TODO: We do not have tests specific to string-dtypes,
         #  also complex or categorical or other extension
         copy = False

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -41,12 +41,9 @@
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import (
     is_bool,
-    is_bool_dtype,
     is_dtype_equal,
-    is_float_dtype,
     is_integer_dtype,
     is_list_like,
-    is_object_dtype,
     is_scalar,
     is_string_dtype,
     pandas_dtype,
@@ -408,9 +405,11 @@ def to_numpy(
             na_value = libmissing.NA
         if dtype is None:
             dtype = object
+        else:
+            dtype = np.dtype(dtype)
         if self._hasna:
             if (
-                not is_object_dtype(dtype)
+                dtype != object
                 and not is_string_dtype(dtype)
                 and na_value is libmissing.NA
             ):
@@ -545,7 +544,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
             else:
                 inputs2.append(x)
 
-        def reconstruct(x):
+        def reconstruct(x: np.ndarray):
             # we don't worry about scalar `x` here, since we
             # raise for reduce up above.
             from pandas.core.arrays import (
@@ -554,13 +553,13 @@ def reconstruct(x):
                 IntegerArray,
             )
 
-            if is_bool_dtype(x.dtype):
+            if x.dtype.kind == "b":
                 m = mask.copy()
                 return BooleanArray(x, m)
-            elif is_integer_dtype(x.dtype):
+            elif x.dtype.kind in "iu":
                 m = mask.copy()
                 return IntegerArray(x, m)
-            elif is_float_dtype(x.dtype):
+            elif x.dtype.kind == "f":
                 m = mask.copy()
                 if x.dtype == np.float16:
                     # reached in e.g. np.sqrt on BooleanArray
@@ -763,7 +762,9 @@ def _cmp_method(self, other, op) -> BooleanArray:
         mask = self._propagate_mask(mask, other)
         return BooleanArray(result, mask, copy=False)
 
-    def _maybe_mask_result(self, result, mask):
+    def _maybe_mask_result(
+        self, result: np.ndarray | tuple[np.ndarray, np.ndarray], mask: np.ndarray
+    ):
         """
         Parameters
         ----------
@@ -778,12 +779,12 @@ def _maybe_mask_result(self, result, mask):
                 self._maybe_mask_result(mod, mask),
             )
 
-        if is_float_dtype(result.dtype):
+        if result.dtype.kind == "f":
             from pandas.core.arrays import FloatingArray
 
             return FloatingArray(result, mask, copy=False)
 
-        elif is_bool_dtype(result.dtype):
+        elif result.dtype.kind == "b":
             from pandas.core.arrays import BooleanArray
 
             return BooleanArray(result, mask, copy=False)
@@ -794,13 +795,14 @@ def _maybe_mask_result(self, result, mask):
             # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
             from pandas.core.arrays import TimedeltaArray
 
+            result[mask] = result.dtype.type("NaT")
+
             if not isinstance(result, TimedeltaArray):
-                result = TimedeltaArray._simple_new(result, dtype=result.dtype)
+                return TimedeltaArray._simple_new(result, dtype=result.dtype)
 
-            result[mask] = result.dtype.type("NaT")
             return result
 
-        elif is_integer_dtype(result.dtype):
+        elif result.dtype.kind in "iu":
             from pandas.core.arrays import IntegerArray
 
             return IntegerArray(result, mask, copy=False)
@@ -875,7 +877,7 @@ def isin(self, values) -> BooleanArray:  # type: ignore[override]
         result = isin(self._data, values_arr)
 
         if self._hasna:
-            values_have_NA = is_object_dtype(values_arr.dtype) and any(
+            values_have_NA = values_arr.dtype == object and any(
                 val is self.dtype.na_value for val in values_arr
             )
 

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -47,15 +47,14 @@
 from pandas.core.dtypes.common import (
     TD64NS_DTYPE,
     is_dtype_equal,
-    is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
-    is_timedelta64_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import (
@@ -137,7 +136,7 @@ class TimedeltaArray(dtl.TimelikeOps):
     _typ = "timedeltaarray"
     _internal_fill_value = np.timedelta64("NaT", "ns")
     _recognized_scalars = (timedelta, np.timedelta64, Tick)
-    _is_recognized_dtype = is_timedelta64_dtype
+    _is_recognized_dtype = lambda x: lib.is_np_dtype(x, "m")
     _infer_matches = ("timedelta", "timedelta64")
 
     @property
@@ -912,7 +911,7 @@ def sequence_to_td64ns(
         inferred_freq = data.freq
 
     # Convert whatever we have into timedelta64[ns] dtype
-    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
+    if data.dtype == object or is_string_dtype(data.dtype):
         # no need to make a copy, need to convert if string-dtyped
         data = _objects_to_td64ns(data, unit=unit, errors=errors)
         copy = False
@@ -925,7 +924,7 @@ def sequence_to_td64ns(
     elif is_float_dtype(data.dtype):
         # cast the unit, multiply base/frac separately
         # to avoid precision issues from float -> int
-        if is_extension_array_dtype(data.dtype):
+        if isinstance(data.dtype, ExtensionDtype):
             mask = data._mask
             data = data._data
         else:

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -53,8 +53,6 @@
     is_extension_array_dtype,
     is_float,
     is_integer,
-    is_integer_dtype,
-    is_numeric_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -472,7 +470,7 @@ def maybe_cast_pointwise_result(
             else:
                 result = maybe_cast_to_extension_array(cls, result)
 
-    elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
+    elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
         result = maybe_downcast_to_dtype(result, dtype)
 
     return result
@@ -1041,13 +1039,13 @@ def convert_dtypes(
         if convert_integer:
             target_int_dtype = pandas_dtype_func("Int64")
 
-            if is_integer_dtype(input_array.dtype):
+            if input_array.dtype.kind in "iu":
                 from pandas.core.arrays.integer import INT_STR_TO_DTYPE
 
                 inferred_dtype = INT_STR_TO_DTYPE.get(
                     input_array.dtype.name, target_int_dtype
                 )
-            elif is_numeric_dtype(input_array.dtype):
+            elif input_array.dtype.kind in "fcb":
                 # TODO: de-dup with maybe_cast_to_integer_array?
                 arr = input_array[notna(input_array)]
                 if (arr.astype(int) == arr).all():
@@ -1062,9 +1060,8 @@ def convert_dtypes(
                 inferred_dtype = target_int_dtype
 
         if convert_floating:
-            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
-                input_array.dtype
-            ):
+            if input_array.dtype.kind in "fcb":
+                # i.e. numeric but not integer
                 from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
 
                 inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -122,7 +122,7 @@ def classes(*klasses) -> Callable:
     return lambda tipo: issubclass(tipo, klasses)
 
 
-def classes_and_not_datetimelike(*klasses) -> Callable:
+def _classes_and_not_datetimelike(*klasses) -> Callable:
     """
     Evaluate if the tipo is a subclass of the klasses
     and not a datetimelike.
@@ -654,7 +654,7 @@ def is_integer_dtype(arr_or_dtype) -> bool:
     False
     """
     return _is_dtype_type(
-        arr_or_dtype, classes_and_not_datetimelike(np.integer)
+        arr_or_dtype, _classes_and_not_datetimelike(np.integer)
     ) or _is_dtype(
         arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu"
     )
@@ -713,7 +713,7 @@ def is_signed_integer_dtype(arr_or_dtype) -> bool:
     False
     """
     return _is_dtype_type(
-        arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)
+        arr_or_dtype, _classes_and_not_datetimelike(np.signedinteger)
     ) or _is_dtype(
         arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "i"
     )
@@ -763,7 +763,7 @@ def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
     True
     """
     return _is_dtype_type(
-        arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)
+        arr_or_dtype, _classes_and_not_datetimelike(np.unsignedinteger)
     ) or _is_dtype(
         arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "u"
     )
@@ -1087,7 +1087,7 @@ def is_numeric_dtype(arr_or_dtype) -> bool:
     False
     """
     return _is_dtype_type(
-        arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
+        arr_or_dtype, _classes_and_not_datetimelike(np.number, np.bool_)
     ) or _is_dtype(
         arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric
     )
@@ -1490,7 +1490,7 @@ def infer_dtype_from_object(dtype) -> type:
     except TypeError:
         pass
 
-    if is_extension_array_dtype(dtype):
+    if isinstance(dtype, ExtensionDtype):
         return dtype.type
     elif isinstance(dtype, str):
         # TODO(jreback)
@@ -1644,7 +1644,6 @@ def is_all_strings(value: ArrayLike) -> bool:
 
 __all__ = [
     "classes",
-    "classes_and_not_datetimelike",
     "DT64NS_DTYPE",
     "ensure_float64",
     "ensure_python_int",

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -6138,7 +6138,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         elif is_numeric_dtype(self.dtype):
             return is_numeric_dtype(dtype)
         # TODO: this was written assuming we only get here with object-dtype,
-        #  which is nom longer correct. Can we specialize for EA?
+        #  which is no longer correct. Can we specialize for EA?
         return True
 
     @final

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -29,7 +29,6 @@
     ensure_platform_int,
     is_datetime64_ns_dtype,
     is_dtype_equal,
-    is_extension_array_dtype,
     is_integer,
     is_numeric_dtype,
     is_object_dtype,
@@ -1125,7 +1124,7 @@ def as_array(
             dtype = dtype.subtype
         elif isinstance(dtype, PandasDtype):
             dtype = dtype.numpy_dtype
-        elif is_extension_array_dtype(dtype):
+        elif isinstance(dtype, ExtensionDtype):
             dtype = "object"
         elif is_dtype_equal(dtype, str):
             dtype = "object"