PERF: dtype checks (#52682)
* PERF: faster dtype checks

* PERF: dtype checks
jbrockmendel committed Apr 17, 2023
1 parent 4017e9c commit 34b3e7f
Showing 33 changed files with 109 additions and 155 deletions.
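The diffs below all follow one pattern: calls to the `pandas.core.dtypes.common` helpers (`is_integer_dtype`, `is_float_dtype`, `is_object_dtype`, ...) are replaced with direct checks on `np.dtype.kind` or dtype equality wherever the object is already known to be a NumPy dtype or ndarray. As a rough illustration (not part of the commit), a micro-benchmark of the two spellings might look like the sketch below; actual timings depend on the pandas/NumPy versions installed.

```python
# Sketch only (not part of the commit): compares the public helper against a
# direct np.dtype.kind check, the substitution this commit applies internally.
# Timings vary by machine and pandas/NumPy version.
import timeit

import numpy as np
from pandas.api.types import is_integer_dtype

dtype = np.dtype("int64")

helper = timeit.timeit(lambda: is_integer_dtype(dtype), number=100_000)
direct = timeit.timeit(lambda: dtype.kind in "iu", number=100_000)
print(f"is_integer_dtype: {helper:.3f}s  dtype.kind check: {direct:.3f}s")
```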
5 changes: 2 additions & 3 deletions pandas/_testing/__init__.py
@@ -29,7 +29,6 @@

from pandas.core.dtypes.common import (
is_float_dtype,
- is_integer_dtype,
is_sequence,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
@@ -389,11 +388,11 @@ def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index:
dtype = pandas_dtype(dtype)
assert isinstance(dtype, np.dtype)

- if is_integer_dtype(dtype):
+ if dtype.kind in "iu":
values = np.arange(k, dtype=dtype)
if is_unsigned_integer_dtype(dtype):
values += 2 ** (dtype.itemsize * 8 - 1)
- elif is_float_dtype(dtype):
+ elif dtype.kind == "f":
values = np.random.random_sample(k) - np.random.random_sample(1)
values.sort()
values = values * (10 ** np.random.randint(0, 9))
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
@@ -510,7 +510,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
if (
len(comps_array) > 1_000_000
and len(values) <= 26
- and not is_object_dtype(comps_array)
+ and comps_array.dtype != object
):
# If the values include nan we need to check for nan explicitly
# since np.nan it not equal to np.nan
@@ -766,7 +766,7 @@ def factorize(
else:
values = np.asarray(values) # convert DTA/TDA/MultiIndex

- if not use_na_sentinel and is_object_dtype(values):
+ if not use_na_sentinel and values.dtype == object:
# factorize can now handle differentiating various types of null values.
# These can only occur when the array has object dtype.
# However, for backwards compatibility we only use the null for the
@@ -1317,7 +1317,7 @@ def searchsorted(

if (
isinstance(arr, np.ndarray)
- and is_integer_dtype(arr.dtype)
+ and arr.dtype.kind in "iu"
and (is_integer(value) or is_integer_dtype(value))
):
# if `arr` and `value` have different dtypes, `arr` would be
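The object-dtype checks above rely on NumPy dtype equality: comparing an `np.dtype` against the built-in `object` type is the same as comparing against `np.dtype("O")`, so `comps_array.dtype != object` can stand in for the `is_object_dtype` helper on plain ndarrays. A small sketch of that behaviour (not from the commit):

```python
# Hedged illustration (not from the commit): comparing a NumPy dtype against
# the built-in `object` type matches object dtype exactly, which is what the
# is_object_dtype helper checks for plain ndarrays.
import numpy as np

arr = np.array(["a", None, 3], dtype=object)
print(arr.dtype == object)           # True: np.dtype("O") compares equal
print(np.arange(3).dtype == object)  # False for an int64 array
```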
12 changes: 3 additions & 9 deletions pandas/core/array_algos/masked_accumulations.py
@@ -12,12 +12,6 @@

import numpy as np

- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- )
-
if TYPE_CHECKING:
from pandas._typing import npt

@@ -46,11 +40,11 @@ def _cum_func(
Whether to skip NA.
"""
dtype_info: np.iinfo | np.finfo
- if is_float_dtype(values):
+ if values.dtype.kind == "f":
dtype_info = np.finfo(values.dtype.type)
- elif is_integer_dtype(values):
+ elif values.dtype.kind in "iu":
dtype_info = np.iinfo(values.dtype.type)
- elif is_bool_dtype(values):
+ elif values.dtype.kind == "b":
# Max value of bool is 1, but since we are setting into a boolean
# array, 255 is fine as well. Min value has to be 0 when setting
# into the boolean array.
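The `_cum_func` branches above dispatch on `dtype.kind` to pick the right limits object: `np.finfo` for floats, `np.iinfo` for integers, hard-coded bounds for bool. A standalone sketch of that dispatch (not from the commit; the helper name `limits_for` is hypothetical):

```python
# Sketch (not from the commit) of the dtype.kind dispatch used in _cum_func:
# "f" -> np.finfo, "i"/"u" -> np.iinfo, "b" -> hard-coded bool bounds.
import numpy as np


def limits_for(dtype: np.dtype) -> tuple:
    if dtype.kind == "f":
        info = np.finfo(dtype.type)
        return info.min, info.max
    elif dtype.kind in "iu":
        info = np.iinfo(dtype.type)
        return info.min, info.max
    elif dtype.kind == "b":
        # bool: min 0, max 1 (cf. the comment in the diff above)
        return 0, 1
    raise NotImplementedError(dtype)


print(limits_for(np.dtype("float32")))
print(limits_for(np.dtype("uint8")))
print(limits_for(np.dtype("bool")))
```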
5 changes: 2 additions & 3 deletions pandas/core/arrays/arrow/array.py
@@ -43,7 +43,6 @@
is_array_like,
is_bool_dtype,
is_integer,
- is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
@@ -363,9 +362,9 @@ def __getitem__(self, item: PositionalIndexer):
else:
pa_dtype = self._dtype.pyarrow_dtype
return type(self)(pa.chunked_array([], type=pa_dtype))
- elif is_integer_dtype(item.dtype):
+ elif item.dtype.kind in "iu":
return self.take(item)
- elif is_bool_dtype(item.dtype):
+ elif item.dtype.kind == "b":
return type(self)(self._pa_array.filter(item))
else:
raise IndexError(
6 changes: 3 additions & 3 deletions pandas/core/arrays/categorical.py
@@ -533,7 +533,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)

- elif is_integer_dtype(dtype) and self.isna().any():
+ elif dtype.kind in "iu" and self.isna().any():
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
@@ -624,7 +624,7 @@ def _from_inferred_categories(
cats = to_datetime(inferred_categories, errors="coerce")
elif lib.is_np_dtype(dtype.categories.dtype, "m"):
cats = to_timedelta(inferred_categories, errors="coerce")
- elif is_bool_dtype(dtype.categories):
+ elif is_bool_dtype(dtype.categories.dtype):
if true_values is None:
true_values = ["True", "TRUE", "true"]

@@ -708,7 +708,7 @@ def from_codes(
codes = codes.to_numpy(dtype=np.int64)
else:
codes = np.asarray(codes)
- if len(codes) and not is_integer_dtype(codes):
+ if len(codes) and codes.dtype.kind not in "iu":
raise ValueError("codes need to be array-like integers")

if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
5 changes: 2 additions & 3 deletions pandas/core/arrays/datetimelike.py
@@ -85,7 +85,6 @@
is_all_strings,
is_datetime64_any_dtype,
is_dtype_equal,
- is_float_dtype,
is_integer_dtype,
is_list_like,
is_object_dtype,
@@ -460,7 +459,7 @@ def astype(self, dtype, copy: bool = True):
return super().astype(dtype, copy=copy)
elif is_string_dtype(dtype):
return self._format_native_types()
- elif is_integer_dtype(dtype):
+ elif dtype.kind in "iu":
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
values = self.asi8
@@ -473,7 +472,7 @@
if copy:
values = values.copy()
return values
elif (dtype.kind in "mM" and self.dtype != dtype) or is_float_dtype(dtype):
elif (dtype.kind in "mM" and self.dtype != dtype) or dtype.kind == "f":
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
5 changes: 2 additions & 3 deletions pandas/core/arrays/datetimes.py
@@ -51,7 +51,6 @@
INT64_DTYPE,
is_bool_dtype,
is_datetime64_any_dtype,
- is_datetime64_dtype,
is_dtype_equal,
is_float_dtype,
is_object_dtype,
@@ -2190,11 +2189,11 @@ def objects_to_datetime64ns(
# is in UTC
# Return i8 values to denote unix timestamps
return result.view("i8"), tz_parsed
- elif is_datetime64_dtype(result):
+ elif result.dtype.kind == "M":
# returning M8[ns] denotes wall-times; since tz is None
# the distinction is a thin one
return result, tz_parsed
- elif is_object_dtype(result):
+ elif result.dtype == object:
# GH#23675 when called via `pd.to_datetime`, returning an object-dtype
# array is allowed. When called via `pd.DatetimeIndex`, we can
# only accept datetime64 dtype, so raise TypeError if object-dtype
6 changes: 3 additions & 3 deletions pandas/core/arrays/interval.py
@@ -329,9 +329,9 @@ def _ensure_simple_new_inputs(
raise ValueError("closed keyword does not match dtype.closed")

# coerce dtypes to match if needed
- if is_float_dtype(left) and is_integer_dtype(right):
+ if is_float_dtype(left.dtype) and is_integer_dtype(right.dtype):
right = right.astype(left.dtype)
- elif is_float_dtype(right) and is_integer_dtype(left):
+ elif is_float_dtype(right.dtype) and is_integer_dtype(left.dtype):
left = left.astype(right.dtype)

if type(left) != type(right):
@@ -1778,6 +1778,6 @@ def _maybe_convert_platform_interval(values) -> ArrayLike:

if not hasattr(values, "dtype"):
values = np.asarray(values)
- if is_integer_dtype(values) and values.dtype != np.int64:
+ if values.dtype.kind in "iu" and values.dtype != np.int64:
values = values.astype(np.int64)
return values
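Here (and in the timedeltas.py change further down) the helpers are kept but are now passed the `.dtype` attribute rather than the array itself; the public `pandas.api.types` helpers accept both, and the dtype path skips the array-handling work. A hedged illustration, not part of the commit:

```python
# Hedged illustration, not part of the commit: the public pandas.api.types
# helpers accept both arrays and dtype objects; passing the dtype directly
# skips the array-handling branch and is typically cheaper.
import numpy as np
from pandas.api.types import is_float_dtype

left = np.array([0.0, 1.5, 3.0])
print(is_float_dtype(left))        # array input: helper inspects the array
print(is_float_dtype(left.dtype))  # dtype input: same answer, less work
```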
15 changes: 6 additions & 9 deletions pandas/core/arrays/masked.py
@@ -42,7 +42,6 @@
from pandas.core.dtypes.common import (
is_bool,
is_bool_dtype,
- is_datetime64_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
@@ -478,18 +477,18 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
na_value: float | np.datetime64 | lib.NoDefault

# coerce
- if is_float_dtype(dtype):
+ if dtype.kind == "f":
# In astype, we consider dtype=float to also mean na_value=np.nan
na_value = np.nan
- elif is_datetime64_dtype(dtype):
+ elif dtype.kind == "M":
na_value = np.datetime64("NaT")
else:
na_value = lib.no_default

# to_numpy will also raise, but we get somewhat nicer exception messages here
- if is_integer_dtype(dtype) and self._hasna:
+ if dtype.kind in "iu" and self._hasna:
raise ValueError("cannot convert NA to integer")
- if is_bool_dtype(dtype) and self._hasna:
+ if dtype.kind == "b" and self._hasna:
# careful: astype_nansafe converts np.nan to True
raise ValueError("cannot convert float NaN to bool")

@@ -789,10 +788,8 @@ def _maybe_mask_result(self, result, mask):

return BooleanArray(result, mask, copy=False)

- elif (
- isinstance(result.dtype, np.dtype)
- and result.dtype.kind == "m"
- and is_supported_unit(get_unit_from_dtype(result.dtype))
+ elif lib.is_np_dtype(result.dtype, "m") and is_supported_unit(
+ get_unit_from_dtype(result.dtype)
):
# e.g. test_numeric_arr_mul_tdscalar_numexpr_path
from pandas.core.arrays import TimedeltaArray
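The `_maybe_mask_result` change collapses the `isinstance(..., np.dtype)` plus `kind` test into the pandas-internal `lib.is_np_dtype(result.dtype, "m")` helper. Roughly equivalent logic written against NumPy only (an assumption based on the code it replaces, not the helper's actual implementation) looks like:

```python
# Sketch of roughly what lib.is_np_dtype(dtype, "m") checks (an assumption
# based on the replaced code): a plain np.dtype whose kind is in the given
# set, which rules out ExtensionDtypes, since those are not np.dtype instances.
import numpy as np


def is_np_dtype_like(dtype, kinds: str) -> bool:
    return isinstance(dtype, np.dtype) and dtype.kind in kinds


print(is_np_dtype_like(np.dtype("timedelta64[ns]"), "m"))  # True
print(is_np_dtype_like(np.dtype("float64"), "m"))          # False
```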
13 changes: 5 additions & 8 deletions pandas/core/arrays/numeric.py
@@ -18,10 +18,7 @@
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
is_integer_dtype,
- is_object_dtype,
is_string_dtype,
pandas_dtype,
)
@@ -171,24 +168,24 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
original = values
values = np.array(values, copy=copy)
inferred_type = None
- if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
+ if values.dtype == object or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "boolean" and dtype is None:
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")

- elif is_bool_dtype(values) and checker(dtype):
+ elif values.dtype.kind == "b" and checker(dtype):
values = np.array(values, dtype=default_dtype, copy=copy)

- elif not (is_integer_dtype(values) or is_float_dtype(values)):
+ elif values.dtype.kind not in "iuf":
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
- if is_integer_dtype(values):
+ if values.dtype.kind in "iu":
# fastpath
mask = np.zeros(len(values), dtype=np.bool_)
else:
@@ -205,7 +202,7 @@
else:
dtype = dtype.type

- if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
+ if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
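The `_coerce_to_data_and_mask` rewrite also shows how several helper calls collapse into a single `kind` membership test: `values.dtype.kind not in "iuf"` covers signed integers, unsigned integers, and floats at once. A quick sketch (not from the commit) of the kind codes involved:

```python
# Sketch, not from the commit: the np.dtype.kind codes behind checks like
# values.dtype.kind not in "iuf", which covers signed ints, unsigned ints,
# and floats in a single membership test.
import numpy as np

for name in ["int32", "uint8", "float64", "bool", "object", "datetime64[ns]"]:
    dt = np.dtype(name)
    print(f"{name:>15}  kind={dt.kind!r}  numeric={dt.kind in 'iuf'}")
```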
9 changes: 3 additions & 6 deletions pandas/core/arrays/period.py
@@ -56,10 +56,7 @@
from pandas.core.dtypes.common import (
ensure_object,
is_datetime64_any_dtype,
- is_datetime64_dtype,
is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
is_period_dtype,
pandas_dtype,
)
@@ -915,7 +912,7 @@ def period_array(
"""
data_dtype = getattr(data, "dtype", None)

- if is_datetime64_dtype(data_dtype):
+ if lib.is_np_dtype(data_dtype, "M"):
return PeriodArray._from_datetime64(data, freq)
if isinstance(data_dtype, PeriodDtype):
out = PeriodArray(data)
@@ -937,10 +934,10 @@
else:
dtype = None

- if is_float_dtype(arrdata) and len(arrdata) > 0:
+ if arrdata.dtype.kind == "f" and len(arrdata) > 0:
raise TypeError("PeriodIndex does not allow floating point in construction")

- if is_integer_dtype(arrdata.dtype):
+ if arrdata.dtype.kind in "iu":
arr = arrdata.astype(np.int64, copy=False)
# error: Argument 2 to "from_ordinals" has incompatible type "Union[str,
# Tick, None]"; expected "Union[timedelta, BaseOffset, str]"
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
@@ -925,7 +925,7 @@ def sequence_to_td64ns(
elif is_float_dtype(data.dtype):
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
- if is_extension_array_dtype(data):
+ if is_extension_array_dtype(data.dtype):
mask = data._mask
data = data._data
else:
5 changes: 2 additions & 3 deletions pandas/core/construction.py
@@ -45,7 +45,6 @@
is_datetime64_ns_dtype,
is_dtype_equal,
is_extension_array_dtype,
- is_integer_dtype,
is_list_like,
is_object_dtype,
is_timedelta64_ns_dtype,
@@ -749,7 +748,7 @@ def _try_cast(
"""
is_ndarray = isinstance(arr, np.ndarray)

- if is_object_dtype(dtype):
+ if dtype == object:
if not is_ndarray:
subarr = construct_1d_object_array_from_listlike(arr)
return subarr
@@ -773,7 +772,7 @@

# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
- elif is_integer_dtype(dtype):
+ elif dtype.kind in "iu":
# this will raise if we have e.g. floats

subarr = maybe_cast_to_integer_array(arr, dtype)
7 changes: 3 additions & 4 deletions pandas/core/dtypes/astype.py
@@ -19,7 +19,6 @@

from pandas.core.dtypes.common import (
is_dtype_equal,
- is_integer_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
@@ -99,10 +98,10 @@ def _astype_nansafe(
arr, skipna=skipna, convert_na_value=False
).reshape(shape)

- elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
+ elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu":
return _astype_float_to_int_nansafe(arr, dtype, copy)

- elif is_object_dtype(arr.dtype):
+ elif arr.dtype == object:
# if we have a datetime/timedelta array of objects
# then coerce to datetime64[ns] and use DatetimeArray.astype

@@ -131,7 +130,7 @@
)
raise ValueError(msg)

- if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
+ if copy or arr.dtype == object or dtype == object:
# Explicit copy, or required since NumPy can't view from / to object.
return arr.astype(dtype, copy=True)

