PERF: dtype checks (#52682)
* PERF: faster dtype checks

* PERF: dtype checks
jbrockmendel committed Apr 17, 2023
1 parent 4017e9c commit 34b3e7f
Showing 33 changed files with 109 additions and 155 deletions.
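The diffs below all follow one pattern: calls to the `pandas.core.dtypes.common` helpers (`is_integer_dtype`, `is_float_dtype`, `is_object_dtype`, ...) are replaced with direct checks on `np.dtype.kind` or dtype equality wherever the object is already known to be a NumPy dtype or ndarray. As a rough illustration (not part of the commit), a micro-benchmark of the two spellings might look like the sketch below; actual timings depend on the pandas/NumPy versions installed.

```python
# Sketch only (not part of the commit): compares the public helper against a
# direct np.dtype.kind check, the substitution this commit applies internally.
# Timings vary by machine and pandas/NumPy version.
import timeit

import numpy as np
from pandas.api.types import is_integer_dtype

dtype = np.dtype("int64")

helper = timeit.timeit(lambda: is_integer_dtype(dtype), number=100_000)
direct = timeit.timeit(lambda: dtype.kind in "iu", number=100_000)
print(f"is_integer_dtype: {helper:.3f}s  dtype.kind check: {direct:.3f}s")
```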
5 changes: 2 additions & 3 deletions pandas/_testing/__init__.py
@@ -29,7 +29,6 @@

from pandas.core.dtypes.common import (
is_float_dtype,
- is_integer_dtype,
is_sequence,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
@@ -389,11 +388,11 @@ def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index:
dtype = pandas_dtype(dtype)
assert isinstance(dtype, np.dtype)

- if is_integer_dtype(dtype):
+ if dtype.kind in "iu":
values = np.arange(k, dtype=dtype)
if is_unsigned_integer_dtype(dtype):
values += 2 ** (dtype.itemsize * 8 - 1)
- elif is_float_dtype(dtype):
+ elif dtype.kind == "f":
values = np.random.random_sample(k) - np.random.random_sample(1)
values.sort()
values = values * (10 ** np.random.randint(0, 9))
6 changes: 3 additions & 3 deletions pandas/core/algorithms.py
@@ -510,7 +510,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
if (
len(comps_array) > 1_000_000
and len(values) <= 26
- and not is_object_dtype(comps_array)
+ and comps_array.dtype != object
):
# If the values include nan we need to check for nan explicitly
# since np.nan it not equal to np.nan
@@ -766,7 +766,7 @@ def factorize(
else:
values = np.asarray(values) # convert DTA/TDA/MultiIndex

- if not use_na_sentinel and is_object_dtype(values):
+ if not use_na_sentinel and values.dtype == object:
# factorize can now handle differentiating various types of null values.
# These can only occur when the array has object dtype.
# However, for backwards compatibility we only use the null for the
@@ -1317,7 +1317,7 @@ def searchsorted(

if (
isinstance(arr, np.ndarray)
- and is_integer_dtype(arr.dtype)
+ and arr.dtype.kind in "iu"
and (is_integer(value) or is_integer_dtype(value))
):
# if `arr` and `value` have different dtypes, `arr` would be
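The object-dtype checks above rely on NumPy dtype equality: comparing an `np.dtype` against the built-in `object` type is the same as comparing against `np.dtype("O")`, so `comps_array.dtype != object` can stand in for the `is_object_dtype` helper on plain ndarrays. A small sketch of that behaviour (not from the commit):

```python
# Hedged illustration (not from the commit): comparing a NumPy dtype against
# the built-in `object` type matches object dtype exactly, which is what the
# is_object_dtype helper checks for plain ndarrays.
import numpy as np

arr = np.array(["a", None, 3], dtype=object)
print(arr.dtype == object)           # True: np.dtype("O") compares equal
print(np.arange(3).dtype == object)  # False for an int64 array
```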
12 changes: 3 additions & 9 deletions pandas/core/array_algos/masked_accumulations.py
@@ -12,12 +12,6 @@

import numpy as np

- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- )
-
if TYPE_CHECKING:
from pandas._typing import npt

@@ -46,11 +40,11 @@ def _cum_func(
Whether to skip NA.
"""
dtype_info: np.iinfo | np.finfo
- if is_float_dtype(values):
+ if values.dtype.kind == "f":
dtype_info = np.finfo(values.dtype.type)
- elif is_integer_dtype(values):
+ elif values.dtype.kind in "iu":
dtype_info = np.iinfo(values.dtype.type)
- elif is_bool_dtype(values):
+ elif values.dtype.kind == "b":
# Max value of bool is 1, but since we are setting into a boolean
# array, 255 is fine as well. Min value has to be 0 when setting
# into the boolean array.
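The `_cum_func` branches above dispatch on `dtype.kind` to pick the right limits object: `np.finfo` for floats, `np.iinfo` for integers, hard-coded bounds for bool. A standalone sketch of that dispatch (not from the commit; the helper name `limits_for` is hypothetical):

```python
# Sketch (not from the commit) of the dtype.kind dispatch used in _cum_func:
# "f" -> np.finfo, "i"/"u" -> np.iinfo, "b" -> hard-coded bool bounds.
import numpy as np


def limits_for(dtype: np.dtype) -> tuple:
    if dtype.kind == "f":
        info = np.finfo(dtype.type)
        return info.min, info.max
    elif dtype.kind in "iu":
        info = np.iinfo(dtype.type)
        return info.min, info.max
    elif dtype.kind == "b":
        # bool: min 0, max 1 (cf. the comment in the diff above)
        return 0, 1
    raise NotImplementedError(dtype)


print(limits_for(np.dtype("float32")))
print(limits_for(np.dtype("uint8")))
print(limits_for(np.dtype("bool")))
```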
5 changes: 2 additions & 3 deletions pandas/core/arrays/arrow/array.py
@@ -43,7 +43,6 @@
is_array_like,
is_bool_dtype,
is_integer,
- is_integer_dtype,
is_list_like,
is_object_dtype,
is_scalar,
@@ -363,9 +362,9 @@ def __getitem__(self, item: PositionalIndexer):
else:
pa_dtype = self._dtype.pyarrow_dtype
return type(self)(pa.chunked_array([], type=pa_dtype))
- elif is_integer_dtype(item.dtype):
+ elif item.dtype.kind in "iu":
return self.take(item)
- elif is_bool_dtype(item.dtype):
+ elif item.dtype.kind == "b":
return type(self)(self._pa_array.filter(item))
else:
raise IndexError(
6 changes: 3 additions & 3 deletions pandas/core/arrays/categorical.py
@@ -533,7 +533,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)

- elif is_integer_dtype(dtype) and self.isna().any():
+ elif dtype.kind in "iu" and self.isna().any():
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
@@ -624,7 +624,7 @@ def _from_inferred_categories(
cats = to_datetime(inferred_categories, errors="coerce")
elif lib.is_np_dtype(dtype.categories.dtype, "m"):
cats = to_timedelta(inferred_categories, errors="coerce")
- elif is_bool_dtype(dtype.categories):
+ elif is_bool_dtype(dtype.categories.dtype):
if true_values is None:
true_values = ["True", "TRUE", "true"]

@@ -708,7 +708,7 @@ def from_codes(
codes = codes.to_numpy(dtype=np.int64)
else:
codes = np.asarray(codes)
- if len(codes) and not is_integer_dtype(codes):
+ if len(codes) and codes.dtype.kind not in "iu":
raise ValueError("codes need to be array-like integers")

if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
5 changes: 2 additions & 3 deletions pandas/core/arrays/datetimelike.py
@@ -85,7 +85,6 @@
is_all_strings,
is_datetime64_any_dtype,
is_dtype_equal,
- is_float_dtype,
is_integer_dtype,
is_list_like,
is_object_dtype,
@@ -460,7 +459,7 @@ def astype(self, dtype, copy: bool = True):
return super().astype(dtype, copy=copy)
elif is_string_dtype(dtype):
return self._format_native_types()
- elif is_integer_dtype(dtype):
+ elif dtype.kind in "iu":
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
values = self.asi8
@@ -473,7 +472,7 @@
if copy:
values = values.copy()
return values
elif (dtype.kind in "mM" and self.dtype != dtype) or is_float_dtype(dtype):
elif (dtype.kind in "mM" and self.dtype != dtype) or dtype.kind == "f":
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
5 changes: 2 additions & 3 deletions pandas/core/arrays/datetimes.py
@@ -51,7 +51,6 @@
INT64_DTYPE,
is_bool_dtype,
is_datetime64_any_dtype,
- is_datetime64_dtype,
is_dtype_equal,
is_float_dtype,
is_object_dtype,
@@ -2190,11 +2189,11 @@ def objects_to_datetime64ns(
# is in UTC
# Return i8 values to denote unix timestamps
return result.view("i8"), tz_parsed
- elif is_datetime64_dtype(result):
+ elif result.dtype.kind == "M":
# returning M8[ns] denotes wall-times; since tz is None
# the distinction is a thin one
return result, tz_parsed
- elif is_object_dtype(result):
+ elif result.dtype == object:
# GH#23675 when called via `pd.to_datetime`, returning an object-dtype
# array is allowed. When called via `pd.DatetimeIndex`, we can
# only accept datetime64 dtype, so raise TypeError if object-dtype
6 changes: 3 additions & 3 deletions pandas/core/arrays/interval.py
@@ -329,9 +329,9 @@ def _ensure_simple_new_inputs(
raise ValueError("closed keyword does not match dtype.closed")

# coerce dtypes to match if needed
- if is_float_dtype(left) and is_integer_dtype(right):
+ if is_float_dtype(left.dtype) and is_integer_dtype(right.dtype):
right = right.astype(left.dtype)
- elif is_float_dtype(right) and is_integer_dtype(left):
+ elif is_float_dtype(right.dtype) and is_integer_dtype(left.dtype):
left = left.astype(right.dtype)

if type(left) != type(right):
@@ -1778,6 +1778,6 @@ def _maybe_convert_platform_interval(values) -> ArrayLike:

if not hasattr(values, "dtype"):
values = np.asarray(values)
- if is_integer_dtype(values) and values.dtype != np.int64:
+ if values.dtype.kind in "iu" and values.dtype != np.int64:
values = values.astype(np.int64)
return values
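Here (and in the timedeltas.py change further down) the helpers are kept but are now passed the `.dtype` attribute rather than the array itself; the public `pandas.api.types` helpers accept both, and the dtype path skips the array-handling work. A hedged illustration, not part of the commit:

```python
# Hedged illustration, not part of the commit: the public pandas.api.types
# helpers accept both arrays and dtype objects; passing the dtype directly
# skips the array-handling branch and is typically cheaper.
import numpy as np
from pandas.api.types import is_float_dtype

left = np.array([0.0, 1.5, 3.0])
print(is_float_dtype(left))        # array input: helper inspects the array
print(is_float_dtype(left.dtype))  # dtype input: same answer, less work
```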
15 changes: 6 additions & 9 deletions pandas/core/arrays/masked.py
@@ -42,7 +42,6 @@
from pandas.core.dtypes.common import (
is_bool,
is_bool_dtype,
- is_datetime64_dtype,
is_dtype_equal,
is_float_dtype,
is_integer_dtype,
@@ -478,18 +477,18 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
na_value: float | np.datetime64 | lib.NoDefault

# coerce
- if is_float_dtype(dtype):
+ if dtype.kind == "f":
# In astype, we consider dtype=float to also mean na_value=np.nan
na_value = np.nan
- elif is_datetime64_dtype(dtype):
+ elif dtype.kind == "M":
na_value = np.datetime64("NaT")
else:
na_value = lib.no_default

# to_numpy will also raise, but we get somewhat nicer exception messages here
- if is_integer_dtype(dtype) and self._hasna:
+ if dtype.kind in "iu" and self._hasna:
raise ValueError("cannot convert NA to integer")
- if is_bool_dtype(dtype) and self._hasna:
+ if dtype.kind == "b" and self._hasna:
# careful: astype_nansafe converts np.nan to True
raise ValueError("cannot convert float NaN to bool")

@@ -789,10 +788,8 @@ def _maybe_mask_result(self, result, mask):

return BooleanArray(result, mask, copy=False)

- elif (
- isinstance(result.dtype, np.dtype)
- and result.dtype.kind == "m"
- and is_supported_unit(get_unit_from_dtype(result.dtype))
+ elif lib.is_np_dtype(result.dtype, "m") and is_supported_unit(
+ get_unit_from_dtype(result.dtype)
):
# e.g. test_numeric_arr_mul_tdscalar_numexpr_path
from pandas.core.arrays import TimedeltaArray
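The `_maybe_mask_result` change collapses the `isinstance(..., np.dtype)` plus `kind` test into the pandas-internal `lib.is_np_dtype(result.dtype, "m")` helper. Roughly equivalent logic written against NumPy only (an assumption based on the code it replaces, not the helper's actual implementation) looks like:

```python
# Sketch of roughly what lib.is_np_dtype(dtype, "m") checks (an assumption
# based on the replaced code): a plain np.dtype whose kind is in the given
# set, which rules out ExtensionDtypes, since those are not np.dtype instances.
import numpy as np


def is_np_dtype_like(dtype, kinds: str) -> bool:
    return isinstance(dtype, np.dtype) and dtype.kind in kinds


print(is_np_dtype_like(np.dtype("timedelta64[ns]"), "m"))  # True
print(is_np_dtype_like(np.dtype("float64"), "m"))          # False
```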
13 changes: 5 additions & 8 deletions pandas/core/arrays/numeric.py
@@ -18,10 +18,7 @@
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
is_integer_dtype,
- is_object_dtype,
is_string_dtype,
pandas_dtype,
)
@@ -171,24 +168,24 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
original = values
values = np.array(values, copy=copy)
inferred_type = None
- if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
+ if values.dtype == object or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "boolean" and dtype is None:
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")

- elif is_bool_dtype(values) and checker(dtype):
+ elif values.dtype.kind == "b" and checker(dtype):
values = np.array(values, dtype=default_dtype, copy=copy)

- elif not (is_integer_dtype(values) or is_float_dtype(values)):
+ elif values.dtype.kind not in "iuf":
name = dtype_cls.__name__.strip("_")
raise TypeError(f"{values.dtype} cannot be converted to {name}")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
- if is_integer_dtype(values):
+ if values.dtype.kind in "iu":
# fastpath
mask = np.zeros(len(values), dtype=np.bool_)
else:
@@ -205,7 +202,7 @@
else:
dtype = dtype.type

- if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
+ if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0:
if mask.all():
values = np.ones(values.shape, dtype=dtype)
else:
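The `_coerce_to_data_and_mask` rewrite also shows how several helper calls collapse into a single `kind` membership test: `values.dtype.kind not in "iuf"` covers signed integers, unsigned integers, and floats at once. A quick sketch (not from the commit) of the kind codes involved:

```python
# Sketch, not from the commit: the np.dtype.kind codes behind checks like
# values.dtype.kind not in "iuf", which covers signed ints, unsigned ints,
# and floats in a single membership test.
import numpy as np

for name in ["int32", "uint8", "float64", "bool", "object", "datetime64[ns]"]:
    dt = np.dtype(name)
    print(f"{name:>15}  kind={dt.kind!r}  numeric={dt.kind in 'iuf'}")
```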
9 changes: 3 additions & 6 deletions pandas/core/arrays/period.py
@@ -56,10 +56,7 @@
from pandas.core.dtypes.common import (
ensure_object,
is_datetime64_any_dtype,
- is_datetime64_dtype,
is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
is_period_dtype,
pandas_dtype,
)
@@ -915,7 +912,7 @@ def period_array(
"""
data_dtype = getattr(data, "dtype", None)

- if is_datetime64_dtype(data_dtype):
+ if lib.is_np_dtype(data_dtype, "M"):
return PeriodArray._from_datetime64(data, freq)
if isinstance(data_dtype, PeriodDtype):
out = PeriodArray(data)
@@ -937,10 +934,10 @@
else:
dtype = None

- if is_float_dtype(arrdata) and len(arrdata) > 0:
+ if arrdata.dtype.kind == "f" and len(arrdata) > 0:
raise TypeError("PeriodIndex does not allow floating point in construction")

- if is_integer_dtype(arrdata.dtype):
+ if arrdata.dtype.kind in "iu":
arr = arrdata.astype(np.int64, copy=False)
# error: Argument 2 to "from_ordinals" has incompatible type "Union[str,
# Tick, None]"; expected "Union[timedelta, BaseOffset, str]"
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
@@ -925,7 +925,7 @@ def sequence_to_td64ns(
elif is_float_dtype(data.dtype):
# cast the unit, multiply base/frac separately
# to avoid precision issues from float -> int
- if is_extension_array_dtype(data):
+ if is_extension_array_dtype(data.dtype):
mask = data._mask
data = data._data
else:
5 changes: 2 additions & 3 deletions pandas/core/construction.py
@@ -45,7 +45,6 @@
is_datetime64_ns_dtype,
is_dtype_equal,
is_extension_array_dtype,
- is_integer_dtype,
is_list_like,
is_object_dtype,
is_timedelta64_ns_dtype,
@@ -749,7 +748,7 @@ def _try_cast(
"""
is_ndarray = isinstance(arr, np.ndarray)

- if is_object_dtype(dtype):
+ if dtype == object:
if not is_ndarray:
subarr = construct_1d_object_array_from_listlike(arr)
return subarr
@@ -773,7 +772,7 @@

# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
- elif is_integer_dtype(dtype):
+ elif dtype.kind in "iu":
# this will raise if we have e.g. floats

subarr = maybe_cast_to_integer_array(arr, dtype)
7 changes: 3 additions & 4 deletions pandas/core/dtypes/astype.py
@@ -19,7 +19,6 @@

from pandas.core.dtypes.common import (
is_dtype_equal,
- is_integer_dtype,
is_object_dtype,
is_string_dtype,
pandas_dtype,
@@ -99,10 +98,10 @@ def _astype_nansafe(
arr, skipna=skipna, convert_na_value=False
).reshape(shape)

- elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
+ elif np.issubdtype(arr.dtype, np.floating) and dtype.kind in "iu":
return _astype_float_to_int_nansafe(arr, dtype, copy)

- elif is_object_dtype(arr.dtype):
+ elif arr.dtype == object:
# if we have a datetime/timedelta array of objects
# then coerce to datetime64[ns] and use DatetimeArray.astype

@@ -131,7 +130,7 @@
)
raise ValueError(msg)

- if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
+ if copy or arr.dtype == object or dtype == object:
# Explicit copy, or required since NumPy can't view from / to object.
return arr.astype(dtype, copy=True)

