Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLN: Simplify map_infer_mask #58483

Merged
merged 5 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):


class Map:
params = (["dict", "Series", "lambda"], ["object", "category", "int"])
param_names = "mapper"

def setup(self, mapper, dtype):
params = (
["dict", "Series", "lambda"],
["object", "category", "int"],
[None, "ignore"],
)
param_names = ["mapper", "dtype", "na_action"]

def setup(self, mapper, dtype, na_action):
map_size = 1000
map_data = Series(map_size - np.arange(map_size), dtype=dtype)

Expand All @@ -168,8 +172,8 @@ def setup(self, mapper, dtype):

self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)

def time_map(self, mapper, *args, **kwargs):
self.s.map(self.map_data)
def time_map(self, mapper, dtype, na_action):
self.s.map(self.map_data, na_action=na_action)


class Clip:
Expand Down
5 changes: 0 additions & 5 deletions pandas/_libs/dtypes.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,3 @@ ctypedef fused numeric_t:
ctypedef fused numeric_object_t:
numeric_t
object

ctypedef fused uint8_int64_object_t:
uint8_t
int64_t
object
70 changes: 29 additions & 41 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ from numpy cimport (
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
PyArray_SETITEM,
complex128_t,
flatiter,
float64_t,
Expand All @@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h":
PandasParser_IMPORT

from pandas._libs cimport util
from pandas._libs.dtypes cimport uint8_int64_object_t
from pandas._libs.util cimport (
INT64_MAX,
INT64_MIN,
Expand Down Expand Up @@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value.
NoDefault = Literal[_NoDefault.no_default]


@cython.boundscheck(False)
@cython.wraparound(False)
def map_infer_mask(
ndarray[object] arr,
object f,
const uint8_t[:] mask,
*,
bint convert=True,
object na_value=no_default,
cnp.dtype dtype=np.dtype(object)
ndarray arr,
object f,
const uint8_t[:] mask,
*,
bint convert=True,
object na_value=no_default,
cnp.dtype dtype=np.dtype(object)
) -> "ArrayLike":
"""
Substitute for np.vectorize with pandas-friendly dtype inference.
Expand All @@ -2875,53 +2877,39 @@ def map_infer_mask(
-------
np.ndarray or an ExtensionArray
"""
cdef Py_ssize_t n = len(arr)
result = np.empty(n, dtype=dtype)

_map_infer_mask(
result,
arr,
f,
mask,
na_value,
)
if convert:
return maybe_convert_objects(result)
else:
return result


@cython.boundscheck(False)
@cython.wraparound(False)
def _map_infer_mask(
ndarray[uint8_int64_object_t] out,
ndarray[object] arr,
object f,
const uint8_t[:] mask,
object na_value=no_default,
) -> None:
"""
Helper for map_infer_mask, split off to use fused types based on the result.
"""
cdef:
Py_ssize_t i, n
Py_ssize_t i
Py_ssize_t n = len(arr)
object val

n = len(arr)
ndarray result = np.empty(n, dtype=dtype)

flatiter arr_it = PyArray_IterNew(arr)
flatiter result_it = PyArray_IterNew(result)

for i in range(n):
if mask[i]:
if na_value is no_default:
val = arr[i]
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
WillAyd marked this conversation as resolved.
Show resolved Hide resolved
else:
val = na_value
else:
val = f(arr[i])
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
val = f(val)

if cnp.PyArray_IsZeroDim(val):
# unbox 0-dim arrays, GH#690
val = val.item()

out[i] = val
PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val)

PyArray_ITER_NEXT(arr_it)
PyArray_ITER_NEXT(result_it)

if convert:
return maybe_convert_objects(result)
else:
return result


@cython.boundscheck(False)
Expand Down
41 changes: 19 additions & 22 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,28 +627,25 @@ def _str_map(
na_value = np.nan
else:
na_value = False
try:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@phofl

If you're free, could you review the changes to the Arrow strings here?

I think what I have here is correct, but I'm not too familiar with the Arrow strings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke

Are you able to review this bit instead?

I'd like to get this in soon.

result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
dtype=np.dtype(cast(type, dtype)),
)
return result

except ValueError:
result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
)
if convert and result.dtype == object:
result = lib.maybe_convert_objects(result)
return result

dtype = np.dtype(cast(type, dtype))
if mask.any():
# numpy int/bool dtypes cannot hold NaNs so we must convert to
# float64 for int (to match maybe_convert_objects) or
# object for bool (again to match maybe_convert_objects)
if is_integer_dtype(dtype):
dtype = np.dtype("float64")
else:
dtype = np.dtype(object)
result = lib.map_infer_mask(
arr,
f,
mask.view("uint8"),
convert=False,
na_value=na_value,
dtype=dtype,
)
return result

elif is_string_dtype(dtype) and not is_object_dtype(dtype):
# i.e. StringDtype
Expand Down