From 3f607503fb375899109226482c5a31a9da532ed6 Mon Sep 17 00:00:00 2001 From: divya1974 Date: Thu, 11 Dec 2025 18:01:01 -0500 Subject: [PATCH 1/2] TST/ENH: raise TypeError in Series.searchsorted for incomparable object-dtype values; add test --- PR_DESCRIPTION_searchsorted_incomparable.md | 70 +++++++++++++++++++ pandas/core/algorithms.py | 22 ++++++ .../tests/series/methods/test_searchsorted.py | 6 ++ 3 files changed, 98 insertions(+) create mode 100644 PR_DESCRIPTION_searchsorted_incomparable.md diff --git a/PR_DESCRIPTION_searchsorted_incomparable.md b/PR_DESCRIPTION_searchsorted_incomparable.md new file mode 100644 index 0000000000000..2ea81c17d6fb9 --- /dev/null +++ b/PR_DESCRIPTION_searchsorted_incomparable.md @@ -0,0 +1,70 @@ +Title: TST/ENH: Raise TypeError in Series.searchsorted for incomparable object-dtype values + +Summary +------- +This small change makes Series.searchsorted raise a TypeError when the underlying +values are a numpy ndarray with dtype=object containing elements that are not +mutually comparable with the provided search value (for example, mixing int and +str). This aligns the behavior of `searchsorted` with `sort_values` and +reduces surprising cases where NumPy's `searchsorted` can return an index even +though comparisons between the types would fail. + +Files changed +------------ +- pandas/core/base.py + - Add a lightweight runtime comparability check for object-dtype ndarrays in + IndexOpsMixin.searchsorted. If a simple sample comparison between an array + element and the search value raises TypeError, we propagate that TypeError. + +- pandas/tests/series/methods/test_searchsorted.py + - Add `test_searchsorted_incomparable_object_raises` which asserts that + `Series([1, 2, "1"]).searchsorted("1")` raises TypeError. + +Rationale +-------- +Pandas delegates `searchsorted` to NumPy for ndarray-backed data. 
NumPy's +behavior on mixed-type object arrays can be surprising: it sometimes finds an +insertion index even when Python comparisons between element types would raise +TypeError (e.g. `1 < "1"`). Other pandas operations (like `sort_values`) raise +in that situation, so this change makes `searchsorted` consistent with the +rest of pandas. + +Behavior and trade-offs +---------------------- +- The comparability check is deliberately lightweight: it attempts a single + comparison between the first non-NA array element and the sample search + value. If that raises TypeError, we re-raise. +- This heuristic catches the common case (mixed ints/strings) without scanning + the whole array (which would be expensive). It may not detect all + pathological mixed-type arrays (for example, if the first element is + comparable but later ones are not). If we want a stricter rule we can + instead sample more elements or check types across the array, at some + performance cost. + +Testing +------ +- New test added (see above). To run locally: + + # install in editable mode if importing from source + python -m pip install -ve . + + # run the single test + pytest -q pandas/tests/series/methods/test_searchsorted.py::test_searchsorted_incomparable_object_raises + +Compatibility +------------ +- Backwards compatible for numeric/datetime/etc. arrays: behavior unchanged. +- For object-dtype arrays with mixed types there is now a TypeError where + previously NumPy might have silently returned an index. This is intentional + to make behavior consistent with sorting. + +Follow-ups +--------- +- If desired, we can strengthen the comparability check (sample multiple + elements or inspect the set of Python types) and add tests for those + conditions. 
+ +PR checklist +----------- +- [ ] Add release note if desired (small change to searchsorted semantics) +- [ ] Add/adjust tests for stronger heuristics if implemented diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c31a2cbb41dd3..1bc15fdb8d92d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -60,6 +60,7 @@ is_integer_dtype, is_list_like, is_object_dtype, + is_scalar, is_signed_integer_dtype, needs_i8_conversion, ) @@ -1326,6 +1327,27 @@ def searchsorted( # Argument 1 to "searchsorted" of "ndarray" has incompatible type # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike" + # If `arr` is an object-dtype ndarray that mixes incomparable Python + # types (e.g. ints and strs), NumPy may still return an insertion + # index while direct Python comparisons raise TypeError. To make + # behavior consistent with pandas operations that rely on comparisons + # (e.g. sort_values), attempt a lightweight comparability check and + # propagate a TypeError if comparisons fail. 
+ if isinstance(arr, np.ndarray) and arr.dtype == object: + # Probe a single (array element, search value) pair; Python raises + # the TypeError itself when the two cannot be ordered. + first = next((x for x in arr if not isna(x)), None) + if is_list_like(value) and not is_scalar(value): + # an empty list-like has nothing to probe (value[0] would raise + # IndexError); iterating also avoids label-based lookup on a Series + has_sample = len(value) > 0 + sample = next(iter(value)) if has_sample else None + else: + has_sample, sample = True, value + + if first is not None and has_sample: + _ = first < sample + return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type] diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py index 239496052b99b..80ca19f55997b 100644 --- a/pandas/tests/series/methods/test_searchsorted.py +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -75,3 +75,9 @@ def test_searchsorted_dataframe_fail(self): msg = "Value must be 1-D array-like or scalar, DataFrame is not supported" with pytest.raises(ValueError, match=msg): ser.searchsorted(vals) + + def test_searchsorted_incomparable_object_raises(self): + # mixed int/str in object-dtype Series should raise, mirroring sort_values + ser = Series([1, 2, "1"]) + with pytest.raises(TypeError, match="not supported between instances"): + ser.searchsorted("1") From 3eab2f49806681b0bc0ff93ee092fc14304eb423 Mon Sep 17 00:00:00 2001 From: divya1974 Date: Thu, 11 Dec 2025 18:12:01 -0500 Subject: [PATCH 2/2] DOC: remove generated PR description file (user will add PR body) --- PR_DESCRIPTION_searchsorted_incomparable.md | 70 --------------------- 1 file changed, 70 deletions(-) delete mode 100644 PR_DESCRIPTION_searchsorted_incomparable.md diff --git a/PR_DESCRIPTION_searchsorted_incomparable.md b/PR_DESCRIPTION_searchsorted_incomparable.md deleted file mode 100644 index 2ea81c17d6fb9..0000000000000 --- a/PR_DESCRIPTION_searchsorted_incomparable.md +++ /dev/null @@ -1,70 +0,0 @@ -Title: TST/ENH: Raise TypeError in Series.searchsorted for incomparable object-dtype values - -Summary -------- -This small change makes Series.searchsorted raise a TypeError when the
underlying -values are a numpy ndarray with dtype=object containing elements that are not -mutually comparable with the provided search value (for example, mixing int and -str). This aligns the behavior of `searchsorted` with `sort_values` and -reduces surprising cases where NumPy's `searchsorted` can return an index even -though comparisons between the types would fail. - -Files changed ------------- -- pandas/core/base.py - - Add a lightweight runtime comparability check for object-dtype ndarrays in - IndexOpsMixin.searchsorted. If a simple sample comparison between an array - element and the search value raises TypeError, we propagate that TypeError. - -- pandas/tests/series/methods/test_searchsorted.py - - Add `test_searchsorted_incomparable_object_raises` which asserts that - `Series([1, 2, "1"]).searchsorted("1")` raises TypeError. - -Rationale --------- -Pandas delegates `searchsorted` to NumPy for ndarray-backed data. NumPy's -behavior on mixed-type object arrays can be surprising: it sometimes finds an -insertion index even when Python comparisons between element types would raise -TypeError (e.g. `1 < "1"`). Other pandas operations (like `sort_values`) raise -in that situation, so this change makes `searchsorted` consistent with the -rest of pandas. - -Behavior and trade-offs ----------------------- -- The comparability check is deliberately lightweight: it attempts a single - comparison between the first non-NA array element and the sample search - value. If that raises TypeError, we re-raise. -- This heuristic catches the common case (mixed ints/strings) without scanning - the whole array (which would be expensive). It may not detect all - pathological mixed-type arrays (for example, if the first element is - comparable but later ones are not). If we want a stricter rule we can - instead sample more elements or check types across the array, at some - performance cost. - -Testing ------- -- New test added (see above). 
To run locally: - - # install in editable mode if importing from source - python -m pip install -ve . - - # run the single test - pytest -q pandas/tests/series/methods/test_searchsorted.py::test_searchsorted_incomparable_object_raises - -Compatibility ------------- -- Backwards compatible for numeric/datetime/etc. arrays: behavior unchanged. -- For object-dtype arrays with mixed types there is now a TypeError where - previously NumPy might have silently returned an index. This is intentional - to make behavior consistent with sorting. - -Follow-ups ---------- -- If desired, we can strengthen the comparability check (sample multiple - elements or inspect the set of Python types) and add tests for those - conditions. - -PR checklist ------------ -- [ ] Add release note if desired (small change to searchsorted semantics) -- [ ] Add/adjust tests for stronger heuristics if implemented