pandas-dev · mroeschke · Jun 24, 2022 · May 27, 2022 · May 27, 2022 · May 27, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -675,6 +675,7 @@ Other Deprecations
 - Deprecated the ``closed`` argument in :class:`IntervalArray` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
 - Deprecated the ``closed`` argument in :class:`intervaltree` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
 - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
+- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.performance:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -14,7 +14,7 @@
     cast,
     final,
 )
-from warnings import warn
+import warnings
 
 import numpy as np
 
@@ -80,6 +80,7 @@
     na_value_for_dtype,
 )
 
+from pandas.core import common as com
 from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
     array as pd_array,
@@ -580,7 +581,8 @@ def factorize_array(
 def factorize(
     values,
     sort: bool = False,
-    na_sentinel: int | None = -1,
+    na_sentinel: int | None | lib.NoDefault = lib.no_default,
+    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     size_hint: int | None = None,
 ) -> tuple[np.ndarray, np.ndarray | Index]:
     """
@@ -598,7 +600,19 @@ def factorize(
         Value to mark "not found". If None, will not drop the NaN
         from the uniques of the values.
 
+        .. deprecated:: 1.5.0
+            The na_sentinel argument is deprecated and
+            will be removed in a future version of pandas. Specify use_na_sentinel as
-            The na_sentinel argument is deprecated and
-            will be removed in a future version of pandas. Specify use_na_sentinel as
+            The `na_sentinel` argument is deprecated and
+            will be removed in a future version of pandas. Specify `use_na_sentinel` as
-            The na_sentinel argument is deprecated and
-            will be removed in a future version of pandas. Specify use_na_sentinel as
+            The `na_sentinel` argument is deprecated and
+            will be removed in a future version of pandas. Specify `use_na_sentinel` as
+            either True or False.
+
         .. versionchanged:: 1.1.2
+
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and will not drop the
+        NaN from the uniques of the values.
+
+        .. versionadded:: 1.5.0
     {size_hint}\
 
     Returns
@@ -646,8 +660,8 @@ def factorize(
     >>> uniques
     array(['a', 'b', 'c'], dtype=object)
 
-    Missing values are indicated in `codes` with `na_sentinel`
-    (``-1`` by default). Note that missing values are never
+    When ``use_na_sentinel=True`` (the default), missing values are indicated in
+    the `codes` with the sentinel value ``-1`` and missing values are not
     included in `uniques`.
 
     >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -682,16 +696,16 @@ def factorize(
     Index(['a', 'c'], dtype='object')
 
     If NaN is in the values, and we want to include NaN in the uniques of the
-    values, it can be achieved by setting ``na_sentinel=None``.
+    values, it can be achieved by setting ``use_na_sentinel=False``.
 
     >>> values = np.array([1, 2, 1, np.nan])
-    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
+    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
     >>> codes
     array([ 0,  1,  0, -1])
     >>> uniques
     array([1., 2.])
 
-    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
+    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
     >>> codes
     array([0, 1, 0, 2])
     >>> uniques
@@ -706,6 +720,7 @@ def factorize(
     # responsible only for factorization. All data coercion, sorting and boxing
     # should happen here.
 
+    na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
     if isinstance(values, ABCRangeIndex):
         return values.factorize(sort=sort)
 
@@ -730,9 +745,12 @@ def factorize(
         codes, uniques = values.factorize(sort=sort)
         return _re_wrap_factorize(original, uniques, codes)
 
-    if not isinstance(values.dtype, np.dtype):
-        # i.e. ExtensionDtype
-        codes, uniques = values.factorize(na_sentinel=na_sentinel)
+    elif not isinstance(values.dtype, np.dtype):
+        with warnings.catch_warnings():
+            # We've already warned above
+            warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
+            codes, uniques = values.factorize(na_sentinel=na_sentinel)
+
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
         codes, uniques = factorize_array(
@@ -950,7 +968,7 @@ def mode(
     try:
         npresult = np.sort(npresult)
     except TypeError as err:
-        warn(f"Unable to sort modes: {err}")
+        warnings.warn(f"Unable to sort modes: {err}")
 
     result = _reconstruct_data(npresult, original.dtype, original)
     return result
@@ -1564,7 +1582,7 @@ def diff(arr, n: int, axis: int = 0):
                 raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
             return op(arr, arr.shift(n))
         else:
-            warn(
+            warnings.warn(
                 "dtype lost in 'diff()'. In the future this will raise a "
                 "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                 FutureWarning,

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._typing import (
     TakeIndexer,
     npt,
@@ -28,6 +29,7 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas.core import common as com
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import (
     check_array_indexer,
@@ -122,7 +124,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
             return type(self)(pc.drop_null(self._data))
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         encoded = self._data.dictionary_encode()
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -8,6 +8,7 @@
 """
 from __future__ import annotations
 
+import inspect
 import operator
 from typing import (
     TYPE_CHECKING,
@@ -20,6 +21,7 @@
     cast,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -45,6 +47,7 @@
     cache_readonly,
     deprecate_nonkeyword_arguments,
 )
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_bool_kwarg,
     validate_fillna_kwargs,
@@ -68,6 +71,7 @@
 
 from pandas.core import (
     arraylike,
+    common as com,
     missing,
     roperator,
 )
@@ -456,6 +460,20 @@ def __ne__(self, other: Any) -> ArrayLike:  # type: ignore[override]
         """
         return ~(self == other)
 
+    def __init_subclass__(cls, **kwargs):
+        factorize = getattr(cls, "factorize")
+        if "use_na_sentinel" not in inspect.signature(factorize).parameters:
+            # See GH#46910 for details on the deprecation
+            name = cls.__name__
+            warnings.warn(
+                f"The na_sentinel argument of {name}.factorize is deprecated. "
+                f"In the future, pandas will use the use_na_sentinel argument instead. "
+                f"Add this argument to {name}.factorize to be compatible with future"
+                f"versions of pandas and silence this warning.",
+                DeprecationWarning,
+                stacklevel=find_stack_level(),
+            )
+
     def to_numpy(
         self,
         dtype: npt.DTypeLike | None = None,
@@ -1002,7 +1020,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
         """
         return self.astype(object), np.nan
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
         """
         Encode the extension array as an enumerated type.
 
@@ -1011,6 +1033,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         na_sentinel : int, default -1
             Value to use in the `codes` array to indicate missing values.
 
+            .. deprecated:: 1.5.0
+                The na_sentinel argument is deprecated and
+                will be removed in a future version of pandas. Specify use_na_sentinel
+                as either True or False.
+
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and will not drop the
+            NaN from the uniques of the values.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         codes : ndarray
@@ -1041,6 +1075,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         #    original ExtensionArray.
         # 2. ExtensionArray.factorize.
         #    Complete control over factorization.
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1899,7 +1899,12 @@ def _with_freq(self, freq):
 
     # --------------------------------------------------------------
 
-    def factorize(self, na_sentinel=-1, sort: bool = False):
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+        sort: bool = False,
+    ):
         if self.freq is not None:
             # We must be unique, so can short-circuit (and retain freq)
             codes = np.arange(len(self), dtype=np.intp)
@@ -1909,7 +1914,9 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
                 uniques = uniques[::-1]
             return codes, uniques
         # FIXME: shouldn't get here; we are ignoring sort
-        return super().factorize(na_sentinel=na_sentinel)
+        return super().factorize(
+            na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
 
 
 # -------------------------------------------------------------------

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -59,6 +59,7 @@
 from pandas.core import (
     algorithms as algos,
     arraylike,
+    common as com,
     missing,
     nanops,
     ops,
@@ -869,7 +870,16 @@ def searchsorted(
         return self._data.searchsorted(value, side=side, sorter=sorter)
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr = self._data
         mask = self._mask
 

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -848,13 +848,19 @@ def _values_for_factorize(self):
         # Still override this for hash_pandas_object
         return np.asarray(self), self.fill_value
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, SparseArray]:
         # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
         # The sparsity on this is backwards from what Sparse would want. Want
         # ExtensionArray.factorize -> Tuple[EA, EA]
         # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
+        codes, uniques = algos.factorize(
+            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1136,8 +1136,15 @@ def _memory_usage(self, deep: bool = False) -> int:
             """
         ),
     )
-    def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
-        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
+    def factorize(
+        self,
+        sort: bool = False,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ):
+        return algorithms.factorize(
+            self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
 
     _shared_docs[
         "searchsorted"