pandas-dev · mroeschke · Jun 24, 2022 · May 27, 2022 · May 27, 2022 · May 27, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -675,6 +675,7 @@ Other Deprecations
 - Deprecated the ``closed`` argument in :class:`IntervalArray` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
 - Deprecated the ``closed`` argument in :class:`intervaltree` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
 - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
+- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.performance:

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -80,6 +80,7 @@
     na_value_for_dtype,
 )
 
+from pandas.core import common as com
 from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
     array as pd_array,
@@ -580,7 +581,8 @@ def factorize_array(
 def factorize(
     values,
     sort: bool = False,
-    na_sentinel: int | None = -1,
+    na_sentinel: int | None | lib.NoDefault = lib.no_default,
+    use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     size_hint: int | None = None,
 ) -> tuple[np.ndarray, np.ndarray | Index]:
     """
@@ -598,7 +600,19 @@ def factorize(
         Value to mark "not found". If None, will not drop the NaN
         from the uniques of the values.
 
+        .. deprecated:: 1.5.0
+            The na_sentinel argument is deprecated and
+            will be removed in a future version of pandas. Specify use_na_sentinel as
-            The na_sentinel argument is deprecated and
-            will be removed in a future version of pandas. Specify use_na_sentinel as
+            The `na_sentinel` argument is deprecated and
+            will be removed in a future version of pandas. Specify `use_na_sentinel` as
-            The na_sentinel argument is deprecated and
-            will be removed in a future version of pandas. Specify use_na_sentinel as
+            The `na_sentinel` argument is deprecated and
+            will be removed in a future version of pandas. Specify `use_na_sentinel` as
+            either True or False.
+
         .. versionchanged:: 1.1.2
+
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and will not drop the
+        NaN from the uniques of the values.
+
+        .. versionadded:: 1.5.0
     {size_hint}\
 
     Returns
@@ -646,8 +660,8 @@ def factorize(
     >>> uniques
     array(['a', 'b', 'c'], dtype=object)
 
-    Missing values are indicated in `codes` with `na_sentinel`
-    (``-1`` by default). Note that missing values are never
+    When ``use_na_sentinel=True`` (the default), missing values are indicated in
+    the `codes` with the sentinel value ``-1`` and missing values are not
     included in `uniques`.
 
     >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -682,16 +696,16 @@ def factorize(
     Index(['a', 'c'], dtype='object')
 
     If NaN is in the values, and we want to include NaN in the uniques of the
-    values, it can be achieved by setting ``na_sentinel=None``.
+    values, it can be achieved by setting ``use_na_sentinel=False``.
 
     >>> values = np.array([1, 2, 1, np.nan])
-    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
+    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
     >>> codes
     array([ 0,  1,  0, -1])
     >>> uniques
     array([1., 2.])
 
-    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
+    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
     >>> codes
     array([0, 1, 0, 2])
     >>> uniques
@@ -706,7 +720,13 @@ def factorize(
     # responsible only for factorization. All data coercion, sorting and boxing
     # should happen here.
 
+    # Can't always warn here because EA's factorize will warn too; warn for each
+    # path below.
+    passed_na_sentinel = na_sentinel
+    na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel, warn=False)
     if isinstance(values, ABCRangeIndex):
+        # Emit warning if appropriate
+        _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel)
         return values.factorize(sort=sort)
 
     values = _ensure_arraylike(values)
@@ -725,15 +745,30 @@ def factorize(
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
         and values.freq is not None
     ):
+        # Emit warning if appropriate
+        _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel)
         # The presence of 'freq' means we can fast-path sorting and know there
         #  aren't NAs
         codes, uniques = values.factorize(sort=sort)
         return _re_wrap_factorize(original, uniques, codes)
 
-    if not isinstance(values.dtype, np.dtype):
+    elif not isinstance(values.dtype, np.dtype):
         # i.e. ExtensionDtype
-        codes, uniques = values.factorize(na_sentinel=na_sentinel)
+        if passed_na_sentinel is lib.no_default:
+            # User didn't specify na_sentinel; avoid warning. Note EA path always
+            # uses a na_sentinel value.
+            codes, uniques = values.factorize(use_na_sentinel=True)
+        elif passed_na_sentinel is None:
+            # Emit the appropriate warning message for None
+            _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel)
+            codes, uniques = values.factorize(use_na_sentinel=True)
+        else:
+            # EA.factorize will warn
+            codes, uniques = values.factorize(na_sentinel=na_sentinel)
+
     else:
+        # Generate warning for na_sentile if appropriate
+        _ = com.resolve_na_sentinel(passed_na_sentinel, use_na_sentinel)
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
         codes, uniques = factorize_array(
             values, na_sentinel=na_sentinel, size_hint=size_hint

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._typing import (
     TakeIndexer,
     npt,
@@ -28,6 +29,7 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas.core import common as com
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import (
     check_array_indexer,
@@ -122,7 +124,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
             return type(self)(pc.drop_null(self._data))
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         encoded = self._data.dictionary_encode()
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -68,6 +68,7 @@
 
 from pandas.core import (
     arraylike,
+    common as com,
     missing,
     roperator,
 )
@@ -1002,7 +1003,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
         """
         return self.astype(object), np.nan
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
         """
         Encode the extension array as an enumerated type.
 
@@ -1011,6 +1016,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         na_sentinel : int, default -1
             Value to use in the `codes` array to indicate missing values.
 
+            .. deprecated:: 1.5.0
+                The na_sentinel argument is deprecated and
+                will be removed in a future version of pandas. Specify use_na_sentinel
+                as either True or False.
+
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and will not drop the
+            NaN from the uniques of the values.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         codes : ndarray
@@ -1041,6 +1058,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         #    original ExtensionArray.
         # 2. ExtensionArray.factorize.
         #    Complete control over factorization.
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -1899,7 +1899,12 @@ def _with_freq(self, freq):
 
     # --------------------------------------------------------------
 
-    def factorize(self, na_sentinel=-1, sort: bool = False):
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+        sort: bool = False,
+    ):
         if self.freq is not None:
             # We must be unique, so can short-circuit (and retain freq)
             codes = np.arange(len(self), dtype=np.intp)
@@ -1909,7 +1914,9 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
                 uniques = uniques[::-1]
             return codes, uniques
         # FIXME: shouldn't get here; we are ignoring sort
-        return super().factorize(na_sentinel=na_sentinel)
+        return super().factorize(
+            na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
 
 
 # -------------------------------------------------------------------

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -59,6 +59,7 @@
 from pandas.core import (
     algorithms as algos,
     arraylike,
+    common as com,
     missing,
     nanops,
     ops,
@@ -869,7 +870,16 @@ def searchsorted(
         return self._data.searchsorted(value, side=side, sorter=sorter)
 
     @doc(ExtensionArray.factorize)
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, ExtensionArray]:
+        resolved_na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
+        if resolved_na_sentinel is None:
+            raise NotImplementedError("Encoding NaN values is not yet implemented")
+        else:
+            na_sentinel = resolved_na_sentinel
         arr = self._data
         mask = self._mask
 

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -848,13 +848,19 @@ def _values_for_factorize(self):
         # Still override this for hash_pandas_object
         return np.asarray(self), self.fill_value
 
-    def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
+    def factorize(
+        self,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ) -> tuple[np.ndarray, SparseArray]:
         # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
         # The sparsity on this is backwards from what Sparse would want. Want
         # ExtensionArray.factorize -> Tuple[EA, EA]
         # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
+        codes, uniques = algos.factorize(
+            np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
         uniques_sp = SparseArray(uniques, dtype=self.dtype)
         return codes, uniques_sp
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1136,8 +1136,15 @@ def _memory_usage(self, deep: bool = False) -> int:
             """
         ),
     )
-    def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
-        return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
+    def factorize(
+        self,
+        sort: bool = False,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
+    ):
+        return algorithms.factorize(
+            self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
+        )
 
     _shared_docs[
         "searchsorted"

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -704,3 +704,57 @@ def deprecate_numeric_only_default(cls: type, name: str, deprecate_none: bool =
     )
 
     warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
+
+
+def resolve_na_sentinel(
+    na_sentinel: int | None | lib.NoDefault,
+    use_na_sentinel: bool | lib.NoDefault,
+    warn: bool = True,
+) -> int | None:
+    """
+    Determine value of na_sentinel for factorize methods.
+
+    See GH#46910 for details on the deprecation.
+
+    Parameters
+    ----------
+    na_sentinel : int, None, or lib.no_default
+        Value passed to the method.
+    use_na_sentinel : bool or lib.no_default
+        Value passed to the method.
+    warn : bool, default True
+        Whether to emit a warning if a deprecated use is detected.
+
+    Returns
+    -------
+    Resolved value of na_sentinel.
+    """
+    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
+        raise ValueError(
+            "Cannot specify both na_sentinel and use_na_sentile; "
+            f"got na_sentinel={na_sentinel} and use_na_sentinel={use_na_sentinel}"
+        )
+    if na_sentinel is lib.no_default:
+        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
+    else:
+        if warn:
+            if na_sentinel is None:
+                msg = (
+                    "Specifying na_sentinel=None is deprecated, specify "
+                    "use_na_sentinel=False instead."
+                )
+            elif na_sentinel == -1:
+                msg = (
+                    "Specifying na_sentinel=-1 is deprecated, specify "
+                    "use_na_sentinel=True instead."
+                )
+            else:
+                msg = (
+                    "Specifying the specific value to use for na_sentinel is "
+                    "deprecated and will be removed in a future version of pandas. "
+                    "Specify na_sentinel=True to use the sentinel value -1, and "
+                    "na_sentinel=False to encode NaN values."
+                )
+            warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
+        result = na_sentinel
+    return result
@@ -681,13 +681,9 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             codes = self.grouping_vector.codes_info
             uniques = self.grouping_vector.result_index._values
         else:
-            # GH35667, replace dropna=False with na_sentinel=None
-            if not self._dropna:
-                na_sentinel = None
-            else:
-                na_sentinel = -1
+            # GH35667, replace dropna=False with use_na_sentinel=False
             codes, uniques = algorithms.factorize(
-                self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel
+                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
             )
         return codes, uniques
 

diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
@@ -510,8 +510,13 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
         return result
 
     def factorize(
-        self, sort: bool = False, na_sentinel: int | None = -1
+        self,
+        sort: bool = False,
+        na_sentinel: int | lib.NoDefault = lib.no_default,
+        use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[npt.NDArray[np.intp], RangeIndex]:
+        # resolve to emit warning if appropriate
+        _ = com.resolve_na_sentinel(na_sentinel, use_na_sentinel)
         codes = np.arange(len(self), dtype=np.intp)
         uniques = self
         if sort and self.step < 0: