diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index be4b3f59bead4..456fedb272e18 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -816,16 +816,14 @@ def remove_unused_categories(self, inplace=False): set_categories """ cat = self if inplace else self.copy() - _used = sorted(np.unique(cat._codes)) - if _used[0] == -1: - _used = _used[1:] + idx, inv = np.unique(cat._codes, return_inverse=True) - new_categories = cat.categories.take(_ensure_platform_int(_used)) + if idx.size != 0 and idx[0] == -1: # na sentinel + idx, inv = idx[1:], inv - 1 + + cat._codes = inv + cat._categories = cat.categories.take(idx) - from pandas.core.index import _ensure_index - new_categories = _ensure_index(new_categories) - cat._codes = _get_codes_for_values(cat.__array__(), new_categories) - cat._categories = new_categories if not inplace: return cat diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 3da4ad62b45af..e98c98fdec8b3 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -850,6 +850,21 @@ def test_remove_unused_categories(self): self.assert_numpy_array_equal(res.categories, np.array(["a","b","c"])) self.assert_numpy_array_equal(c.categories, exp_categories_all) + val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] + cat = pd.Categorical(values=val, categories=list('ABCDEFG')) + out = cat.remove_unused_categories() + self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F']) + self.assert_numpy_array_equal(out.codes, [ 2, -1, 1, 0, 1, 2, -1]) + self.assertEqual(out.get_values().tolist(), val) + + alpha = list('abcdefghijklmnopqrstuvwxyz') + val = np.random.choice(alpha[::2], 10000).astype('object') + val[np.random.choice(len(val), 100)] = np.nan + + cat = pd.Categorical(values=val, categories=alpha) + out = cat.remove_unused_categories() + self.assertEqual(out.get_values().tolist(), val.tolist()) + def test_nan_handling(self): # Nans are represented as -1 in codes