From 82136d2a7fe8f5a2c1177423f13312420cffe905 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sat, 25 Feb 2017 16:31:22 +0530 Subject: [PATCH 1/3] PERF: categorical rank GH#15498 --- pandas/core/algorithms.py | 8 +++++--- pandas/core/categorical.py | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b11927a80fb2e2..7f74067f7826b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,9 +974,6 @@ def _get_data_algo(values, func_map): f = None - if is_categorical_dtype(values): - values = values._values_for_rank() - if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -992,6 +989,11 @@ def _get_data_algo(values, func_map): elif is_unsigned_integer_dtype(values): f = func_map['uint64'] values = _ensure_uint64(values) + + elif is_categorical_dtype(values): + f = func_map['float64'] + values = values._values_for_rank() + else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index b88a6b171b316c..3a83a485c8c3bb 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1416,14 +1416,17 @@ def _values_for_rank(self): numpy array """ + from pandas import Series if self.ordered: values = self.codes mask = values == -1 + values = values.astype('float64') if mask.any(): - values = values.astype('float64') values[mask] = np.nan else: - values = np.array(self) + values = np.array( + self.rename_categories(Series(self.categories).rank()) + ) return values def order(self, inplace=False, ascending=True, na_position='last'): From 41e7b27ce9b4d62fac5a0c9e1f97a0f857e58aed Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Mon, 27 Feb 2017 19:06:55 +0530 Subject: [PATCH 2/3] PERF: categorical rank GH#15498 no need to rename categories where they are already ordered --- pandas/core/algorithms.py | 7 +++---- pandas/core/categorical.py | 4 +++- pandas/tests/series/test_analytics.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7f74067f7826b2..55d404f05dd1d5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -974,6 +974,9 @@ def _get_data_algo(values, func_map): f = None + if is_categorical_dtype(values): + values = values._values_for_rank() + if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) @@ -990,10 +993,6 @@ def _get_data_algo(values, func_map): f = func_map['uint64'] values = _ensure_uint64(values) - elif is_categorical_dtype(values): - f = func_map['float64'] - values = values._values_for_rank() - else: values = _ensure_object(values) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 3a83a485c8c3bb..a8ccae0d374cd1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1420,9 +1420,11 @@ def _values_for_rank(self): if self.ordered: values = self.codes mask = values == -1 - values = values.astype('float64') if mask.any(): + values = values.astype('float64') values[mask] = np.nan + elif self.categories.is_monotonic: + values = np.array(self) else: values = np.array( self.rename_categories(Series(self.categories).rank()) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b092e4f0847670..1733c515b272ca 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1083,6 +1083,16 @@ def test_rank_categorical(self): res = unordered.rank() assert_series_equal(res, exp_unordered) + unordered1 = pd.Series( + [1, 2, 3, 4, 5, 6], + ).astype('category').cat.set_categories( + [1, 2, 3, 4, 5, 6], + ordered=False + ) + exp_unordered1 = pd.Series([1., 2., 3., 4., 5., 6.]) + res1 = unordered1.rank() + assert_series_equal(res1, exp_unordered1) + # Test na_option for rank data na_ser = pd.Series( ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] From f7eab32aed9d59547abba263cf084d29627e8290 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Mon, 27 Feb 2017 19:17:11 +0530 Subject: [PATCH 3/3] PERF: categorical rank GH#15498 check for numeric instead of monotonic --- pandas/core/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index a8ccae0d374cd1..2326cc3c78b72d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1423,7 +1423,7 @@ def _values_for_rank(self): if mask.any(): values = values.astype('float64') values[mask] = np.nan - elif self.categories.is_monotonic: + elif self.categories.is_numeric(): values = np.array(self) else: values = np.array(