Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

ENH: rank na_options top and bottom #1508 #2159

Closed
wants to merge 1 commit into
from
Jump to file or symbol
Failed to load files and symbols.
+91 −19
Split
@@ -191,11 +191,12 @@ def rank(values, axis=0, method='average', na_option='keep',
"""
if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
- ranks = f(values, ties_method=method, ascending=ascending)
+ ranks = f(values, ties_method=method, ascending=ascending,
+ na_option=na_option)
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
ranks = f(values, axis=axis, ties_method=method,
- ascending=ascending)
+ ascending=ascending, na_option=na_option)
return ranks
View
@@ -4704,8 +4704,10 @@ def rank(self, axis=0, numeric_only=None, method='average',
min: lowest rank in group
max: highest rank in group
first: ranks assigned in order they appear in the array
- na_option : {'keep'}
+ na_option : {'keep', 'top', 'bottom'}
keep: leave NA values where they are
+ top: NA values receive the smallest ranks (ascending or descending)
+ bottom: NA values receive the largest ranks (ascending or descending)
ascending : boolean, default True
False for ranks by high (1) to low (N)
@@ -4716,7 +4718,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
if numeric_only is None:
try:
ranks = algos.rank(self.values, axis=axis, method=method,
- ascending=ascending)
+ ascending=ascending, na_option=na_option)
return DataFrame(ranks, index=self.index, columns=self.columns)
except TypeError:
numeric_only = True
@@ -4726,7 +4728,7 @@ def rank(self, axis=0, numeric_only=None, method='average',
else:
data = self
ranks = algos.rank(data.values, axis=axis, method=method,
- ascending=ascending)
+ ascending=ascending, na_option=na_option)
return DataFrame(ranks, index=data.index, columns=data.columns)
def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
View
@@ -70,7 +70,8 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
return result
-def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
+def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
+ na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -86,7 +87,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
values = np.asarray(in_arr).copy()
- if ascending:
+ if ascending ^ (na_option == 'top'):
@wesm

wesm Nov 2, 2012

Owner

nice w/ the xor

nan_value = np.inf
else:
nan_value = -np.inf
@@ -115,7 +116,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
sum_ranks += i + 1
dups += 1
val = sorted_data[i]
- if val == nan_value:
+ if (val == nan_value) and (na_option == 'keep'):
@wesm

wesm Nov 2, 2012

Owner

This will dog performance pretty badly. I'll merge and then tweak this.

@wesm

wesm Nov 2, 2012

Owner

Actually, it's pretty minor (only about a 10% penalty on a one-million-element Series that's 50% NA).

ranks[argsorted[i]] = nan
continue
if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
@@ -138,7 +139,8 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True):
return ranks
-def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
+def rank_1d_int64(object in_arr, ties_method='average', ascending=True,
+ na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -198,7 +200,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True):
def rank_2d_float64(object in_arr, axis=0, ties_method='average',
- ascending=True):
+ ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -219,7 +221,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
else:
values = in_arr.copy()
- if ascending:
+ if ascending ^ (na_option == 'top'):
nan_value = np.inf
else:
nan_value = -np.inf
@@ -249,7 +251,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
sum_ranks += j + 1
dups += 1
val = values[i, j]
- if val == nan_value:
+ if val == nan_value and na_option == 'keep':
ranks[i, argsorted[i, j]] = nan
continue
if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
@@ -277,7 +279,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
def rank_2d_int64(object in_arr, axis=0, ties_method='average',
- ascending=True):
+ ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -345,7 +347,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average',
def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
- ascending=True):
+ ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -365,7 +367,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
if values.dtype != np.object_:
values = values.astype('O')
- if ascending:
+ if ascending ^ (na_option == 'top'):
# always greater than everything
nan_value = Infinity()
else:
@@ -401,7 +403,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average',
sum_ranks += i + 1
dups += 1
val = util.get_value_at(sorted_data, i)
- if val is nan_value:
+ if val is nan_value and na_option=='keep':
ranks[argsorted[i]] = nan
continue
if (i == n - 1 or
@@ -450,7 +452,7 @@ class NegInfinity(object):
__cmp__ = _return_true
def rank_2d_generic(object in_arr, axis=0, ties_method='average',
- ascending=True):
+ ascending=True, na_option='keep'):
"""
Fast NaN-friendly version of scipy.stats.rankdata
"""
@@ -475,7 +477,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
if values.dtype != np.object_:
values = values.astype('O')
- if ascending:
+ if ascending ^ (na_option == 'top'):
# always greater than everything
nan_value = Infinity()
else:
@@ -510,7 +512,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average',
dups = sum_ranks = infs = 0
for j in range(k):
val = values[i, j]
- if val is nan_value:
+ if val is nan_value and na_option == 'keep':
ranks[i, argsorted[i, j]] = nan
infs += 1
continue
View
@@ -6444,6 +6444,73 @@ def test_rank2(self):
expected = self.mixed_frame.rank(1, numeric_only=True)
assert_frame_equal(result, expected)
+ def test_rank_na_option(self):
+ from pandas.compat.scipy import rankdata
+
+ self.frame['A'][::2] = np.nan
+ self.frame['B'][::3] = np.nan
+ self.frame['C'][::4] = np.nan
+ self.frame['D'][::5] = np.nan
+
+ #bottom
+ ranks0 = self.frame.rank(na_option='bottom')
+ ranks1 = self.frame.rank(1, na_option='bottom')
+
+ fvals = self.frame.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, fvals)
+ exp1 = np.apply_along_axis(rankdata, 1, fvals)
+
+ assert_almost_equal(ranks0.values, exp0)
+ assert_almost_equal(ranks1.values, exp1)
+
+ #top
+ ranks0 = self.frame.rank(na_option='top')
+ ranks1 = self.frame.rank(1, na_option='top')
+
+ fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+ fval1 = self.frame.T
+ fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+ fval1 = fval1.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, fval0)
+ exp1 = np.apply_along_axis(rankdata, 1, fval1)
+
+ assert_almost_equal(ranks0.values, exp0)
+ assert_almost_equal(ranks1.values, exp1)
+
+ #descending
+
+ #top (NaNs take the smallest ranks: filled with +inf, then negated below)
+ ranks0 = self.frame.rank(na_option='top', ascending=False)
+ ranks1 = self.frame.rank(1, na_option='top', ascending=False)
+
+ fvals = self.frame.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, -fvals)
+ exp1 = np.apply_along_axis(rankdata, 1, -fvals)
+
+ assert_almost_equal(ranks0.values, exp0)
+ assert_almost_equal(ranks1.values, exp1)
+
+ #descending
+
+ #bottom (NaNs take the largest ranks: filled with min - 1, then negated below)
+ ranks0 = self.frame.rank(na_option='bottom', ascending=False)
+ ranks1 = self.frame.rank(1, na_option='bottom', ascending=False)
+
+ fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values
+ fval1 = self.frame.T
+ fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
+ fval1 = fval1.fillna(np.inf).values
+
+ exp0 = np.apply_along_axis(rankdata, 0, -fval0)
+ exp1 = np.apply_along_axis(rankdata, 1, -fval1)
+
+ assert_almost_equal(ranks0.values, exp0)
+ assert_almost_equal(ranks1.values, exp1)
+
+
def test_describe(self):
desc = self.tsframe.describe()
desc = self.mixed_frame.describe()