Skip to content

Commit

Permalink
BUG: code to address #742, DataFrame.rank
Browse files Browse the repository at this point in the history
  • Loading branch information
adamklein authored and wesm committed Feb 5, 2012
1 parent 475fcd2 commit b4b3583
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 1 deletion.
47 changes: 46 additions & 1 deletion pandas/core/frame.py
Expand Up @@ -3496,6 +3496,16 @@ def _get_numeric_columns(self):

return cols

def _get_nonnumeric_columns(self):
    """
    Return the labels of the columns whose backing block is an ObjectBlock

    Returns
    -------
    cols : list
        Column labels, in the order they appear in self.columns
    """
    from pandas.core.internals import ObjectBlock

    mgr = self._data
    # block_id_vector is walked in step with self.columns; each entry is
    # used as the position in mgr.blocks of the block holding that column
    return [label
            for label, blk_id in zip(self.columns, mgr.block_id_vector)
            if isinstance(mgr.blocks[blk_id], ObjectBlock)]

def _get_numeric_data(self):
if self._is_mixed_type:
num_data = self._data.get_numeric_data()
Expand All @@ -3506,6 +3516,15 @@ def _get_numeric_data(self):
else:
return self.ix[:, []]

def _get_nonnumeric_data(self):
if self._is_mixed_type:
return self.ix[:, self._get_nonnumeric_columns()]
else:
if self.values.dtype == np.object_:
return self
else:
return self.ix[:, []]

def quantile(self, q=0.5, axis=0):
"""
Return values at the given quantile over requested axis, a la
Expand Down Expand Up @@ -3588,7 +3607,33 @@ def rank(self, axis=0):
"""
data = self._get_numeric_data()
ranks = lib.rank_2d_float64(data.values.astype('f8'), axis=axis)
return DataFrame(ranks, index=data.index, columns=data.columns)
df = DataFrame(ranks, index=data.index, columns=data.columns)

odata = self._get_nonnumeric_data()
if len(odata):
if axis == 0:
odata = odata.T
df = df.T

for col in odata.columns:
try:
ranked = lib.rank_1d_object(odata[col])
if len(df[col]) == 0:
df = df.reindex(self.T.index)
df[col] = ranked
except Exception:
continue

return df.T
else:
for col in odata.columns:
try:
ranked = lib.rank_1d_object(odata[col])
df[col] = ranked
except Exception:
continue
return df
return df

#----------------------------------------------------------------------
# Plotting
Expand Down
42 changes: 42 additions & 0 deletions pandas/src/stats.pyx
Expand Up @@ -132,3 +132,45 @@ def rank_1d_generic(object in_arr):
ranks[argsorted[j]] = sum_ranks / dups
sum_ranks = dups = 0
return ranks

def rank_1d_object(object in_arr):
    """
    NaN-friendly version of scipy.stats.rankdata for object arrays

    Parameters
    ----------
    in_arr : array-like
        Values to rank. They are converted with np.asarray and copied, so
        the caller's data is not modified.

    Returns
    -------
    ranks : ndarray of float64
        1-based ranks among the non-null entries; tied values share the
        average of the ranks they span. Null entries are set to the
        module-level ``nan``.

    Notes
    -----
    Null entries are replaced by an np.inf sentinel before sorting, so a
    genuine np.inf in the input is treated as null too.
    """
    cdef:
        Py_ssize_t i, j, n, dups = 0, count = 0
        ndarray[float64_t] ranks
        ndarray sorted_data, values
        ndarray[int64_t] argsorted
        object val, nan_value
        float64_t sum_ranks = 0

    values = np.asarray(in_arr).copy()

    # sentinel standing in for null entries
    nan_value = np.inf

    mask = isnullobj(values.astype('O'))
    np.putmask(values, mask, nan_value)

    n = len(values)
    ranks = np.empty(n, dtype='f8')

    # py2.5/win32 hack, can't pass i8
    _as = values.argsort()
    sorted_data = values.take(_as)
    argsorted = _as.astype('i8')

    for i in range(n):
        val = util.get_value_at(sorted_data, i)
        if val == nan_value:
            # Nulls must not feed the tie accumulators: the sentinel is not
            # guaranteed to sort after every value (its ordering against
            # non-numeric objects depends on the interpreter), and a null
            # counted here would be averaged into, and overwritten by, the
            # next tie group.
            ranks[argsorted[i]] = nan
            continue

        # rank position among the non-null values seen so far
        count += 1
        sum_ranks += count
        dups += 1
        if (i == n - 1) or util.get_value_at(sorted_data, i + 1) != val:
            # end of a run of equal values: hand out the averaged rank
            for j in range(i - dups + 1, i + 1):
                ranks[argsorted[j]] = sum_ranks / dups
            sum_ranks = dups = 0
    return ranks
10 changes: 10 additions & 0 deletions pandas/tests/test_frame.py
Expand Up @@ -4165,6 +4165,16 @@ def test_rank(self):
assert_almost_equal(ranks0.values, exp0)
assert_almost_equal(ranks1.values, exp1)

def test_rank2(self):
    """
    Rank a frame of strings along both axes.

    axis=0 ranks the values within each column and axis=1 ranks the
    values within each row; tied values share the average rank.
    """
    df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']])

    # axis=1: every row is ranked on its own and has no ties
    result = df.rank(1)
    expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
    assert_frame_equal(result, expected)

    # axis=0: every column is ranked on its own; the two 'c' tie at 1.5
    result = df.rank(0)
    expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
    assert_frame_equal(result, expected)

def test_describe(self):
desc = self.tsframe.describe()
desc = self.mixed_frame.describe()
Expand Down

0 comments on commit b4b3583

Please sign in to comment.