Skip to content

Commit

Permalink
ENH: use faster Cython code for DataFrame.count, GH #341
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Nov 7, 2011
1 parent 79a8609 commit 9100b1d
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 68 deletions.
72 changes: 21 additions & 51 deletions pandas/core/frame.py
Expand Up @@ -2435,52 +2435,26 @@ def count(self, axis=0, level=None, numeric_only=False):
return result

def _count_level(self, level, axis=0, numeric_only=False):
# TODO: deal with sortedness??
obj = self.sortlevel(level, axis=axis)
axis_index = obj._get_axis(axis)
y, _ = self._get_agg_data(axis, numeric_only=numeric_only)
mask = notnull(y)

level_index = axis_index.levels[level]
if numeric_only:
frame = self._get_numeric_data()
else:
frame = self

if len(self) == 0:
return DataFrame(np.zeros((len(level_index),
len(self.columns)), dtype=int),
index=level_index, columns=self.columns)
if axis == 1:
frame = frame.T

n = len(level_index)
locs = axis_index.labels[level].searchsorted(np.arange(n))
mask = notnull(frame.values)
level_index = frame.index.levels[level]
counts = lib.count_level_2d(mask, frame.index.labels[level],
len(level_index))

# WORKAROUND: reduceat fusses about the endpoints. should file ticket?
start = locs.searchsorted(0, side='right') - 1
end = locs.searchsorted(len(mask), side='left')
result = DataFrame(counts, index=level_index,
columns=frame.columns)

if axis == 0:
index = level_index
columns = self.columns
result = np.zeros((n, len(self.columns)), dtype=int)
out = result[start:end]
np.add.reduceat(mask, locs[start:end], axis=axis, out=out)
if axis == 1:
return result.T
else:
index = self.index
columns = level_index
result = np.zeros((len(self.index), n), dtype=int)
out = result[:, start:end]
np.add.reduceat(mask, locs[start:end], axis=axis, out=out)

# WORKAROUND: to see why, try this
# arr = np.ones((10, 4), dtype=bool)
# np.add.reduceat(arr, [0, 3, 3, 7, 9], axis=0)

# this stinks
if len(locs) > 1:
workaround_mask = locs[:-1] == locs[1:]
if axis == 0:
result[:-1][workaround_mask] = 0
else:
result[:, :-1][:, workaround_mask] = 0

return DataFrame(result, index=index, columns=columns)
return result

def sum(self, axis=0, numeric_only=True, skipna=True, level=None):
if level is not None:
Expand Down Expand Up @@ -2568,7 +2542,7 @@ def median(self, axis=0, skipna=True, level=None):
return self._agg_by_level('median', axis=axis, level=level,
skipna=skipna)

frame = self._get_numeric_frame()
frame = self._get_numeric_data()

if axis == 0:
values = frame.values.T
Expand Down Expand Up @@ -2598,7 +2572,7 @@ def mad(self, axis=0, skipna=True, level=None):
return self._agg_by_level('mad', axis=axis, level=level,
skipna=skipna)

frame = self._get_numeric_frame()
frame = self._get_numeric_data()

if axis == 0:
demeaned = frame - frame.mean(axis=0)
Expand Down Expand Up @@ -2665,12 +2639,6 @@ def skew(self, axis=0, skipna=True, level=None):
return Series(result, index=axis_labels)
_add_stat_doc(skew, 'unbiased skewness', 'skew')

# Return the frame that the median/mad reductions on this page operate on.
# When self._is_mixed_type is truthy, only the columns reported by
# self._get_numeric_columns() are kept; otherwise self is returned unchanged.
def _get_numeric_frame(self):
frame = self
if self._is_mixed_type:
# keep every row, restrict to the columns from _get_numeric_columns()
frame = self.ix[:, self._get_numeric_columns()]
return frame

def _agg_by_level(self, name, axis=0, level=0, skipna=True):
method = getattr(type(self), name)
applyf = lambda x: method(x, axis=axis, skipna=skipna)
Expand Down Expand Up @@ -2945,8 +2913,10 @@ def _write_to_buffer(self):
to_write = []

if len(frame.columns) == 0 or len(frame.index) == 0:
to_write.append('Empty %s\n' % type(self.frame).__name__)
to_write.append(repr(frame.index))
info_line = 'Empty %s\nColumns: %s\nIndex: %s'
to_write.append(info_line % (type(self.frame).__name__,
repr(frame.columns),
repr(frame.index)))
else:
# may include levels names also
str_index = self._get_formatted_index()
Expand Down
24 changes: 10 additions & 14 deletions pandas/core/series.py
Expand Up @@ -597,23 +597,19 @@ def count(self, level=None):
nobs : int or Series (if level specified)
"""
if level is not None:
return self._count_level(level)
mask = notnull(self.values)
level_index = self.index.levels[level]

return notnull(self.values).sum()

def _count_level(self, level):
# TODO: GENERALIZE CODE OVERLAP WITH DATAFRAME
mask = notnull(self.values)
level_index = self.index.levels[level]
if len(self) == 0:
return Series(0, index=level_index)

if len(self) == 0:
return Series(0, index=level_index)
# call cython function
max_bin = len(level_index)
counts = lib.count_level_1d(mask.view(np.uint8),
self.index.labels[level], max_bin)
return Series(counts, index=level_index)

# call cython function
max_bin = len(level_index)
counts = lib.count_level_1d(mask.view(np.uint8),
self.index.labels[level], max_bin)
return Series(counts, index=level_index)
return notnull(self.values).sum()

def value_counts(self):
"""
Expand Down
18 changes: 17 additions & 1 deletion pandas/src/groupby.pyx
Expand Up @@ -526,7 +526,7 @@ def _bucket_locs(index, buckets, inclusive=False):
return locs

def count_level_1d(ndarray[uint8_t, cast=True] mask,
ndarray[int32_t] labels, Py_ssize_t max_bin):
ndarray[int32_t] labels, Py_ssize_t max_bin):
cdef:
Py_ssize_t i, n
ndarray[int64_t] counts
Expand All @@ -541,6 +541,22 @@ def count_level_1d(ndarray[uint8_t, cast=True] mask,

return counts

# Count, for every (label, column) pair, how many rows of `mask` carrying
# that label have a truthy entry in that column.
#
# Parameters
# ----------
# mask : 2-D ndarray typed as uint8_t with cast=True; the DataFrame caller
#     shown on this page passes the result of notnull(frame.values)
# labels : 1-D int32 ndarray; labels[i] selects the output row that row i
#     of `mask` contributes to
# max_bin : number of rows allocated in the output array
#
# Returns
# -------
# counts : 2-D int64 ndarray of shape (max_bin, number of mask columns)
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
ndarray[int32_t] labels, Py_ssize_t max_bin):
cdef:
Py_ssize_t i, j, k, n
ndarray[int64_t, ndim=2] counts

# go through the Python-object view of the array to unpack its shape tuple
n, k = (<object> mask).shape
# one zero-initialised counter per (bin, column)
counts = np.zeros((max_bin, k), dtype='i8')

for i from 0 <= i < n:
for j from 0 <= j < k:
if mask[i, j]:
# NOTE(review): labels[i] is used as a row index with no range check
# here; assumes 0 <= labels[i] < max_bin -- TODO confirm with callers
counts[labels[i], j] += 1

return counts


'''
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/test_frame.py
Expand Up @@ -1351,8 +1351,9 @@ def test_to_string(self):
joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]])
recons = read_table(StringIO(joined), names=header, sep=' ')
assert_series_equal(recons['B'], biggie['B'])
assert_series_equal(np.round(recons['A'], 2),
np.round(biggie['A'], 2))
self.assertEqual(recons['A'].count(), biggie['A'].count())
self.assert_((np.abs(recons['A'].dropna() -
biggie['A'].dropna()) < 0.1).all())

# expected = ['B', 'A']
# self.assertEqual(header, expected)
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/test_multilevel.py
Expand Up @@ -294,6 +294,11 @@ def _check_counts(frame, axis=0):
result = frame.count(axis=axis, level=i)
expected = frame.groupby(axis=axis, level=i).count(axis=axis)

self.frame.ix[1, [1, 2]] = np.nan
self.frame.ix[7, [0, 1]] = np.nan
self.ymd.ix[1, [1, 2]] = np.nan
self.ymd.ix[7, [0, 1]] = np.nan

_check_counts(self.frame)
_check_counts(self.ymd)
_check_counts(self.frame.T, axis=1)
Expand Down

0 comments on commit 9100b1d

Please sign in to comment.