From 394bb0d9d84fb6628d212c6a5ad0c38de9d06a7c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 24 Oct 2011 00:59:28 -0400 Subject: [PATCH] ENH: add Panel.take, implement set ops between MultiIndex and Index. plus test coverage --- RELEASE.rst | 2 + pandas/__init__.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 24 +++++++++- pandas/core/index.py | 80 +++++++++++++++------------------ pandas/core/internals.py | 29 +++++------- pandas/core/panel.py | 6 ++- pandas/core/reshape.py | 5 ++- pandas/core/series.py | 17 +++---- pandas/tests/test_index.py | 42 ++++++++++++++--- pandas/tests/test_multilevel.py | 8 ++++ pandas/tests/test_panel.py | 33 ++++++++++++-- 12 files changed, 161 insertions(+), 89 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index a10daa819e8d0..9e49899be125f 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -128,6 +128,7 @@ feedback on the library. - Added `pivot_table` convenience function to pandas namespace (GH #234) - Implemented `Panel.rename_axis` function (GH #243) - DataFrame will show index level names in console output + - Implemented `Panel.take` **Improvements to existing features** @@ -189,6 +190,7 @@ feedback on the library. issue GH #262 - Can pass list of tuples to `Series` (GH #270) - Can pass level name to `DataFrame.stack` + - Support set operations between MultiIndex and Index Thanks ------ diff --git a/pandas/__init__.py b/pandas/__init__.py index ae69b6f7a907f..fb7f14c522daa 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -8,7 +8,7 @@ try: import pandas._tseries as lib -except Exception, e: +except Exception, e: # pragma: no cover if 'No module named' in e.message: raise ImportError('C extensions not built: if you installed already ' 'verify that you are not importing from the source ' diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f1beca73304e..834e1f03ac468 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2308,7 +2308,7 @@ def count(self, axis=0, level=None, numeric_only=False): else: frame = self - result = frame.apply(Series.count, axis=axis) + result = DataFrame.apply(frame, Series.count, axis=axis) # what happens with empty DataFrame if isinstance(result, DataFrame): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8689542035107..f4ff2ab0936d5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -204,7 +204,7 @@ def sort_index(self, axis=0, ascending=True): def ix(self): raise NotImplementedError - def reindex(self, **kwds): + def reindex(self, *args, **kwds): raise NotImplementedError class NDFrame(PandasObject): @@ -486,3 +486,25 @@ def rename_axis(self, mapper, axis=0, copy=True): new_data = new_data.copy() return self._constructor(new_data) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + + Parameters + ---------- + indices : list / array of ints + axis : int, default 0 + + Returns + ------- + taken : type of caller + """ + if axis == 0: + labels = self._get_axis(axis) + new_items = labels.take(indices) + new_data = self._data.reindex_items(new_items) + else: + new_data = self._data.take(indices, axis=axis) + return self._constructor(new_data) + diff --git a/pandas/core/index.py b/pandas/core/index.py index ba46d4ddda129..914ac9fd6d543 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -39,6 +39,11 @@ class Index(np.ndarray): ---- An Index instance can **only** contain hashable objects """ + _map_indices = lib.map_indices_object + _is_monotonic = lib.is_monotonic_object + _groupby = lib.groupby_object + _arrmap = lib.arrmap_object + name = None def __new__(cls, data, dtype=None, copy=False, name=None): if isinstance(data, np.ndarray): @@ -67,6 +72,10 @@ def dtype(self): def nlevels(self): return 1 + @property + def _constructor(self): + return Index + def summary(self): if len(self) > 0: index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) @@ -82,15 +91,16 @@ def values(self): @cache_readonly def is_monotonic(self): - return lib.is_monotonic_object(self) + return self._is_monotonic(self) _indexMap = None _integrity = False + @property def indexMap(self): "{label -> location}" if self._indexMap is None: - self._indexMap = lib.map_indices_object(self) + self._indexMap = self._map_indices(self) self._integrity = len(self._indexMap) == len(self) if not self._integrity: @@ -185,7 +195,7 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ taken = self.view(np.ndarray).take(*args, **kwargs) - return Index(taken, name=self.name) + return self._constructor(taken, name=self.name) def format(self, name=False): """ @@ -305,7 +315,7 @@ def union(self, other): return _ensure_index(other) if self.is_monotonic and other.is_monotonic: - result = lib.outer_join_indexer_object(self, other)[0] + result = lib.outer_join_indexer_object(self, other.values)[0] else: indexer = self.get_indexer(other) indexer = (indexer == -1).nonzero()[0] @@ -356,9 +366,10 @@ def intersection(self, other): other = other.astype(object) if self.is_monotonic and other.is_monotonic: - return Index(lib.inner_join_indexer_object(self, other)[0]) + return Index(lib.inner_join_indexer_object(self, + other.values)[0]) else: - indexer = self.get_indexer(other) + indexer = self.get_indexer(other.values) indexer = indexer.take((indexer != -1).nonzero()[0]) return self.take(indexer) @@ -446,10 +457,10 @@ def get_indexer(self, target, method=None): return indexer def groupby(self, to_groupby): - return lib.groupby_object(self.values, to_groupby) + return self._groupby(self.values, to_groupby) def map(self, mapper): - return lib.arrmap_object(self.values, mapper) + return self._arrmap(self.values, mapper) def _get_method(self, method): if method: @@ -621,6 +632,11 @@ def copy(self, order='C'): class Int64Index(Index): + _map_indices = lib.map_indices_int64 + _is_monotonic = lib.is_monotonic_int64 + _groupby = lib.groupby_int64 + _arrmap = lib.arrmap_int64 + def __new__(cls, data, dtype=None, copy=False, name=None): if not isinstance(data, np.ndarray): if np.isscalar(data): @@ -648,6 +664,10 @@ def __new__(cls, data, dtype=None, copy=False, name=None): subarr.name = name return subarr + @property + def _constructor(self): + return Int64Index + def astype(self, dtype): return Index(self.values.astype(dtype)) @@ -655,22 +675,6 @@ def astype(self, dtype): def dtype(self): return np.dtype('int64') - @cache_readonly - def is_monotonic(self): - return lib.is_monotonic_int64(self) - - @property - def indexMap(self): - "{label -> location}" - if self._indexMap is None: - self._indexMap = lib.map_indices_int64(self) - self._integrity = len(self._indexMap) == len(self) - - if not self._integrity: - raise Exception('Index cannot contain duplicate values!') - - return self._indexMap - def is_all_dates(self): """ Checks that all the labels are datetime objects @@ -771,19 +775,6 @@ def union(self, other): return Int64Index(result) union.__doc__ = Index.union.__doc__ - def groupby(self, to_groupby): - return lib.groupby_int64(self, to_groupby) - - def map(self, mapper): - return lib.arrmap_int64(self, mapper) - - def take(self, *args, **kwargs): - """ - Analogous to ndarray.take - """ - taken = self.values.take(*args, **kwargs) - return Int64Index(taken, name=self.name) - class DateIndex(Index): pass @@ -1267,16 +1258,9 @@ def get_indexer(self, target, method=None): """ method = self._get_method(method) + target_index = target if isinstance(target, MultiIndex): target_index = target.get_tuple_index() - else: - if len(target) > 0: - val = target[0] - if not isinstance(val, tuple) or len(val) != self.nlevels: - raise ValueError('can only pass MultiIndex or ' - 'array of tuples') - - target_index = target self_index = self.get_tuple_index() @@ -1509,6 +1493,9 @@ def union(self, other): ------- Index """ + if not isinstance(other, MultiIndex): + return other.union(self) + self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other): @@ -1533,6 +1520,9 @@ def intersection(self, other): ------- Index """ + if not isinstance(other, MultiIndex): + return other.intersection(self) + self._assert_can_do_setop(other) if self.equals(other): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 05a5526bbbb2b..27fc245bf0547 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -176,31 +176,19 @@ def should_store(self, value): # unnecessarily return issubclass(value.dtype.type, np.floating) - def can_store(self, value): - return issubclass(value.dtype.type, (np.integer, np.floating)) - class IntBlock(Block): def should_store(self, value): - return self.can_store(value) - - def can_store(self, value): return issubclass(value.dtype.type, np.integer) class BoolBlock(Block): def should_store(self, value): - return self.can_store(value) - - def can_store(self, value): return issubclass(value.dtype.type, np.bool_) class ObjectBlock(Block): def should_store(self, value): - return self.can_store(value) - - def can_store(self, value): return not issubclass(value.dtype.type, (np.integer, np.floating, np.bool_)) @@ -676,21 +664,24 @@ def reindex_items(self, new_items): return BlockManager(new_blocks, new_axes) - def take(self, indexer, axis=1, pandas_indexer=False): + def take(self, indexer, axis=1): if axis == 0: raise NotImplementedError - if pandas_indexer: - take_f = lambda arr: common.take_fast(arr, indexer, - None, False, axis=axis) - else: - take_f = lambda arr: arr.take(indexer, axis=axis) + indexer = np.asarray(indexer, dtype='i4') + + n = len(self.axes[axis]) + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') new_axes = list(self.axes) new_axes[axis] = self.axes[axis].take(indexer) new_blocks = [] for blk in self.blocks: - newb = make_block(take_f(blk.values), blk.items, self.items) + new_values = common.take_fast(blk.values, indexer, + None, False, axis=axis) + newb = make_block(new_values, blk.items, self.items) new_blocks.append(newb) return BlockManager(new_blocks, new_axes) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index bc591c530f3ca..95bba59e77cd3 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -665,7 +665,8 @@ def fillna(self, value=None, method='pad'): try: divide = div = _panel_arith_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 divide = div = _panel_arith_method(operator.truediv, 'divide') def major_xs(self, key, copy=True): @@ -1235,7 +1236,8 @@ def _combine_panel_frame(self, other, func, axis='items'): try: divide = div = _panel_arith_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 divide = div = _panel_arith_method(operator.truediv, 'divide') def to_wide(self): diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index a746f1286781a..533deef603a6d 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -287,6 +287,9 @@ def stack(frame, level=-1, dropna=True): stacked : Series """ N, K = frame.shape + if isinstance(level, int) and level < 0: + level += frame.columns.nlevels + level = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): @@ -318,8 +321,6 @@ def stack(frame, level=-1, dropna=True): def _stack_multi_columns(frame, level=-1, dropna=True): this = frame.copy() - if level < 0: - level += frame.columns.nlevels # this makes life much simpler if level != frame.columns.nlevels - 1: diff --git a/pandas/core/series.py b/pandas/core/series.py index 2763e5216bcc3..e6648c677070b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -9,12 +9,6 @@ import itertools import operator -try: - from collections import Counter -except ImportError: - # For Python < 2.7, we include a local copy of this: - from pandas.util.counter import Counter - from numpy import nan, ndarray import numpy as np @@ -444,7 +438,7 @@ def iteritems(self): return itertools.izip(iter(self.index), iter(self)) iterkv = iteritems - if py3compat.PY3: + if py3compat.PY3: # pragma: no cover items = iteritems #---------------------------------------------------------------------- @@ -908,6 +902,12 @@ def describe(self): ------- desc : Series """ + try: + from collections import Counter + except ImportError: # pragma: no cover + # For Python < 2.7, we include a local copy of this: + from pandas.util.counter import Counter + if self.dtype == object: names = ['count', 'unique', 'top', 'freq'] @@ -1094,7 +1094,8 @@ def _binop(self, other, func, fill_value=None): mul = _flex_method(operator.mul, 'multiply') try: div = _flex_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 div = _flex_method(operator.truediv, 'divide') def combine(self, other, func, fill_value=nan): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index d2308f4e5e126..e82bb58acd0fc 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -34,6 +34,7 @@ def test_deepcopy(self): def test_duplicates(self): idx = Index([0, 0, 0]) self.assert_(not idx._verify_integrity()) + self.assertRaises(Exception, getattr, idx, 'indexMap') def test_sort(self): self.assertRaises(Exception, self.strIndex.sort) @@ -582,6 +583,13 @@ def test_constructor_single_level(self): self.assert_(not isinstance(single_level, MultiIndex)) self.assert_(single_level.name == 'first') + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]]) + self.assert_(single_level.name is None) + + def test_constructor_no_levels(self): + self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + def test_from_arrays(self): arrays = [] for lev, lab in zip(self.index.levels, self.index.labels): @@ -832,9 +840,17 @@ def test_equals(self): self.assert_(not self.index.equals(self.index.get_tuple_index())) # different number of levels - index = MultiIndex(levels=self.index.levels[:-1], - labels=self.index.labels[:-1]) - self.assert_(not self.index.equals(index)) + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + index2 = MultiIndex(levels=index.levels[:-1], + labels=index.labels[:-1]) + self.assert_(not index.equals(index2)) + self.assert_(not index.equal_levels(index2)) # levels are different major_axis = Index(range(4)) @@ -877,8 +893,19 @@ def test_union(self): the_union = self.index.union(self.index[:0]) self.assert_(the_union is self.index) - self.assertRaises(TypeError, self.index.union, - self.index.get_tuple_index()) + tuples = self.index.get_tuple_index() + result = self.index[:4] | tuples[4:] + self.assert_(result.equals(tuples)) + + def test_union_with_regular_index(self): + other = Index(['A', 'B', 'C']) + + result = other.union(self.index) + self.assert_(('foo', 'one') in result) + self.assert_('B' in result) + + result2 = self.index.union(other) + self.assert_(result.equals(result2)) def test_intersection(self): piece1 = self.index[:5][::-1] @@ -893,8 +920,9 @@ def test_intersection(self): the_int = self.index.intersection(self.index) self.assert_(the_int is self.index) - self.assertRaises(TypeError, self.index.intersection, - self.index.get_tuple_index()) + tuples = self.index.get_tuple_index() + result = self.index & tuples + self.assert_(result.equals(tuples)) def test_diff(self): first = self.index diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 53f1d286e962d..1587aa205c6d3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -350,6 +350,10 @@ def test_stack(self): ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + # stack with negative number + result = self.ymd.unstack(0).stack(-2) + expected = self.ymd.unstack(0).stack(0) + def test_stack_mixed_dtype(self): df = self.frame.T df['foo', 'four'] = 'foo' @@ -392,6 +396,10 @@ def test_stack_level_name(self): expected = self.frame.unstack().stack(0) assert_frame_equal(result, expected) + result = self.frame.stack('exp') + expected = self.frame.stack() + assert_series_equal(result, expected) + def test_groupby_transform(self): s = self.frame['A'] grouper = s.index.get_level_values(0) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 387839c09506a..e54485f2e2059 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -419,6 +419,11 @@ def test_xs(self): itemA_view.values[:] = np.nan self.assert_(np.isnan(self.panel['ItemA'].values).all()) + # mixed-type + self.panel['strings'] = 'foo' + self.assertRaises(Exception, self.panel.xs, 'D', axis=2, + copy=False) + def test_getitem_fancy_labels(self): p = self.panel @@ -670,6 +675,20 @@ def test_reindex_like(self): smaller_like = self.panel.reindex_like(smaller) assert_panel_equal(smaller, smaller_like) + def test_take(self): + # axis == 0 + result = self.panel.take([2, 0, 1], axis=0) + expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + assert_panel_equal(result, expected) + + # axis >= 1 + result = self.panel.take([3, 0, 1, 2], axis=2) + expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + assert_panel_equal(result, expected) + + self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) + self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + def test_sort_index(self): import random @@ -985,6 +1004,17 @@ def test_combine_scalar(self): expected = DataFrame(self.panel._data) * 2 assert_frame_equal(result, expected) + def test_combine_series(self): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) + + s = self.panel.ix[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) + def test_operators(self): wp = self.panel.to_wide() result = (self.panel + 1).to_wide() @@ -1000,9 +1030,6 @@ def is_sorted(arr): sorted_major = sorted_minor.sortlevel(level=0) self.assert_(is_sorted(sorted_major.major_labels)) - def test_to_wide(self): - pass - def test_toCSV(self): self.panel.toCSV('__tmp__') os.remove('__tmp__')