Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

PERF: optimize index.__getitem__ for slice & boolean mask indexers #6440

Merged
merged 1 commit into from Feb 28, 2014
Jump to file or symbol
Failed to load files and symbols.
+83 −31
Split
View
@@ -105,6 +105,8 @@ API Changes
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`)
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`).
Experimental Features
~~~~~~~~~~~~~~~~~~~~~
View
@@ -78,6 +78,21 @@ These are out-of-bounds selections
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`
+- Slicing and advanced/boolean indexing operations on ``Index`` classes will no
+  longer change the type of the resulting index (:issue:`6440`)
+
+ .. ipython:: python
+
+ i = pd.Index([1, 2, 3, 'a', 'b', 'c'])
+ i[[0,1,2]]
+
+ Previously, the above operation would return ``Int64Index``. If you'd like
+ to do this manually, use :meth:`Index.astype`
+
+ .. ipython:: python
+
+ i[[0,1,2]].astype(np.int_)
+
MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
View
@@ -631,34 +631,35 @@ def __hash__(self):
raise TypeError("unhashable type: %r" % type(self).__name__)
def __getitem__(self, key):
- """Override numpy.ndarray's __getitem__ method to work as desired"""
- arr_idx = self.view(np.ndarray)
+ """
+ Override numpy.ndarray's __getitem__ method to work as desired.
+
+ This function adds lists and Series as valid boolean indexers
+ (ndarrays only supports ndarray with dtype=bool).
+
+ If resulting ndim != 1, plain ndarray is returned instead of
+ corresponding `Index` subclass.
+
+ """
+ # There's no custom logic to be implemented in __getslice__, so it's
+ # not overloaded intentionally.
+ __getitem__ = super(Index, self).__getitem__
if np.isscalar(key):
- return arr_idx[key]
- else:
- if com._is_bool_indexer(key):
- key = np.asarray(key)
+ return __getitem__(key)
- try:
- result = arr_idx[key]
- if result.ndim > 1:
- return result
- except (IndexError):
- if not len(key):
- result = []
- else:
- raise
+ if isinstance(key, slice):
+ # This case is separated from the conditional above to avoid
+ # pessimization of basic indexing.
+ return __getitem__(key)
- return Index(result, name=self.name)
+ if com._is_bool_indexer(key):
+ return __getitem__(np.asarray(key))
- def _getitem_slice(self, key):
- """ getitem for a bool/sliceable, fallback to standard getitem """
- try:
- arr_idx = self.view(np.ndarray)
- result = arr_idx[key]
- return self.__class__(result, name=self.name, fastpath=True)
- except:
- return self.__getitem__(key)
+ result = __getitem__(key)
+ if result.ndim > 1:
+ return result.view(np.ndarray)
+ else:
+ return result
def append(self, other):
"""
@@ -2800,8 +2801,6 @@ def __getitem__(self, key):
return result
- _getitem_slice = __getitem__
-
def take(self, indexer, axis=None):
"""
Analogous to ndarray.take
View
@@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False):
if raise_on_error:
_check_slice_bounds(slobj, self.index)
return self.__class__(self._block._slice(slobj),
- self.index._getitem_slice(slobj), fastpath=True)
+ self.index[slobj], fastpath=True)
def set_axis(self, axis, value, maybe_rename=True, check_axis=True):
cur_axis, value = self._set_axis(axis, value, check_axis)
View
@@ -323,6 +323,25 @@ def test_fancy(self):
for i in sl:
self.assertEqual(i, sl[sl.get_loc(i)])
+ def test_empty_fancy(self):
+ empty_farr = np.array([], dtype=np.float_)
+ empty_iarr = np.array([], dtype=np.int_)
+ empty_barr = np.array([], dtype=np.bool_)
+
+ # pd.DatetimeIndex is excluded, because it overrides getitem and should
+ # be tested separately.
+ for idx in [self.strIndex, self.intIndex, self.floatIndex]:
+ empty_idx = idx.__class__([])
+ values = idx.values
+
+ self.assert_(idx[[]].identical(empty_idx))
+ self.assert_(idx[empty_iarr].identical(empty_idx))
+ self.assert_(idx[empty_barr].identical(empty_idx))
+
+ # np.ndarray only accepts indexers that are ndarrays of int or bool
+ # dtype, and Index should behave the same way.
+ self.assertRaises(IndexError, idx.__getitem__, empty_farr)
+
def test_getitem(self):
arr = np.array(self.dateIndex)
exp = self.dateIndex[5]
@@ -762,6 +781,14 @@ def test_join_self(self):
joined = res.join(res, how=kind)
self.assertIs(res, joined)
+ def test_indexing_doesnt_change_class(self):
+ idx = Index([1, 2, 3, 'a', 'b', 'c'])
+
+ self.assert_(idx[1:3].identical(
+ pd.Index([2, 3], dtype=np.object_)))
+ self.assert_(idx[[0,1]].identical(
+ pd.Index([1, 2], dtype=np.object_)))
+
class TestFloat64Index(tm.TestCase):
_multiprocess_can_split_ = True
View
@@ -1406,8 +1406,6 @@ def __getitem__(self, key):
return self._simple_new(result, self.name, new_offset, self.tz)
- _getitem_slice = __getitem__
-
# Try to run function on index first, and then on elements of index
# Especially important for group-by functionality
def map(self, f):
View
@@ -1056,8 +1056,6 @@ def __getitem__(self, key):
return PeriodIndex(result, name=self.name, freq=self.freq)
- _getitem_slice = __getitem__
-
def _format_with_header(self, header, **kwargs):
return header + self._format_native_types(**kwargs)
View
@@ -46,3 +46,16 @@
index_int64_intersection = Benchmark('left.intersection(right)', setup,
start_date=datetime(2011, 1, 1))
+
+#----------------------------------------------------------------------
+# string index slicing
+setup = common_setup + """
+idx = tm.makeStringIndex(1000000)
+
+mask = np.arange(1000000) % 3 == 0
+series_mask = Series(mask)
+"""
+index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup)
+index_str_slice_indexer_even = Benchmark('idx[::2]', setup)
+index_str_boolean_indexer = Benchmark('idx[mask]', setup)
+index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)