diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 27cd320c661e0..d938cc6a6dc4d 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -88,7 +88,7 @@ def setup(self): def time_getitem_scalar(self): self.ts[self.dt] - + class DataFrameIndexing(object): goal_time = 0.2 @@ -189,6 +189,15 @@ def setup(self): self.eps_C = 5 self.eps_D = 5000 self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() + self.miint = MultiIndex.from_product( + [np.arange(1000), + np.arange(1000)], names=['one', 'two']) + + import string + self.mistring = MultiIndex.from_product( + [np.arange(1000), + np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) def time_series_xs_mi_ix(self): self.s.ix[999] @@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self): self.df.ix[999] def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + self.mdt2.loc[self.idx[ + (self.test_A - self.eps_A):(self.test_A + self.eps_A), + (self.test_B - self.eps_B):(self.test_B + self.eps_B), + (self.test_C - self.eps_C):(self.test_C + self.eps_C), + (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + + def time_multiindex_get_indexer(self): + self.miint.get_indexer( + np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object)) + + def time_multiindex_string_get_loc(self): + self.mistring.get_loc((999, 19, 'Z')) + + def time_is_monotonic(self): + self.miint.is_monotonic class PanelIndexing(object): diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8db0cd7629332..6fe6c32a96df9 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -16,8 +16,8 @@ def setup(self): data=np.random.rand(10000, 30), columns=range(30)) # multi-index - N = 1000 - K = 20 + N = 5000 + K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) level2 = np.tile(tm.makeStringIndex(K).values, N) index = MultiIndex.from_arrays([level1, level2]) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 26006083d81b4..4708abe4d592e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -472,7 +472,7 @@ Performance Improvements - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`) - +- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. @@ -502,6 +502,8 @@ Bug Fixes - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) + +- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) - Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. 
(:issue:`14956`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 05cfb1bd9ec27..c922ac21e12eb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = _ensure_int64(indexer) + indexer = _ensure_int64(indexer, copy=False) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False @@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 16f8d4658dc20..9c66f6dbb273e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier): # all cases (e.g., it misses categorical data even with object # categories) deep = False - if 'object' in counts or is_object_dtype(self.index): + if ('object' in counts or + self.index._is_memory_usage_qualified()): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index cabfa43a76f26..9b352ae1c003b 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -31,6 +31,14 @@ cdef class PyObjectHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) +cdef class MultiIndexHashTable(HashTable): + cdef: + kh_uint64_t *table + object mi + + cpdef get_item(self, object val) + cpdef set_item(self, object key, Py_ssize_t val) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/index.pyx b/pandas/index.pyx index 0c975d1775a03..37fe7d90bebe0 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -182,7 +182,7 @@ cdef class IndexEngine: Py_ssize_t i, n int last_true - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) n = len(values) result = np.empty(n, dtype=bool) @@ -284,7 +284,6 @@ cdef class IndexEngine: if not self.is_mapping_populated: values = self._get_index_values() - self.mapping = self._make_hash_table(len(values)) self.mapping.map_locations(values) @@ -322,7 +321,7 @@ cdef class IndexEngine: Py_ssize_t i, j, n, n_t, n_alloc self._ensure_mapping_populated() - values = self._get_index_values() + values = np.array(self._get_index_values(), copy=False) stargets = set(targets) n = len(values) n_t = len(targets) @@ -554,5 +553,39 @@ cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _du_utc) +cdef class MultiIndexEngine(IndexEngine): + + def _call_monotonic(self, object mi): + # defer these back to the mi itself + return (mi.is_monotonic_increasing, + mi.is_monotonic_decreasing, + mi.is_unique) + + def get_backfill_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.backfill_object(values, other, limit=limit) + + def get_pad_indexer(self, other, limit=None): + # we coerce to ndarray-of-tuples + values = np.array(self._get_index_values()) + return algos.pad_object(values, other, limit=limit) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError("'{val}' is an invalid
key".format(val=val)) + + self._ensure_mapping_populated() + if not self.unique: + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val) + except TypeError: + raise KeyError(val) + + cdef _make_hash_table(self, n): + return _hash.MultiIndexHashTable(n) + # Generated from template. include "index_class_helper.pxi" diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index bb2941a121452..c483fb0764a4c 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1431,6 +1431,10 @@ def inferred_type(self): """ return a string of the type inferred from the values """ return lib.infer_dtype(self) + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + return self.is_object() + def is_type_compatible(self, kind): return kind == self.inferred_type @@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): 'if index and target are monotonic' % method) side = 'left' if method == 'pad' else 'right' - target = np.asarray(target) # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 653ba1fee5691..57739548a17d6 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -14,7 +14,6 @@ from pandas.compat.numpy import function as nv from pandas import compat - from pandas.types.common import (_ensure_int64, _ensure_platform_int, is_object_dtype, @@ -73,6 +72,7 @@ class MultiIndex(Index): _levels = FrozenList() _labels = FrozenList() _comparables = ['names'] + _engine_type = _index.MultiIndexEngine rename = Index.set_names def __new__(cls, levels=None, labels=None, sortorder=None, names=None, @@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, result._verify_integrity() if _set_identity: result._reset_identity() - return result def _verify_integrity(self, labels=None, levels=None): @@ -429,6 +428,12 @@ def _shallow_copy(self, values=None, **kwargs): def dtype(self): return np.dtype('O') + def _is_memory_usage_qualified(self): + """ return a boolean if we need a qualified .info display """ + def f(l): + return 'mixed' in l or 'string' in l or 'unicode' in l + return any([f(l) for l in self._inferred_type_levels]) + @Appender(Index.memory_usage.__doc__) def memory_usage(self, deep=False): # we are overwriting our base class to avoid @@ -619,6 +624,10 @@ def _get_level_number(self, level): _tuples = None + @cache_readonly + def _engine(self): + return self._engine_type(lambda: self, len(self)) + @property def values(self): if self._tuples is not None: @@ -655,10 +664,95 @@ def _has_complex_internals(self): # to disable groupby tricks return True + @cache_readonly + def is_monotonic(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + return self.is_monotonic_increasing + + @cache_readonly + def is_monotonic_increasing(self): + """ + return if the index is monotonic increasing (only equal or + increasing) values. + """ + + # reversed() because lexsort() wants the most significant key last. 
+ values = [self._get_level_values(i) + for i in reversed(range(len(self.levels)))] + try: + sort_order = np.lexsort(values) + return Index(sort_order).is_monotonic + except TypeError: + + # we have mixed types and np.lexsort is not happy + return Index(self.values).is_monotonic + + @property + def is_monotonic_decreasing(self): + """ + return if the index is monotonic decreasing (only equal or + decreasing) values. + """ + return False + @cache_readonly def is_unique(self): return not self.duplicated().any() + @cache_readonly + def _have_mixed_levels(self): + """ return a boolean list indicating if we have mixed levels """ + return ['mixed' in l for l in self._inferred_type_levels] + + @cache_readonly + def _inferred_type_levels(self): + """ return a list of the inferred types, one for each level """ + return [i.inferred_type for i in self.levels] + + @cache_readonly + def _hashed_values(self): + """ return a uint64 ndarray of my hashed values """ + from pandas.tools.hashing import hash_tuples + return hash_tuples(self) + + def _hashed_indexing_key(self, key): + """ + validate and return the hash for the provided key + + *this is internal for use for the cython routines* + + Parameters + ---------- + key : string or tuple + + Returns + ------- + np.uint64 + + Notes + ----- + we need to stringify if we have mixed levels + + """ + from pandas.tools.hashing import hash_tuples + + if not isinstance(key, tuple): + return hash_tuples(key) + + if not len(key) == self.nlevels: + raise KeyError + + def f(k, stringify): + if stringify and not isinstance(k, compat.string_types): + k = str(k) + return k + key = tuple([f(k, stringify) + for k, stringify in zip(key, self._have_mixed_levels)]) + return hash_tuples(key) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) @@ -748,26 +842,44 @@ def _try_mi(k): raise InvalidIndexError(key) - def get_level_values(self, level): + def _get_level_values(self, level): """ - Return vector of label values for requested level, equal to the length - of the index + Return vector of label values for requested level, + equal to the length of the index + + **this is an internal method** Parameters ---------- - level : int or level name + level : int level Returns ------- values : ndarray """ - num = self._get_level_number(level) - unique = self.levels[num] # .values - labels = self.labels[num] - filled = algos.take_1d(unique.values, labels, + + unique = self.levels[level] + labels = self.labels[level] + filled = algos.take_1d(unique._values, labels, fill_value=unique._na_value) - values = unique._shallow_copy(filled) - return values + return filled + + def get_level_values(self, level): + """ + Return vector of label values for requested level, + equal to the length of the index + + Parameters + ---------- + level : int or level name + + Returns + ------- + values : Index + """ + level = self._get_level_number(level) + values = self._get_level_values(level) + return self.levels[level]._shallow_copy(values) def format(self, space=2, sparsify=None, adjoin=True, names=False, na_rep=None, formatter=None): @@ -852,7 +964,8 @@ def to_frame(self, index=True): from pandas import DataFrame result = DataFrame({(name or level): self.get_level_values(level) for name, level in - zip(self.names, range(len(self.levels)))}) + zip(self.names, range(len(self.levels)))}, + copy=False) if index: result.index = self return result @@ -1482,29 +1595,41 @@ def get_indexer(self, target, method=None,
limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = _ensure_index(target) - target_index = target - if isinstance(target, MultiIndex): - target_index = target._tuple_index + # empty indexer + if is_list_like(target) and not len(target): + return _ensure_platform_int(np.array([])) + + if not isinstance(target, MultiIndex): + try: + target = MultiIndex.from_tuples(target) + except (TypeError, ValueError): - if not is_object_dtype(target_index.dtype): - return np.ones(len(target_index)) * -1 + # let's instead try with a straight Index + if method is None: + return Index(self.values).get_indexer(target, + method=method, + limit=limit, + tolerance=tolerance) if not self.is_unique: raise Exception('Reindexing only valid with uniquely valued Index ' 'objects') - self_index = self._tuple_index - if method == 'pad' or method == 'backfill': if tolerance is not None: raise NotImplementedError("tolerance not implemented yet " 'for MultiIndex') - indexer = self_index._get_fill_indexer(target, method, limit) + indexer = self._get_fill_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for MultiIndex; see GitHub issue 9365') else: - indexer = self_index._engine.get_indexer(target._values) + # we may not compare equally because of hashing if we + # don't have the same dtypes + if self._inferred_type_levels != target._inferred_type_levels: + return Index(self.values).get_indexer(target.values) + + indexer = self._engine.get_indexer(target) return _ensure_platform_int(indexer) @@ -1571,17 +1696,6 @@ def reindex(self, target, method=None, level=None, limit=None, return target, indexer - @cache_readonly - def _tuple_index(self): - """ - Convert MultiIndex to an Index of tuples - - Returns - ------- - index : Index - """ - return Index(self._values) - def get_slice_bound(self, label, side, kind): if not isinstance(label, tuple): @@ -1828,8 +1942,9 @@ def partial_selection(key, indexer=None): key = tuple(self[indexer].tolist()[0]) - return (self._engine.get_loc(_values_from_object(key)), - None) + return (self._engine.get_loc( + _values_from_object(key)), None) + else: return partial_selection(key) else: @@ -2115,10 +2230,24 @@ def equals(self, other): return False for i in range(self.nlevels): + slabels = self.labels[i] + slabels = slabels[slabels != -1] svalues = algos.take_nd(np.asarray(self.levels[i]._values), - self.labels[i], allow_fill=False) + slabels, allow_fill=False) + + olabels = other.labels[i] + olabels = olabels[olabels != -1] ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - other.labels[i], allow_fill=False) + olabels, allow_fill=False) + + # since we use NaT for both datetime64 and timedelta64 + # we can have a situation where a level is typed say + # timedelta64 in self (IOW it has other values than NaT) + # but typed datetime64 in other (where it's all NaT) + # but these are equivalent + if len(svalues) == 0 and len(ovalues) == 0: + continue + if not array_equivalent(svalues, ovalues): return False diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9224f7d3d9a94..d8de1dcd61977 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3787,9 +3787,9 @@ def read(self, where=None, columns=None, **kwargs): lp = DataFrame(c.data, index=long_index, columns=c.values) # need a better algorithm - tuple_index = long_index._tuple_index + tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index.values) + unique_tuples =
lib.fast_unique(tuple_index) unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index 42089f9520ab6..b83dec1d26242 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -579,12 +579,12 @@ def get_dispatch(dtypes): {{for name, c_type, dtype in get_dispatch(dtypes)}} -cpdef ensure_{{name}}(object arr): +cpdef ensure_{{name}}(object arr, copy=True): if util.is_array(arr): if ( arr).descr.type_num == NPY_{{c_type}}: return arr else: - return arr.astype(np.{{dtype}}) + return arr.astype(np.{{dtype}}, copy=copy) else: return np.array(arr, dtype=np.{{dtype}}) diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index ef385ba7dca1c..3ce82dace40a9 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -262,13 +262,6 @@ cdef class {{name}}HashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): cdef: khiter_t k @@ -501,18 +494,6 @@ cdef class StringHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef: - Py_ssize_t i, val - char *v - - v = util.get_c_string(key) - - for i in range(iterations): - k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -755,15 +736,6 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(val) - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - cpdef set_item(self, object key, Py_ssize_t val): cdef: khiter_t k @@ -874,3 +846,127 @@ cdef class PyObjectHashTable(HashTable): count += 1 return np.asarray(labels) + + +cdef class MultiIndexHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_uint64() + self.mi = None + kh_resize_uint64(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + kh_destroy_uint64(self.table) + self.table = NULL + + def __len__(self): + return self.table.size + + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(uint64_t) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + + def _check_for_collisions(self, int64_t[:] locs, object mi): + # validate that the locs map to the actual values + # provided in the mi + # we can only check if we *don't* have any missing values + # :< + cdef: + ndarray[int64_t] alocs + + alocs = np.asarray(locs) + if (alocs != -1).all(): + + result = self.mi.take(locs) + if isinstance(mi, tuple): + from pandas import Index + mi = Index([mi]) + if not result.equals(mi): + raise AssertionError( + "hash collision\nlocs:\n{}\n" + "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + + def __contains__(self, object key): + try: + self.get_item(key) + return True + except (KeyError, ValueError, TypeError): + return False + + cpdef get_item(self, object key): + cdef: + khiter_t k + 
uint64_t value + int64_t[:] locs + Py_ssize_t loc + + value = self.mi._hashed_indexing_key(key) + k = kh_get_uint64(self.table, value) + if k != self.table.n_buckets: + loc = self.table.vals[k] + locs = np.array([loc], dtype=np.int64) + self._check_for_collisions(locs, key) + return loc + else: + raise KeyError(key) + + cpdef set_item(self, object key, Py_ssize_t val): + raise NotImplementedError + + @cython.boundscheck(False) + def map_locations(self, object mi): + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + uint64_t val + int ret = 0 + khiter_t k + + self.mi = mi + n = len(mi) + values = mi._hashed_values + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_uint64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, object mi): + # look up with a target mi + cdef: + Py_ssize_t i, n + ndarray[uint64_t] values + int ret = 0 + uint64_t val + khiter_t k + int64_t[:] locs + + n = len(mi) + values = mi._hashed_values + + locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_uint64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + self._check_for_collisions(locs, mi) + return np.asarray(locs) + + def unique(self, object mi): + raise NotImplementedError + + def get_labels(self, object mi, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + raise NotImplementedError diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 6b4c56747c981..fe3f3c554a9b5 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import print_function - +import pytest from pandas.compat import range, lrange import numpy as np -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -165,6 +165,31 @@ def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame) + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + df = DataFrame(np.random.randn(4, 4), columns=midx) + assert len(df.columns) == 4 + assert ('A', ) in df.columns + assert 'A' in df.columns + + result = df['A'] + assert isinstance(result, DataFrame) + del df['A'] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ('A', ) not in df.columns + with pytest.raises(KeyError): + del df[('A',)] + + # xref: https://github.com/pandas-dev/pandas/issues/2770 + # the 'A' is STILL in the columns! 
+ assert 'A' in df.columns + with pytest.raises(KeyError): + del df['A'] + def test_pop(self): self.frame.columns.name = 'baz' diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 2df297d03bcdf..024e11e63a924 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -301,10 +301,12 @@ def test_info_memory_usage(self): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) buf = StringIO() + # display memory usage case df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() self.assertTrue("memory usage: " in res[-1]) + # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() @@ -312,11 +314,13 @@ def test_info_memory_usage(self): df.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # memory usage is a lower bound, so print it as XYZ+ MB self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) df.iloc[:, :5].info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() + # excluded column with object dtype, so estimate is accurate self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) @@ -380,6 +384,34 @@ def test_info_memory_usage(self): diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) self.assertTrue(abs(diff) < 100) + def test_info_memory_usage_qualified(self): + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=[1, 2, 3]) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=list('ABC')) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), range(3)])) + df.info(buf=buf) + self.assertFalse('+' in buf.getvalue()) + + buf = StringIO() + df = DataFrame(1, columns=list('ab'), + index=pd.MultiIndex.from_product( + [range(3), ['foo', 'bar']])) + df.info(buf=buf) + self.assertTrue('+' in buf.getvalue()) + def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 # memory usage introspection should not materialize .values diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3a6a9eaaa8e72..d53446870beb1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1588,7 +1588,7 @@ def test_groupby_as_index_cython(self): result = grouped.mean() expected = data.groupby(['A', 'B']).mean() - arrays = lzip(*expected.index._tuple_index) + arrays = lzip(*expected.index.values) expected.insert(0, 'A', arrays[0]) expected.insert(1, 'B', arrays[1]) expected.index = np.arange(len(expected)) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 702c4758da245..5611492b4af1b 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1046,6 +1046,21 @@ def test_contains(self): self.assertNotIn(('bar', 'two'), self.index) self.assertNotIn(None, self.index) + def test_contains_top_level(self): + midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + assert 'A' in midx + assert 'A' not in midx._engine + + def test_contains_with_nat(self): + # MI with a NaT + mi = MultiIndex(levels=[['C'], + pd.date_range('2012-01-01', periods=5)], + labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, 'B']) + self.assertTrue(('C', pd.Timestamp('2012-01-01')) in mi) + for val in mi.values: + self.assertTrue(val in mi) + def test_is_all_dates(self):
self.assertFalse(self.index.is_all_dates) @@ -1102,6 +1117,17 @@ def test_get_loc_duplicates(self): xp = 0 assert (rs == xp) + def test_get_value_duplicates(self): + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], + [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + + assert index.get_loc('D') == slice(0, 3) + with pytest.raises(KeyError): + index._engine.get_value(np.array([]), 'D') + def test_get_loc_level(self): index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( lrange(4))], labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array( @@ -1294,7 +1320,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rbfill1) # pass non-MultiIndex - r1 = idx1.get_indexer(idx2._tuple_index) + r1 = idx1.get_indexer(idx2.values) rexp1 = idx1.get_indexer(idx2) assert_almost_equal(r1, rexp1) @@ -1316,6 +1342,19 @@ def test_get_indexer_nearest(self): with tm.assertRaises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_hash_collisions(self): + # non-smoke test that we don't get hash collisions + + index = MultiIndex.from_product([np.arange(1000), np.arange(1000)], + names=['one', 'two']) + result = index.get_indexer(index.values) + self.assert_numpy_array_equal(result, + np.arange(len(index), dtype='int64')) + + for i in [0, 1, len(index) - 2, len(index) - 1]: + result = index.get_loc(index[i]) + self.assertEqual(result, i) + def test_format(self): self.index.format() self.index[:0].format() @@ -1420,12 +1459,13 @@ def test_bounds(self): self.index._bounds def test_equals_multi(self): - self.assertTrue(self.index.equals(self.index)) - self.assertTrue(self.index.equal_levels(self.index)) - - self.assertFalse(self.index.equals(self.index[:-1])) + assert self.index.equals(self.index) + assert not self.index.equals(self.index.values) + assert self.index.equals(Index(self.index.values)) - self.assertTrue(self.index.equals(self.index._tuple_index)) + assert self.index.equal_levels(self.index) + assert not self.index.equals(self.index[:-1]) + assert not self.index.equals(self.index[-1]) # different number of levels index = MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), Index( @@ -1433,8 +1473,8 @@ def test_equals_multi(self): [0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0])]) index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1]) - self.assertFalse(index.equals(index2)) - self.assertFalse(index.equal_levels(index2)) + assert not index.equals(index2) + assert not index.equal_levels(index2) # levels are different major_axis = Index(lrange(4)) @@ -1445,8 +1485,8 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) - self.assertFalse(self.index.equal_levels(index)) + assert not self.index.equals(index) + assert not self.index.equal_levels(index) # some of the labels are different major_axis = Index(['foo', 'bar', 'baz', 'qux']) @@ -1457,7 +1497,16 @@ def test_equals_multi(self): index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) - self.assertFalse(self.index.equals(index)) + assert not self.index.equals(index) + + def test_equals_missing_values(self): + # make sure take is not using -1 + i = pd.MultiIndex.from_tuples([(0, pd.NaT), + (0, pd.Timestamp('20130101'))]) + result = i[0:1].equals(i[0]) + self.assertFalse(result) + result = i[1:2].equals(i[1]) + self.assertFalse(result) def 
test_identical(self): mi = self.index.copy() @@ -1510,7 +1559,7 @@ def test_union(self): the_union = piece1 | piece2 - tups = sorted(self.index._tuple_index) + tups = sorted(self.index.values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_union.equals(expected)) @@ -1523,7 +1572,7 @@ def test_union(self): self.assertIs(the_union, self.index) # won't work in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index[:4] | tuples[4:] # self.assertTrue(result.equals(tuples)) @@ -1543,7 +1592,7 @@ def test_intersection(self): piece2 = self.index[3:] the_int = piece1 & piece2 - tups = sorted(self.index[3:5]._tuple_index) + tups = sorted(self.index[3:5].values) expected = MultiIndex.from_tuples(tups) self.assertTrue(the_int.equals(expected)) @@ -1557,7 +1606,7 @@ def test_intersection(self): self.assertTrue(empty.equals(expected)) # can't do in python 3 - # tuples = self.index._tuple_index + # tuples = self.index.values # result = self.index & tuples # self.assertTrue(result.equals(tuples)) @@ -1616,7 +1665,7 @@ def test_difference(self): self.assertEqual(len(result), 0) # raise Exception called with non-MultiIndex - result = first.difference(first._tuple_index) + result = first.difference(first.values) self.assertTrue(result.equals(first[:0])) # name from empty array @@ -1642,7 +1691,7 @@ def test_from_tuples(self): def test_argsort(self): result = self.index.argsort() - expected = self.index._tuple_index.argsort() + expected = self.index.values.argsort() tm.assert_numpy_array_equal(result, expected) def test_sortlevel(self): @@ -2297,11 +2346,60 @@ def test_level_setting_resets_attributes(self): ind = MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) - assert ind.is_monotonic + self.assertTrue(ind.is_monotonic) ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], inplace=True) + # if this fails, probably didn't reset the cache correctly. 
- assert not ind.is_monotonic + self.assertFalse(ind.is_monotonic) + + def test_is_monotonic(self): + i = MultiIndex.from_product([np.arange(10), + np.arange(10)], names=['one', 'two']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10, 0, -1), + np.arange(10)], names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([np.arange(10), + np.arange(10, 0, -1)], + names=['one', 'two']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + # string ordering + i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertFalse(i.is_monotonic) + self.assertFalse(Index(i.values).is_monotonic) + + i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], + ['mom', 'next', 'zenith']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.assertTrue(i.is_monotonic) + self.assertTrue(Index(i.values).is_monotonic) + + # mixed levels, hits the TypeError + i = MultiIndex( + levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', + 'nl0000289783', + 'nl0000289965', 'nl0000301109']], + labels=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], + names=['household_id', 'asset_id']) + + self.assertFalse(i.is_monotonic) def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 1e6ecbbcdc756..b6b9ac93b234c 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -413,9 +413,10 @@ def f(): df.loc[idx[:, :, 'Stock'], 'price'] *= 2 tm.assert_frame_equal(df, expected) - def test_getitem_multiindex(self): + def test_getitem_duplicates_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so this doesn't raise # the appropriate error, only in PY3 of course!
+ index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8e0628eefa392..0f36af2c8c4e7 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1469,7 +1469,7 @@ def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' - arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] + arrays = [np.array(x) for x in zip(*df.columns.values)] result = df['foo'] result2 = df.loc[:, 'foo'] @@ -1493,7 +1493,7 @@ def test_series_getitem_not_sorted(self): index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) - arrays = [np.array(x) for x in zip(*index._tuple_index)] + arrays = [np.array(x) for x in zip(*index.values)] result = s['qux'] result2 = s.loc['qux'] diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 05a352f259e8b..9bed0d428bc41 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -152,6 +152,18 @@ def test_categorical_consistency(self): tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) + def test_categorical_with_nan_consistency(self): + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range('2012-01-01', periods=5, name='B')) + expected = hash_array(c, categorize=False) + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp('2012-01-01')]) + result = hash_array(c, categorize=False) + assert result[0] in expected + assert result[1] in expected + def test_pandas_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: diff --git a/pandas/tests/tools/test_join.py b/pandas/tests/tools/test_join.py index ab42b1212301b..ee6b3d57b852d 100644 --- a/pandas/tests/tools/test_join.py +++ b/pandas/tests/tools/test_join.py @@ -7,7 +7,7 @@ from pandas.compat import lrange import pandas.compat as compat from pandas.util.testing import assert_frame_equal -from pandas import DataFrame, MultiIndex, Series, merge, concat +from pandas import DataFrame, MultiIndex, Series, Index, merge, concat import pandas._join as _join import pandas.util.testing as tm @@ -368,7 +368,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) @@ -378,7 +378,7 @@ def test_join_multiindex(self): df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) - ex_index = index1._tuple_index.union(index2._tuple_index) + ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 800e0b8815443..ef863510cdd87 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -5,7 +5,6 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex -import pandas.core.algorithms as algos from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, @@ -142,20 +141,18 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): if not 
isinstance(vals, MultiIndex): vals = MultiIndex.from_tuples(vals) - # create a list-of-ndarrays - def get_level_values(num): - unique = vals.levels[num] # .values - labels = vals.labels[num] - filled = algos.take_1d(unique._values, labels, - fill_value=unique._na_value) - return filled - - vals = [get_level_values(level) + # create a list-of-Categoricals + vals = [Categorical(vals.labels[level], + vals.levels[level], + ordered=False, + fastpath=True) for level in range(vals.nlevels)] # hash the list-of-ndarrays - hashes = (hash_array(l, encoding=encoding, hash_key=hash_key) - for l in vals) + hashes = (_hash_categorical(cat, + encoding=encoding, + hash_key=hash_key) + for cat in vals) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -178,9 +175,26 @@ def _hash_categorical(c, encoding, hash_key): ------- ndarray of hashed values array, same size as len(c) """ - cat_hashed = hash_array(c.categories.values, encoding, hash_key, - categorize=False).astype(np.uint64, copy=False) - return c.rename_categories(cat_hashed).astype(np.uint64, copy=False) + hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False) + + # we have uint64, as we don't directly support missing values + # we don't want to use take_nd which will coerce to float + # instead, directly construct the result with a + # max(np.uint64) as the missing value indicator + # + # TODO: GH 15362 + + mask = c.isnull() + if len(hashed): + result = hashed.take(c.codes) + else: + result = np.zeros(len(mask), dtype='uint64') + + if mask.any(): + result[mask] = np.iinfo(np.uint64).max + + return result def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): diff --git a/pandas/types/cast.py b/pandas/types/cast.py index 6b1c3f9c00351..b1a17df64aecf 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -12,7 +12,8 @@ is_datetime64tz_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_dtype_equal, is_float_dtype, is_complex_dtype, - is_integer_dtype, is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_datetime_or_timedelta_dtype, is_bool_dtype, is_scalar, _string_dtypes, _coerce_to_dtype,
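
A note for reviewers on the mechanics above: the new ``MultiIndexEngine``/``MultiIndexHashTable`` pair resolves labels by hashing each tuple of the ``MultiIndex`` to a uint64, mapping hash -> position in a khash table, and verifying any hit against the real tuple (``_check_for_collisions``) so a hash collision raises instead of silently returning a wrong location. A minimal pure-Python sketch of that flow, assuming the 0.20-era module path ``pandas.tools.hashing``; ``table`` and ``toy_get_loc`` are invented names for illustration, not part of the patch:

    import numpy as np
    import pandas as pd
    from pandas.tools.hashing import hash_tuples  # 0.20-era location

    mi = pd.MultiIndex.from_product([np.arange(1000), np.arange(1000)],
                                    names=['one', 'two'])

    # hash -> position, as MultiIndexHashTable.map_locations does with khash
    table = {h: i for i, h in enumerate(hash_tuples(mi))}

    def toy_get_loc(key):
        # mirrors MultiIndexHashTable.get_item: hash the key, look it up,
        # then compare the stored tuple so a collision surfaces as an error
        loc = table[hash_tuples([key])[0]]  # KeyError if the hash is absent
        if mi[loc] != key:
            raise AssertionError('hash collision')
        return loc

    assert toy_get_loc((999, 19)) == mi.get_loc((999, 19))

This is also why ``test_hash_collisions`` above exercises the full 1000x1000 product: correctness rests on the collision check, not on the hash being collision-free.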
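Similarly, the ``_hash_categorical`` rewrite avoids ``take_nd`` (which would coerce the uint64 hashes to float in order to hold NaN) by taking with the raw codes and then overwriting the missing slots with ``np.iinfo(np.uint64).max`` as a sentinel. A NumPy-only re-enactment with made-up hash values:

    import numpy as np

    # stand-ins for the hashed categories and a categorical's codes;
    # code -1 marks a missing value, as in pandas Categoricals
    category_hashes = np.array([11, 22, 33], dtype=np.uint64)
    codes = np.array([0, -1, 2, 1], dtype=np.intp)

    result = category_hashes.take(codes)  # -1 wraps to the last hash...
    result[codes == -1] = np.iinfo(np.uint64).max  # ...then the sentinel

    print(result)  # [11 18446744073709551615 33 22]

Keeping everything uint64 end to end is what lets ``map_locations`` ingest the hashes directly, with no intermediate float pass.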