Skip to content

Commit

Permalink
PERF: high memory in MI
Browse files Browse the repository at this point in the history
closes #13904

Creates an efficient MultiIndexHashTable in cython.
This allows us to efficiently store a multi-index for fast indexing
(.get_loc() and .get_indexer()), replacing the current tuple-based
(and GIL-holding) use of the PyObject hash table.

This uses the pandas.tools.hashing routines to hash each of the 'values' of a MI
down to a single uint64. This makes a MI more memory-friendly and much
more efficient overall; the speedup comes chiefly from the much
cheaper construction of the hash table.

Author: Jeff Reback <jeff@reback.net>

Closes #15245 from jreback/mi and squashes the following commits:

7df6c34 [Jeff Reback] PERF: high memory in MI
  • Loading branch information
jreback committed Feb 15, 2017
1 parent b261dfe commit e351ed0
Show file tree
Hide file tree
Showing 22 changed files with 605 additions and 125 deletions.
30 changes: 28 additions & 2 deletions asv_bench/benchmarks/indexing.py
Expand Up @@ -88,7 +88,7 @@ def setup(self):

def time_getitem_scalar(self):
    # Benchmark scalar ``__getitem__`` on a Series keyed by self.dt.
    # NOTE(review): asv times exactly this statement, so the body must stay
    # a single bare indexing expression.
    self.ts[self.dt]


class DataFrameIndexing(object):
goal_time = 0.2
Expand Down Expand Up @@ -189,6 +189,15 @@ def setup(self):
self.eps_C = 5
self.eps_D = 5000
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
self.miint = MultiIndex.from_product(
[np.arange(1000),
np.arange(1000)], names=['one', 'two'])

import string
self.mistring = MultiIndex.from_product(
[np.arange(1000),
np.arange(20), list(string.ascii_letters)],
names=['one', 'two', 'three'])

def time_series_xs_mi_ix(self):
    # Benchmark .ix label lookup (cross-section) on a MultiIndexed Series.
    # NOTE(review): .ix is the historical indexer this benchmark tracks;
    # do not modernize the timed statement.
    self.s.ix[999]
Expand All @@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self):
self.df.ix[999]

def time_multiindex_slicers(self):
    # Benchmark .loc slicing over all four levels of a MultiIndex using an
    # IndexSlice (self.idx); each level is sliced to a +/- eps window
    # around the test value chosen in setup().
    self.mdt2.loc[self.idx[
        (self.test_A - self.eps_A):(self.test_A + self.eps_A),
        (self.test_B - self.eps_B):(self.test_B + self.eps_B),
        (self.test_C - self.eps_C):(self.test_C + self.eps_C),
        (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]

def time_multiindex_get_indexer(self):
    # Benchmark MultiIndex.get_indexer with an object-dtype ndarray of
    # 10 tuple keys against the (1000 x 1000) integer MultiIndex built
    # in setup() (self.miint).
    self.miint.get_indexer(
        np.array([(0, 10), (0, 11), (0, 12),
                  (0, 13), (0, 14), (0, 15),
                  (0, 16), (0, 17), (0, 18),
                  (0, 19)], dtype=object))

def time_multiindex_string_get_loc(self):
    # Benchmark MultiIndex.get_loc on the mixed (int, int, str) 3-level
    # index built in setup() (self.mistring), using a key near the end.
    self.mistring.get_loc((999, 19, 'Z'))

def time_is_monotonic(self):
    # Benchmark the is_monotonic property on a large integer MultiIndex;
    # the attribute access itself is the timed operation.
    self.miint.is_monotonic


class PanelIndexing(object):
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/reindex.py
Expand Up @@ -16,8 +16,8 @@ def setup(self):
data=np.random.rand(10000, 30), columns=range(30))

# multi-index
N = 1000
K = 20
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
level2 = np.tile(tm.makeStringIndex(K).values, N)
index = MultiIndex.from_arrays([level1, level2])
Expand Down
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Expand Up @@ -472,7 +472,7 @@ Performance Improvements
- Improved performance of timeseries plotting with an irregular DatetimeIndex
(or with ``compat_x=True``) (:issue:`15073`).
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)

- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.


Expand Down Expand Up @@ -502,6 +502,8 @@ Bug Fixes
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)



- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/algorithms.py
Expand Up @@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
indexer = np.arange(arr.shape[axis], dtype=np.int64)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = _ensure_int64(indexer)
indexer = _ensure_int64(indexer, copy=False)
if not allow_fill:
dtype, fill_value = arr.dtype, arr.dtype.type()
mask_info = None, False
Expand Down Expand Up @@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
mask_info=mask_info)
indexer = _ensure_int64(indexer)
func(arr, indexer, out, fill_value)

if flip_order:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Expand Up @@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier):
# all cases (e.g., it misses categorical data even with object
# categories)
deep = False
if 'object' in counts or is_object_dtype(self.index):
if ('object' in counts or
self.index._is_memory_usage_qualified()):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: %s\n" %
Expand Down
8 changes: 8 additions & 0 deletions pandas/hashtable.pxd
Expand Up @@ -31,6 +31,14 @@ cdef class PyObjectHashTable(HashTable):
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)

cdef class MultiIndexHashTable(HashTable):
    # Hash table for a MultiIndex: keys are uint64 hashes of the index
    # entries rather than Python tuples, so lookups avoid tuple
    # construction and PyObject hashing.
    cdef:
        kh_uint64_t *table  # khash table keyed by the uint64 label hashes
        object mi           # back-reference to the owning MultiIndex

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)

cdef class StringHashTable(HashTable):
cdef kh_str_t *table

Expand Down
39 changes: 36 additions & 3 deletions pandas/index.pyx
Expand Up @@ -182,7 +182,7 @@ cdef class IndexEngine:
Py_ssize_t i, n
int last_true

values = self._get_index_values()
values = np.array(self._get_index_values(), copy=False)
n = len(values)

result = np.empty(n, dtype=bool)
Expand Down Expand Up @@ -284,7 +284,6 @@ cdef class IndexEngine:
if not self.is_mapping_populated:

values = self._get_index_values()

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)

Expand Down Expand Up @@ -322,7 +321,7 @@ cdef class IndexEngine:
Py_ssize_t i, j, n, n_t, n_alloc

self._ensure_mapping_populated()
values = self._get_index_values()
values = np.array(self._get_index_values(), copy=False)
stargets = set(targets)
n = len(values)
n_t = len(targets)
Expand Down Expand Up @@ -554,5 +553,39 @@ cdef inline bint _is_utc(object tz):
return tz is UTC or isinstance(tz, _du_utc)


cdef class MultiIndexEngine(IndexEngine):
    # Index engine for a MultiIndex, backed by MultiIndexHashTable, which
    # stores uint64 hashes of the index entries instead of Python tuples.

    def _call_monotonic(self, object mi):
        # defer these back to the mi itself
        return (mi.is_monotonic_increasing,
                mi.is_monotonic_decreasing,
                mi.is_unique)

    def get_backfill_indexer(self, other, limit=None):
        # we coerce to ndarray-of-tuples
        values = np.array(self._get_index_values())
        return algos.backfill_object(values, other, limit=limit)

    def get_pad_indexer(self, other, limit=None):
        # we coerce to ndarray-of-tuples
        values = np.array(self._get_index_values())
        return algos.pad_object(values, other, limit=limit)

    cpdef get_loc(self, object val):
        # Return the location(s) of ``val`` in the index; raises KeyError
        # when the key is absent and TypeError for definitely-invalid keys.
        if is_definitely_invalid_key(val):
            raise TypeError("'{val}' is an invalid key".format(val=val))

        self._ensure_mapping_populated()
        if not self.unique:
            # duplicate entries: fall back to the generic duplicate path
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except TypeError:
            # the hash table raises TypeError for keys it cannot hash /
            # match; surface that to the caller as a missing key
            raise KeyError(val)

    cdef _make_hash_table(self, n):
        # use the hash-based MultiIndex table, not PyObjectHashTable
        return _hash.MultiIndexHashTable(n)

# Generated from template.
include "index_class_helper.pxi"
5 changes: 4 additions & 1 deletion pandas/indexes/base.py
Expand Up @@ -1431,6 +1431,10 @@ def inferred_type(self):
""" return a string of the type inferred from the values """
return lib.infer_dtype(self)

def _is_memory_usage_qualified(self):
    """Return True when a qualified (``+``) ``.info()`` display is needed.

    An object-dtype index holds arbitrary Python objects, so a shallow
    memory-usage total understates the real footprint and is flagged
    with a ``+`` qualifier by the caller.
    """
    needs_qualifier = self.is_object()
    return needs_qualifier

def is_type_compatible(self, kind):
return kind == self.inferred_type

Expand Down Expand Up @@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
'if index and target are monotonic' % method)

side = 'left' if method == 'pad' else 'right'
target = np.asarray(target)

# find exact matches first (this simplifies the algorithm)
indexer = self.get_indexer(target)
Expand Down

0 comments on commit e351ed0

Please sign in to comment.