
ENH: cython refactor, pushing more index logic into engines.pyx, broken unit tests, a mess still
1 parent b50076e · commit 5eecdb2dd94719e1fd097ce3fb046697445a3d7f · wesm committed Apr 4, 2012
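The gist of the refactor: per-dtype index engines in engines.pyx take over the label hash table plus the monotonicity and uniqueness checks that were previously loose Cython functions hung on each Index subclass; each subclass now just names its engine via `_engine_type`. A rough pure-Python sketch of the shape, with hypothetical names (not the actual Cython code):

    import weakref

    class ObjectEngineSketch(object):
        """Sketch of an index engine: owns the label -> location mapping
        and computes the monotonic/unique flags on demand."""

        def __init__(self, index_ref):
            self._index = index_ref   # weakref to the owning Index
            self._mapping = None      # built lazily, on first lookup

        @property
        def is_monotonic(self):
            vals = self._index().values
            return all(a <= b for a, b in zip(vals, vals[1:]))

        @property
        def is_unique(self):
            vals = self._index().values
            return len(set(vals)) == len(vals)

        def get_loc(self, key):
            if self._mapping is None:
                self._mapping = {v: i for i, v in enumerate(self._index().values)}
            return self._mapping[key]

        def clear_mapping(self):
            self._mapping = None

    class FakeIndex(object):
        def __init__(self, values):
            self.values = values

    idx = FakeIndex(['a', 'b', 'c'])
    eng = ObjectEngineSketch(weakref.ref(idx))
    print(eng.get_loc('b'), eng.is_monotonic, eng.is_unique)   # 1 True True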
@@ -1,176 +0,0 @@
-import time
-
-import numpy as np
-
-from pandas import Series, Index, isnull
-import pandas.lib.tseries as tseries
-from pandas.util.testing import assert_almost_equal, assert_dict_equal
-
-def _timeit(f, n=10):
- _s = time.clock()
- for i in xrange(n):
- f()
-
- return (time.clock() - _s) / n
-
-def bench_reindex():
- K = 100000
- index = Index(np.arange(K))
- values = np.arange(float(K))
- obj_vals = values.astype(object)
-
- new_index = np.arange(K)
- np.random.shuffle(new_index)
- new_index = Index(new_index)
-
- f = lambda: tseries.reindex(new_index, values, index.indexMap)
- print 'tseries.reindex: %.2f ms per iteration' % (_timeit(f, n=50) * 1000)
-
- def _test():
- filler, mask = tseries.getMergeVec(new_index, index.indexMap)
- result = values.take(filler)
- np.putmask(result, -mask, np.NaN)
-
- return result
-
- timing = _timeit(_test, n=50) * 1000
- print 'getMergeVec method: %.2f ms per iteration' % timing
-
- f2 = lambda: tseries.reindexObj(new_index, values, index.indexMap)
- print ('tseries.reindexObj with floats: %.2f ms per iteration'
- % (_timeit(f2, n=50) * 1000))
-
- f3 = lambda: tseries.reindexObj(new_index, obj_vals, index.indexMap)
- print ('tseries.reindexObj with objects: %.2f ms per iteration'
- % (_timeit(f3, n=50) * 1000))
-
- f4 = lambda: tseries.reindexObject(new_index, obj_vals, index.indexMap)
- print ('tseries.reindexObject buffers: %.2f ms per iteration'
- % (_timeit(f4, n=50) * 1000))
-
- def _test2():
- filler, mask = tseries.getMergeVec(new_index, index.indexMap)
- result = obj_vals.take(filler)
- np.putmask(result, -mask, np.NaN)
-
- return result
-
- timing = _timeit(_test2, n=50) * 1000
- print 'getMergeVec method: %.2f ms per iteration' % timing
-
- assert_almost_equal(_test(), f())
- assert_almost_equal(f2(), f3())
- assert_almost_equal(f3(), f4())
- assert_almost_equal(f2(), f4())
- assert_almost_equal(f2(), _test2())
-
-
-def _isnan(obj):
- return obj != obj
-
-def test_groupby():
- mapping = Series({
- 1 : 2.,
- 2 : 2.,
- 3 : np.NaN,
- 4 : np.NaN,
- 5 : 3.,
- 6 : 3.,
- 7 : np.NaN
- })
-
- index = Index([1, 2, 3, 4, 5, 6, 7])
-
- expected = {
- 2 : [1, 2],
- 3 : [5, 6],
- np.NaN : [3, 4, 7]
- }
-
- def compare_with_null(d1, d2):
- d1_nulls = None
- d2_nulls = None
- for k, v in d1.iteritems():
- if _isnan(k):
- d1_nulls = v
- else:
- assert(k in d2)
- assert(np.array_equal(v, d2[k]))
-
- for k, v in d2.iteritems():
- if _isnan(k):
- d2_nulls = v
- else:
- assert(k in d1)
-
- if d1_nulls is not None or d2_nulls is not None:
- assert(np.array_equal(d1_nulls, d2_nulls))
-
- grouped = tseries.groupby(index, mapping.get)
- compare_with_null(grouped, expected)
-
-def groupby_nocython(index, mapper, output=None):
- if output is None:
- result = {}
- else:
- result = output
-
- index = np.asarray(index)
- mapped_index = np.array([mapper(x) for x in index])
-
- # A little hack here
- if issubclass(mapped_index.dtype.type, basestring):
- mapped_index = mapped_index.astype(object)
-
- mask = isnull(mapped_index)
- nullkeys = index[mask]
-
- if nullkeys is not None and len(nullkeys) > 0:
- result[np.NaN] = nullkeys
-
- notmask = -mask
- index = index[notmask]
- mapped_index = mapped_index[notmask]
-
- for idx, key in zip(index, mapped_index):
- result.setdefault(key, []).append(idx)
-
- return result
-
-def bench_groupby():
- N = 200
-
- arr = np.arange(10000).astype(object)
- values = np.random.randn(10000)
- keys = arr // 10
- d = dict(zip(arr, keys))
-
- f = lambda: groupby_nocython(arr, d.get)
- print 'no cython: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- f = lambda: tseries.arrmap(arr, d.get)
- timing = _timeit(f, n=N) * 1000
- print 'arrmap: %.2f ms per iteration' % timing
-
- f = lambda: isnull(tseries.arrmap(arr, d.get))
- print 'isnull: %.2f ms per iteration' % (_timeit(f, n=N) * 1000 - timing)
-
- f = lambda: tseries.groupby(arr, d.get)
- print 'groupby: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- f = lambda: tseries.groupby_indices(arr, d.get)
- print 'groupby_inds: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- def _test():
- groups = tseries.groupby_indices(arr, d.get)
-
- result = {}
- for k, v in groups.iteritems():
- result[k] = np.mean(values.take(v))
-
- return result
-
- print 'test: %.2f ms per iteration' % (_timeit(_test, n=N) * 1000)
-
-def bench_map_indices():
- pass
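The file deleted above is a Python 2-era benchmark (`print` statements, `xrange`, `time.clock`) written against the pre-refactor `indexMap`/`reindex`/`getMergeVec` API that this commit removes. A rough modern equivalent of its reindex timing, sketched against current pandas rather than the deleted `tseries` routines:

    import timeit

    import numpy as np
    import pandas as pd

    K = 100000
    index = pd.Index(np.arange(K))
    values = np.arange(float(K))

    new_index = np.arange(K)
    np.random.shuffle(new_index)
    new_index = pd.Index(new_index)

    # Series.reindex now routes through the index engine internally
    ser = pd.Series(values, index=index)
    per_iter = timeit.timeit(lambda: ser.reindex(new_index), number=50) / 50
    print('reindex: %.2f ms per iteration' % (per_iter * 1000))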
@@ -3103,6 +3103,7 @@ def _apply_standard(self, func, axis, ignore_failures=False):
for k, v in series_gen:
results[k] = func(v)
except Exception, e:
+ raise # XXXXX
if hasattr(e, 'args'):
e.args = e.args + ('occurred at index %s' % str(k),)
raise
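The bare `raise # XXXXX` added here re-raises before the `e.args` enrichment below it can run, leaving the 'occurred at index' annotation unreachable; a debugging leftover consistent with the commit message's 'broken unit tests, a mess still'. For reference, the annotation pattern the hunk short-circuits works like this (standalone sketch, not the commit's code):

    def apply_standard_sketch(func, series_gen):
        results = {}
        for k, v in series_gen:
            try:
                results[k] = func(v)
            except Exception as e:
                # tack the offending key onto the exception, then re-raise
                e.args = e.args + ('occurred at index %s' % str(k),)
                raise
        return results

    try:
        apply_standard_sketch(lambda x: 1.0 / x, [('a', 1.0), ('b', 0.0)])
    except ZeroDivisionError as e:
        print(e.args)   # (..., 'occurred at index b')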
@@ -3,6 +3,7 @@
from datetime import time, datetime, date
from datetime import timedelta
from itertools import izip
+import weakref
import numpy as np
@@ -57,21 +58,26 @@ class Index(np.ndarray):
----
An Index instance can **only** contain hashable objects
"""
+ # _algos = {
+ # 'groupby' : _gin.groupby_index,
+ # }
+
+ # _map_indices = lib.map_indices_object
+
# Cython methods
- _map_indices = lib.map_indices_object
- _is_monotonic = lib.is_monotonic_object
_groupby = lib.groupby_object
_arrmap = lib.arrmap_object
_left_indexer = lib.left_join_indexer_object
_inner_indexer = lib.inner_join_indexer_object
_outer_indexer = lib.outer_join_indexer_object
- _merge_indexer = lib.merge_indexer_object
_pad = lib.pad_object
_backfill = lib.backfill_object
name = None
asi8 = None
+ _engine_type = _gin.ObjectEngine
+
def __new__(cls, data, dtype=None, copy=False, name=None):
if isinstance(data, np.ndarray):
if dtype is None:
@@ -164,25 +170,11 @@ def values(self):
@property
def is_monotonic(self):
- return self._monotonicity_check[0]
-
- @property
- def is_unique(self):
- is_unique = self._monotonicity_check[1]
-
- if is_unique is None:
- return self._engine.has_integrity
- else:
- return is_unique
+ return self._engine.is_monotonic
@cache_readonly
- def _monotonicity_check(self):
- try:
- # wrong buffer type raises ValueError
- is_monotonic, is_unique = self._is_monotonic(self.values)
- return is_monotonic, is_unique
- except TypeError:
- return False, None
+ def is_unique(self):
+ return self._engine.is_unique
def is_numeric(self):
return self.inferred_type in ['integer', 'floating']
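`is_unique` changes from a plain property into `@cache_readonly`, so the engine's uniqueness scan runs at most once per Index. A minimal sketch of what a cache_readonly-style descriptor does (a hypothetical reimplementation, not pandas' actual decorator):

    class cache_readonly_sketch(object):
        """Compute a value on first access, then serve it from a per-instance cache."""

        def __init__(self, func):
            self.func = func
            self.name = func.__name__

        def __get__(self, obj, owner=None):
            if obj is None:
                return self
            cache = obj.__dict__.setdefault('_cache', {})
            if self.name not in cache:
                cache[self.name] = self.func(obj)
            return cache[self.name]

    class Demo(object):
        @cache_readonly_sketch
        def is_unique(self):
            print('scanning...')      # happens only once
            return True

    d = Demo()
    print(d.is_unique)   # scanning... then True
    print(d.is_unique)   # True, served from the cache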
@@ -210,20 +202,13 @@ def get_duplicates(self):
_get_duplicates = get_duplicates
- @property
- def indexMap(self):
- "{label -> location}"
- return self._engine.get_mapping(1)
-
def _cleanup(self):
self._engine.clear_mapping()
@cache_readonly
def _engine(self):
- import weakref
# property, for now, slow to look up
- return _gin.DictIndexEngine(weakref.ref(self),
- self._map_indices)
+ return self._engine_type(weakref.ref(self))
def _get_level_number(self, level):
if not isinstance(level, int):
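`_engine` now hands the engine `weakref.ref(self)` (hence the import moving to the top of the module): the Index holds the engine, and the engine only weakly references the Index back, avoiding a reference cycle that would keep both objects alive. The pattern in isolation:

    import weakref

    class EngineSketch(object):
        def __init__(self, index_ref):
            self._index_ref = index_ref   # weak reference, not a strong one

        def values(self):
            index = self._index_ref()     # call the ref to dereference it
            if index is None:
                raise RuntimeError('owning index was garbage collected')
            return index.data

    class IndexSketch(object):
        def __init__(self, data):
            self.data = data
            self._engine = EngineSketch(weakref.ref(self))

    idx = IndexSketch(['a', 'b'])
    print(idx._engine.values())   # ['a', 'b']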
@@ -574,7 +559,15 @@ def get_loc(self, key):
-------
loc : int
"""
- return self._engine.get_loc(key)
+ # TODO: push all of this into Cython
+ if self.is_unique:
+ return self._engine.get_loc(key)
+ elif self.is_monotonic:
+ left = self.searchsorted(key, side='left')
+ right = self.searchsorted(key, side='right')
+ return slice(left, right)
+ else:
+ return self.values == key
def get_value(self, series, key):
"""
@@ -664,7 +657,7 @@ def _get_indexer_standard(self, other):
self.is_monotonic and other.is_monotonic):
return self._left_indexer(other, self)
else:
- return self._merge_indexer(other, self.indexMap)
+ return self._engine.get_indexer(other)
def groupby(self, to_groupby):
return self._groupby(self.values, to_groupby)
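`self._merge_indexer(other, self.indexMap)` becomes `self._engine.get_indexer(other)`: the engine's own hash table produces the positional indexer, with -1 marking labels missing from the index. The observable behavior in current pandas:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'])
    print(idx.get_indexer(['b', 'c', 'z']))   # [ 1  2 -1]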
@@ -995,16 +988,19 @@ def copy(self, order='C'):
class Int64Index(Index):
_map_indices = lib.map_indices_int64
- _is_monotonic = lib.is_monotonic_int64
+
+ # _is_monotonic = lib.is_monotonic_int64
+
_groupby = lib.groupby_int64
_arrmap = lib.arrmap_int64
_left_indexer = lib.left_join_indexer_int64
_inner_indexer = lib.inner_join_indexer_int64
_outer_indexer = lib.outer_join_indexer_int64
- _merge_indexer = lib.merge_indexer_int64
_pad = lib.pad_int64
_backfill = lib.backfill_int64
+ _engine_type = _gin.Int64Engine
+
def __new__(cls, data, dtype=None, copy=False, name=None):
if not isinstance(data, np.ndarray):
if np.isscalar(data):
@@ -1172,13 +1168,12 @@ class DatetimeIndex(Int64Index):
time on or just past end argument
"""
- _is_monotonic = _wrap_i8_function(lib.is_monotonic_int64)
+ # _is_monotonic = _wrap_i8_function(lib.is_monotonic_int64)
+
_inner_indexer = _join_i8_wrapper(lib.inner_join_indexer_int64)
_outer_indexer = _join_i8_wrapper(lib.outer_join_indexer_int64)
_left_indexer = _join_i8_wrapper(lib.left_join_indexer_int64,
with_indexers=False)
- _merge_indexer = _join_i8_wrapper(lib.merge_indexer_int64,
- with_indexers=False)
_map_indices = _wrap_i8_function(lib.map_indices_int64)
_pad = _wrap_i8_function(lib.pad_int64)
_backfill = _wrap_i8_function(lib.backfill_int64)
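DatetimeIndex can reuse the int64 kernels because its values are int64 nanoseconds under the hood; the `_wrap_i8_function` and `_join_i8_wrapper` helpers view the data as i8 before dispatching, and the new `_gin.DatetimeEngine` below plays the same trick. Visible in current pandas via `asi8`:

    import pandas as pd

    dti = pd.date_range('2012-04-04', periods=3)
    print(dti.dtype)   # datetime64[ns]
    print(dti.asi8)    # the same data viewed as int64 nanoseconds since the epoch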
@@ -1199,6 +1194,10 @@ class DatetimeIndex(Int64Index):
# structured array cache for datetime fields
_sarr_cache = None
+ _engine_type = _gin.DatetimeEngine
+
+ offset = None
+
def __new__(cls, data=None,
freq=None, start=None, end=None, periods=None,
dtype=None, copy=False, name=None, tz=None,
@@ -1437,7 +1436,6 @@ def asobject(self):
"""
Unbox to an index of type object
"""
- offset = getattr(self, 'offset', None)
boxed_values = _dt_box_array(self.asi8, self.offset, self.tz)
return Index(boxed_values, dtype=object)
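With `offset = None` promoted to a class attribute earlier in the hunk, `asobject` can read `self.offset` directly instead of the defensive `getattr`. `asobject` boxes each raw i8 value into a Timestamp; in current pandas the same round-trip is spelled `astype(object)`:

    import pandas as pd

    dti = pd.date_range('2012-04-04', periods=2, tz='UTC')
    obj_idx = dti.astype(object)        # each element boxed to a Timestamp
    print(type(obj_idx[0]).__name__)    # Timestamp
    print(obj_idx[0].tz)                # UTC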
@@ -1656,7 +1654,7 @@ def get_value(self, series, key):
know what you're doing
"""
try:
- return super(DatetimeIndex, self).get_value(series, key)
+ return Index.get_value(self, series, key)
except KeyError:
try:
asdt, parsed, reso = datetools.parse_time_string(key)
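The `super(DatetimeIndex, self).get_value(...)` call becomes a direct `Index.get_value` call, skipping the Int64Index step in the MRO, while string keys still fall back to `parse_time_string` on KeyError. That fallback is what makes (partial) string lookups on a datetime-indexed Series work in current pandas:

    import pandas as pd

    s = pd.Series([1.0, 2.0], index=pd.date_range('2012-04-01', periods=2))
    print(s['2012-04-01'])   # 1.0, the string key is parsed to a datetime
    print(s['2012-04'])      # partial string: every row in April 2012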
@@ -1698,8 +1696,7 @@ def __getitem__(self, key):
arr_idx = self.view(np.ndarray)
if np.isscalar(key):
val = arr_idx[key]
- offset = getattr(self, 'offset', None)
- return _dt_box(val, offset=offset, tz=self.tz)
+ return _dt_box(val, offset=self.offset, tz=self.tz)
else:
new_offset = None
if (type(key) == slice):
@@ -1814,11 +1811,6 @@ def dtype(self):
def is_all_dates(self):
return True
- @cache_readonly
- def _engine(self):
- mapping = lib.map_indices_int64
- return _gin.DictIndexEngineDatetime(self.asi8, mapping)
-
def equals(self, other):
"""
Determines if two Index objects contain the same elements.
@@ -2977,7 +2969,7 @@ def get_indexer(self, target, method=None):
indexer = self._backfill(self_index, target_index,
self_index.indexMap, target.indexMap)
else:
- indexer = self._merge_indexer(target_index, self_index.indexMap)
+ indexer = self._engine.get_indexer(target_index)
return indexer