ENH: cython refactor, pushing more index logic into engines.pyx, broken unit tests, a mess still
commit 5eecdb2dd94719e1fd097ce3fb046697445a3d7f (1 parent: b50076e)
@wesm authored
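In outline, the refactor replaces the dict-based DictIndexEngine with per-dtype engine classes (ObjectEngine, Int64Engine, Float64Engine, DatetimeEngine) that own a khash-backed table and lazily cache uniqueness and monotonicity. A rough pure-Python sketch of that design (names mirror the diff below; the bodies are illustrative, not the actual Cython implementation):

    import numpy as np

    class IndexEngine:
        # Rough Python analogue of the Cython base class in engines.pyx:
        # a weakref back to the owning Index, a lazily built hash table,
        # and cached uniqueness/monotonicity flags.
        def __init__(self, index_weakref):
            self.index_weakref = index_weakref
            self.mapping = None          # {label -> position}, built lazily
            self._unique = None
            self._monotonic = None

        def _get_index_values(self):
            return self.index_weakref().values

        def _ensure_mapping_populated(self):
            if self.mapping is None:
                values = self._get_index_values()
                self.mapping = dict(zip(values, range(len(values))))
                self._unique = len(self.mapping) == len(values)

        @property
        def is_unique(self):
            self._ensure_mapping_populated()
            return self._unique

        @property
        def is_monotonic(self):
            if self._monotonic is None:
                values = self._get_index_values()
                self._monotonic = bool(np.all(values[1:] >= values[:-1]))
            return self._monotonic

        def get_loc(self, key):
            self._ensure_mapping_populated()
            if not self._unique:
                raise Exception('Index values are not unique')
            return self.mapping[key]

    # Per the diff, an Index then constructs its engine as:
    #     self._engine_type(weakref.ref(self))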
176 bench/bench_tseries.py
@@ -1,176 +0,0 @@
-import time
-
-import numpy as np
-
-from pandas import Series, Index, isnull
-import pandas.lib.tseries as tseries
-from pandas.util.testing import assert_almost_equal, assert_dict_equal
-
-def _timeit(f, n=10):
- _s = time.clock()
- for i in xrange(n):
- f()
-
- return (time.clock() - _s) / n
-
-def bench_reindex():
- K = 100000
- index = Index(np.arange(K))
- values = np.arange(float(K))
- obj_vals = values.astype(object)
-
- new_index = np.arange(K)
- np.random.shuffle(new_index)
- new_index = Index(new_index)
-
- f = lambda: tseries.reindex(new_index, values, index.indexMap)
- print 'tseries.reindex: %.2f ms per iteration' % (_timeit(f, n=50) * 1000)
-
- def _test():
- filler, mask = tseries.getMergeVec(new_index, index.indexMap)
- result = values.take(filler)
- np.putmask(result, -mask, np.NaN)
-
- return result
-
- timing = _timeit(_test, n=50) * 1000
- print 'getMergeVec method: %.2f ms per iteration' % timing
-
- f2 = lambda: tseries.reindexObj(new_index, values, index.indexMap)
- print ('tseries.reindexObj with floats: %.2f ms per iteration'
- % (_timeit(f2, n=50) * 1000))
-
- f3 = lambda: tseries.reindexObj(new_index, obj_vals, index.indexMap)
- print ('tseries.reindexObj with objects: %.2f ms per iteration'
- % (_timeit(f3, n=50) * 1000))
-
- f4 = lambda: tseries.reindexObject(new_index, obj_vals, index.indexMap)
- print ('tseries.reindexObject buffers: %.2f ms per iteration'
- % (_timeit(f4, n=50) * 1000))
-
- def _test2():
- filler, mask = tseries.getMergeVec(new_index, index.indexMap)
- result = obj_vals.take(filler)
- np.putmask(result, -mask, np.NaN)
-
- return result
-
- timing = _timeit(_test2, n=50) * 1000
- print 'getMergeVec method: %.2f ms per iteration' % timing
-
- assert_almost_equal(_test(), f())
- assert_almost_equal(f2(), f3())
- assert_almost_equal(f3(), f4())
- assert_almost_equal(f2(), f4())
- assert_almost_equal(f2(), _test2())
-
-
-def _isnan(obj):
- return obj != obj
-
-def test_groupby():
- mapping = Series({
- 1 : 2.,
- 2 : 2.,
- 3 : np.NaN,
- 4 : np.NaN,
- 5 : 3.,
- 6 : 3.,
- 7 : np.NaN
- })
-
- index = Index([1, 2, 3, 4, 5, 6, 7])
-
- expected = {
- 2 : [1, 2],
- 3 : [5, 6],
- np.NaN : [3, 4, 7]
- }
-
- def compare_with_null(d1, d2):
- d1_nulls = None
- d2_nulls = None
- for k, v in d1.iteritems():
- if _isnan(k):
- d1_nulls = v
- else:
- assert(k in d2)
- assert(np.array_equal(v, d2[k]))
-
- for k, v in d2.iteritems():
- if _isnan(k):
- d2_nulls = v
- else:
- assert(k in d1)
-
- if d1_nulls is not None or d2_nulls is not None:
- assert(np.array_equal(d1_nulls, d2_nulls))
-
- grouped = tseries.groupby(index, mapping.get)
- compare_with_null(grouped, expected)
-
-def groupby_nocython(index, mapper, output=None):
- if output is None:
- result = {}
- else:
- result = output
-
- index = np.asarray(index)
- mapped_index = np.array([mapper(x) for x in index])
-
- # A little hack here
- if issubclass(mapped_index.dtype.type, basestring):
- mapped_index = mapped_index.astype(object)
-
- mask = isnull(mapped_index)
- nullkeys = index[mask]
-
- if nullkeys is not None and len(nullkeys) > 0:
- result[np.NaN] = nullkeys
-
- notmask = -mask
- index = index[notmask]
- mapped_index = mapped_index[notmask]
-
- for idx, key in zip(index, mapped_index):
- result.setdefault(key, []).append(idx)
-
- return result
-
-def bench_groupby():
- N = 200
-
- arr = np.arange(10000).astype(object)
- values = np.random.randn(10000)
- keys = arr // 10
- d = dict(zip(arr, keys))
-
- f = lambda: groupby_nocython(arr, d.get)
- print 'no cython: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- f = lambda: tseries.arrmap(arr, d.get)
- timing = _timeit(f, n=N) * 1000
- print 'arrmap: %.2f ms per iteration' % timing
-
- f = lambda: isnull(tseries.arrmap(arr, d.get))
- print 'isnull: %.2f ms per iteration' % (_timeit(f, n=N) * 1000 - timing)
-
- f = lambda: tseries.groupby(arr, d.get)
- print 'groupby: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- f = lambda: tseries.groupby_indices(arr, d.get)
- print 'groupby_inds: %.2f ms per iteration' % (_timeit(f, n=N) * 1000)
-
- def _test():
- groups = tseries.groupby_indices(arr, d.get)
-
- result = {}
- for k, v in groups.iteritems():
- result[k] = np.mean(values.take(v))
-
- return result
-
- print 'test: %.2f ms per iteration' % (_timeit(_test, n=N) * 1000)
-
-def bench_map_indices():
- pass
1  pandas/core/frame.py
@@ -3103,6 +3103,7 @@ def _apply_standard(self, func, axis, ignore_failures=False):
for k, v in series_gen:
results[k] = func(v)
except Exception, e:
+ raise # XXXXX
if hasattr(e, 'args'):
e.args = e.args + ('occurred at index %s' % str(k),)
raise
82 pandas/core/index.py
@@ -3,6 +3,7 @@
from datetime import time, datetime, date
from datetime import timedelta
from itertools import izip
+import weakref
import numpy as np
@@ -57,21 +58,26 @@ class Index(np.ndarray):
----
An Index instance can **only** contain hashable objects
"""
+ # _algos = {
+ # 'groupby' : _gin.groupby_index,
+ # }
+
+ # _map_indices = lib.map_indices_object
+
# Cython methods
- _map_indices = lib.map_indices_object
- _is_monotonic = lib.is_monotonic_object
_groupby = lib.groupby_object
_arrmap = lib.arrmap_object
_left_indexer = lib.left_join_indexer_object
_inner_indexer = lib.inner_join_indexer_object
_outer_indexer = lib.outer_join_indexer_object
- _merge_indexer = lib.merge_indexer_object
_pad = lib.pad_object
_backfill = lib.backfill_object
name = None
asi8 = None
+ _engine_type = _gin.ObjectEngine
+
def __new__(cls, data, dtype=None, copy=False, name=None):
if isinstance(data, np.ndarray):
if dtype is None:
@@ -164,25 +170,11 @@ def values(self):
@property
def is_monotonic(self):
- return self._monotonicity_check[0]
-
- @property
- def is_unique(self):
- is_unique = self._monotonicity_check[1]
-
- if is_unique is None:
- return self._engine.has_integrity
- else:
- return is_unique
+ return self._engine.is_monotonic
@cache_readonly
- def _monotonicity_check(self):
- try:
- # wrong buffer type raises ValueError
- is_monotonic, is_unique = self._is_monotonic(self.values)
- return is_monotonic, is_unique
- except TypeError:
- return False, None
+ def is_unique(self):
+ return self._engine.is_unique
def is_numeric(self):
return self.inferred_type in ['integer', 'floating']
@@ -210,20 +202,13 @@ def get_duplicates(self):
_get_duplicates = get_duplicates
- @property
- def indexMap(self):
- "{label -> location}"
- return self._engine.get_mapping(1)
-
def _cleanup(self):
self._engine.clear_mapping()
@cache_readonly
def _engine(self):
- import weakref
# property, for now, slow to look up
- return _gin.DictIndexEngine(weakref.ref(self),
- self._map_indices)
+ return self._engine_type(weakref.ref(self))
def _get_level_number(self, level):
if not isinstance(level, int):
@@ -574,7 +559,15 @@ def get_loc(self, key):
-------
loc : int
"""
- return self._engine.get_loc(key)
+ # TODO: push all of this into Cython
+ if self.is_unique:
+ return self._engine.get_loc(key)
+ elif self.is_monotonic:
+ left = self.searchsorted(key, side='left')
+ right = self.searchsorted(key, side='right')
+ return slice(left, right)
+ else:
+ return self.values == key
def get_value(self, series, key):
"""
@@ -664,7 +657,7 @@ def _get_indexer_standard(self, other):
self.is_monotonic and other.is_monotonic):
return self._left_indexer(other, self)
else:
- return self._merge_indexer(other, self.indexMap)
+ return self._engine.get_indexer(other)
def groupby(self, to_groupby):
return self._groupby(self.values, to_groupby)
@@ -995,16 +988,19 @@ def copy(self, order='C'):
class Int64Index(Index):
_map_indices = lib.map_indices_int64
- _is_monotonic = lib.is_monotonic_int64
+
+ # _is_monotonic = lib.is_monotonic_int64
+
_groupby = lib.groupby_int64
_arrmap = lib.arrmap_int64
_left_indexer = lib.left_join_indexer_int64
_inner_indexer = lib.inner_join_indexer_int64
_outer_indexer = lib.outer_join_indexer_int64
- _merge_indexer = lib.merge_indexer_int64
_pad = lib.pad_int64
_backfill = lib.backfill_int64
+ _engine_type = _gin.Int64Engine
+
def __new__(cls, data, dtype=None, copy=False, name=None):
if not isinstance(data, np.ndarray):
if np.isscalar(data):
@@ -1172,13 +1168,12 @@ class DatetimeIndex(Int64Index):
time on or just past end argument
"""
- _is_monotonic = _wrap_i8_function(lib.is_monotonic_int64)
+ # _is_monotonic = _wrap_i8_function(lib.is_monotonic_int64)
+
_inner_indexer = _join_i8_wrapper(lib.inner_join_indexer_int64)
_outer_indexer = _join_i8_wrapper(lib.outer_join_indexer_int64)
_left_indexer = _join_i8_wrapper(lib.left_join_indexer_int64,
with_indexers=False)
- _merge_indexer = _join_i8_wrapper(lib.merge_indexer_int64,
- with_indexers=False)
_map_indices = _wrap_i8_function(lib.map_indices_int64)
_pad = _wrap_i8_function(lib.pad_int64)
_backfill = _wrap_i8_function(lib.backfill_int64)
@@ -1199,6 +1194,10 @@ class DatetimeIndex(Int64Index):
# structured array cache for datetime fields
_sarr_cache = None
+ _engine_type = _gin.DatetimeEngine
+
+ offset = None
+
def __new__(cls, data=None,
freq=None, start=None, end=None, periods=None,
dtype=None, copy=False, name=None, tz=None,
@@ -1437,7 +1436,6 @@ def asobject(self):
"""
Unbox to an index of type object
"""
- offset = getattr(self, 'offset', None)
boxed_values = _dt_box_array(self.asi8, self.offset, self.tz)
return Index(boxed_values, dtype=object)
@@ -1656,7 +1654,7 @@ def get_value(self, series, key):
know what you're doing
"""
try:
- return super(DatetimeIndex, self).get_value(series, key)
+ return Index.get_value(self, series, key)
except KeyError:
try:
asdt, parsed, reso = datetools.parse_time_string(key)
@@ -1698,8 +1696,7 @@ def __getitem__(self, key):
arr_idx = self.view(np.ndarray)
if np.isscalar(key):
val = arr_idx[key]
- offset = getattr(self, 'offset', None)
- return _dt_box(val, offset=offset, tz=self.tz)
+ return _dt_box(val, offset=self.offset, tz=self.tz)
else:
new_offset = None
if (type(key) == slice):
@@ -1814,11 +1811,6 @@ def dtype(self):
def is_all_dates(self):
return True
- @cache_readonly
- def _engine(self):
- mapping = lib.map_indices_int64
- return _gin.DictIndexEngineDatetime(self.asi8, mapping)
-
def equals(self, other):
"""
Determines if two Index objects contain the same elements.
@@ -2977,7 +2969,7 @@ def get_indexer(self, target, method=None):
indexer = self._backfill(self_index, target_index,
self_index.indexMap, target.indexMap)
else:
- indexer = self._merge_indexer(target_index, self_index.indexMap)
+ indexer = self._engine.get_indexer(target_index)
return indexer
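With this change the return type of Index.get_loc depends on the index: an integer position when the index is unique, a slice when it is merely monotonic, and a boolean mask otherwise. A usage sketch (expected results are inferred from the branch logic above; exact behavior on this work-in-progress commit may differ while tests are broken):

    from pandas import Index

    Index(['a', 'b', 'c']).get_loc('b')        # unique -> integer: 1

    # monotonic but non-unique -> slice over all occurrences,
    # found via searchsorted on the left and right sides
    Index(['a', 'b', 'b', 'c']).get_loc('b')   # slice(1, 3, None)

    # neither -> boolean mask over the values
    Index(['b', 'a', 'b']).get_loc('b')        # array([ True, False,  True])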
1  pandas/core/series.py
@@ -2602,7 +2602,6 @@ def _resolve_offset(freq, kwds):
if warn:
import warnings
- foo
warnings.warn("'timeRule' and 'offset' parameters are deprecated,"
" please use 'freq' instead",
FutureWarning)
1  pandas/src/data_algos.pyx
@@ -0,0 +1 @@
+
310 pandas/src/engines.pyx
@@ -1,5 +1,12 @@
from numpy cimport ndarray
+
+cdef inline is_definitely_invalid_key(object val):
+ return PySlice_Check(val) or cnp.PyArray_Check(val)
+
+from numpy cimport float64_t, int32_t, int64_t, uint8_t
+cimport cython
+
cimport numpy as cnp
cnp.import_array()
@@ -9,6 +16,9 @@ cimport util
import numpy as np
+import _tseries
+
+include "hashtable.pyx"
cdef extern from "datetime.h":
bint PyDateTime_Check(object o)
@@ -27,6 +37,26 @@ def set_value_at(ndarray arr, object loc, object val):
cdef class IndexEngine:
+ cdef readonly:
+ object index_weakref
+ HashTable mapping
+
+ cdef:
+ bint unique, monotonic
+ bint initialized, monotonic_check, unique_check
+
+ def __init__(self, index_weakref):
+ self.index_weakref = index_weakref
+ self.initialized = 0
+ self.monotonic_check = 0
+
+ self.unique = 0
+ self.monotonic = 0
+
+ def __contains__(self, object val):
+ self._ensure_mapping_populated()
+ return val in self.mapping
+
cpdef get_value(self, ndarray arr, object key):
'''
arr : 1-dimensional ndarray
@@ -49,77 +79,178 @@ cdef class IndexEngine:
loc = self.get_loc(key)
util.set_value_at(arr, loc, value)
-cdef class DictIndexEngine(IndexEngine):
- '''
- For accelerating low-level internal details of indexing
- '''
+ property is_unique:
- cdef readonly:
- object index_weakref
- dict mapping
- object mapfun
+ def __get__(self):
+ if not self.unique_check:
+ self._do_unique_check()
- cdef:
- bint initialized, integrity
+ return self.unique == 1
- def __init__(self, index_weakref, object mapfun):
- self.index_weakref = index_weakref
- self.initialized = 0
- self.integrity = 0
- self.mapfun = mapfun
+ property is_monotonic:
- def __contains__(self, object val):
- self._ensure_initialized()
- return val in self.mapping
+ def __get__(self):
+ if not self.monotonic_check:
+ self._do_monotonic_check()
- cpdef get_mapping(self, bint check_integrity):
- self._ensure_initialized()
- if check_integrity and self.integrity == 0:
- raise Exception('Index cannot contain duplicate values!')
+ return self.monotonic == 1
- return self.mapping
+ cdef inline _do_monotonic_check(self):
+ try:
+ values = self._get_index_values()
+ self.monotonic, self.unique = self._call_monotonic(values)
+ except TypeError:
+ self.monotonic = 0
+ self.monotonic_check = 1
- def clear_mapping(self):
- self.mapping = None
- self.initialized = 0
- self.integrity = 0
+ cdef _get_index_values(self):
+ return self.index_weakref().values
- cdef inline _ensure_initialized(self):
- if not self.initialized:
- self.initialize()
+ cdef inline _do_unique_check(self):
+ self._ensure_mapping_populated()
- property mapping_prop:
+ def _call_monotonic(self, values):
+ raise NotImplementedError
- def __get__(self):
- self._ensure_initialized()
- return self.mapfun
+ cdef _make_hash_table(self, n):
+ raise NotImplementedError
- property has_integrity:
+ cpdef get_loc(self, object val):
+ if is_definitely_invalid_key(val):
+ raise TypeError
- def __get__(self):
- self._ensure_initialized()
- return self.integrity == 1
+ self._ensure_mapping_populated()
+ if not self.unique:
+ raise Exception('Index values are not unique')
+
+ return self.mapping.get_item(val)
+
+ cdef inline _ensure_mapping_populated(self):
+ if not self.initialized:
+ self.initialize()
cdef initialize(self):
- values = self.index_weakref().values
- self.mapping = self.mapfun(values)
+ values = self._get_index_values()
+
+ self.mapping = self._make_hash_table(len(values))
+ self.mapping.map_locations(values)
+
if len(self.mapping) == len(values):
- self.integrity = 1
+ self.unique = 1
+ self.unique_check = 1
+
self.initialized = 1
- cpdef get_loc(self, object val):
- if is_definitely_invalid_key(val):
- raise TypeError
+ def clear_mapping(self):
+ self.mapping = None
+ self.initialized = 0
+
+ def get_indexer(self, values):
+ self._ensure_mapping_populated()
+ return self.mapping.lookup(values)
+
+ def get_pad_indexer(self, values):
+ pass
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_object(ndarray[object] old, ndarray[object] new, limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef object cur, prev
+ cdef int lim
+
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
+
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
+
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
+
+ i = nleft - 1
+ j = nright - 1
+
+ cur = old[nleft - 1]
+ while True:
+ if j == 0:
+ break
+
+ if i == 0:
+ while j > 0 and new[j] >= cur:
+ indexer[j] = i
+ j -= 1
+ break
+
+ prev = old[i - 1]
+
+ while j > 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
+
+ i -= 1
+ cur = prev
+
+ return indexer
+
+
+
+# @cache_readonly
+# def _monotonicity_check(self):
+# try:
+# f = self._algos['is_monotonic']
+# # wrong buffer type raises ValueError
+# return f(self.values)
+# except TypeError:
+# return False, None
- self._ensure_initialized()
- if not self.integrity:
- raise Exception('Index values are not unique')
- return self.mapping[val]
-cdef class DictIndexEngineDatetime(DictIndexEngine):
+
+cdef class Int64Engine(IndexEngine):
+
+ # cdef Int64HashTable mapping
+
+ cdef _make_hash_table(self, n):
+ return Int64HashTable(n)
+
+ def _call_monotonic(self, values):
+ return _tseries.is_monotonic_int64(values)
+
+cdef class Float64Engine(IndexEngine):
+
+ # cdef Float64HashTable mapping
+
+ cdef _make_hash_table(self, n):
+ return Float64HashTable(n)
+
+ def _call_monotonic(self, values):
+ return _tseries.is_monotonic_float64(values)
+
+
+cdef class ObjectEngine(IndexEngine):
+
+ # cdef PyObjectHashTable mapping
+
+ cdef _make_hash_table(self, n):
+ return PyObjectHashTable(n)
+
+ def _call_monotonic(self, values):
+ return _tseries.is_monotonic_object(values)
+
+
+cdef class DatetimeEngine(IndexEngine):
+
+ # cdef Int64HashTable mapping
def __contains__(self, object val):
- self._ensure_initialized()
+ self._ensure_mapping_populated()
if util.is_datetime64_object(val):
return val.view('i8') in self.mapping
@@ -130,12 +261,18 @@ cdef class DictIndexEngineDatetime(DictIndexEngine):
return val in self.mapping
+ cdef _make_hash_table(self, n):
+ return Int64HashTable(n)
+
+ cdef _get_index_values(self):
+ return self.index_weakref().values.view('i8')
+
cpdef get_loc(self, object val):
if is_definitely_invalid_key(val):
raise TypeError
- self._ensure_initialized()
- if not self.integrity:
+ self._ensure_mapping_populated()
+ if not self.unique:
raise Exception('Index values are not unique')
if util.is_datetime64_object(val):
@@ -147,13 +284,64 @@ cdef class DictIndexEngineDatetime(DictIndexEngine):
return self.mapping[val]
- cdef initialize(self):
- # already passed a view on ndarray
- values = self.index_weakref
- self.mapping = self.mapfun(values)
- if len(self.mapping) == len(values):
- self.integrity = 1
- self.initialized = 1
-cdef inline is_definitely_invalid_key(object val):
- return PySlice_Check(val) or cnp.PyArray_Check(val)
+# ctypedef fused idxvalue_t:
+# object
+# int
+# float64_t
+# int32_t
+# int64_t
+
+# @cython.boundscheck(False)
+# @cython.wraparound(False)
+# def is_monotonic(ndarray[idxvalue_t] arr):
+# '''
+# Returns
+# -------
+# is_monotonic, is_unique
+# '''
+# cdef:
+# Py_ssize_t i, n
+# idxvalue_t prev, cur
+# bint is_unique = 1
+
+# n = len(arr)
+
+# if n < 2:
+# return True, True
+
+# prev = arr[0]
+# for i in range(1, n):
+# cur = arr[i]
+# if cur < prev:
+# return False, None
+# elif cur == prev:
+# is_unique = 0
+# prev = cur
+# return True, is_unique
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# def groupby_index(ndarray[idxvalue_t] index, ndarray labels):
+# cdef dict result = {}
+# cdef Py_ssize_t i, length
+# cdef list members
+# cdef object idx, key
+
+# length = len(index)
+
+# for i in range(length):
+# key = util.get_value_1d(labels, i)
+
+# if util._checknull(key):
+# continue
+
+# idx = index[i]
+# if key in result:
+# members = result[key]
+# members.append(idx)
+# else:
+# result[key] = [idx]
+
+# return result
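The backfill_object kernel added above walks old and new from the right, assigning each new[j] the position of the smallest old value >= new[j], or -1 where none exists; the limit argument is accepted but not yet enforced (hence the TODO). For intuition, an equivalent formulation with searchsorted, assuming both inputs are sorted as the algorithm requires:

    import numpy as np

    old = np.array([1, 5, 10], dtype=np.int64)
    new = np.array([0, 2, 5, 7, 12], dtype=np.int64)

    # For each new[j]: position in `old` of the first value >= new[j],
    # or -1 when new[j] lies beyond old[-1].
    idx = np.searchsorted(old, new, side='left')
    indexer = np.where(idx < len(old), idx, -1).astype(np.int32)
    # indexer -> array([ 0,  1,  1,  2, -1], dtype=int32)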
238 pandas/src/generate_code.py
@@ -124,26 +124,6 @@ def set_na_2d(na = "NaN"):
raise_on_na = "raise ValueError('No NA values allowed')"
-merge_indexer_template = """@cython.wraparound(False)
-@cython.boundscheck(False)
-def merge_indexer_%(name)s(ndarray[%(c_type)s] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef %(c_type)s idx
- cdef ndarray[int32_t] fill_vec
-
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
-
- return fill_vec
-
-"""
-
'''
Backfilling logic for generating fill vector
@@ -171,55 +151,55 @@ def merge_indexer_%(name)s(ndarray[%(c_type)s] values, dict oldMap):
backfill_template = """@cython.boundscheck(False)
@cython.wraparound(False)
-def backfill_%(name)s(ndarray[%(c_type)s] oldIndex,
- ndarray[%(c_type)s] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef %(c_type)s prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
-
- if oldLength == 0 or newLength == 0:
- return fill_vec
+def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef %(c_type)s cur, prev
+ cdef int lim
+
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
+
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- oldPos = oldLength - 1
- newPos = newLength - 1
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
+ i = nleft - 1
+ j = nright - 1
- while newPos >= 0:
- curOld = oldIndex[oldPos]
+ cur = old[nleft - 1]
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
+ while j >= 0 and new[j] > cur:
+ j -= 1
- curLoc = oldMap[curOld]
+ while True:
+ if j == 0:
+ break
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
break
- else:
- prevOld = oldIndex[oldPos - 1]
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
+ i -= 1
+ cur = prev
- return fill_vec
+ return indexer
"""
@@ -248,121 +228,54 @@ def backfill_%(name)s(ndarray[%(c_type)s] oldIndex,
pad_template = """@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_%(name)s(ndarray[%(c_type)s] oldIndex,
- ndarray[%(c_type)s] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef %(c_type)s prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
-
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = 0
- newPos = 0
+def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef %(c_type)s cur, next
+ cdef int lim
+
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
+
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ i = j = 0
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ cur = old[0]
- curLoc = oldMap[curOld]
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ while True:
+ if j == nright - 1:
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
-
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
- if newPos > newLength - 1:
- done = 1
- break
-
- if done:
- break
-
- oldPos += 1
-
- return fill_vec
-
-"""
-
-pad_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_%(name)s(ndarray[%(c_type)s] oldIndex,
- ndarray[%(c_type)s] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef %(c_type)s prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
-
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = 0
- newPos = 0
-
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
-
- while newPos < newLength:
- curOld = oldIndex[oldPos]
-
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
-
- curLoc = oldMap[curOld]
-
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
-
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
- if newPos > newLength - 1:
- done = 1
- break
+ next = old[i + 1]
- if done:
- break
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- oldPos += 1
+ i += 1
+ cur = next
- return fill_vec
+ return indexer
"""
@@ -777,7 +690,6 @@ def generate_from_template(template, ndim=1, exclude=None):
return output.getvalue()
templates_1d = [map_indices_template,
- merge_indexer_template,
pad_template,
backfill_template,
pad_2d_template,
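The regenerated pad_* kernels are the forward-fill mirror image of backfill: each new[j] receives the position of the largest old value <= new[j], with -1 where new[j] precedes old[0]. Again a searchsorted sketch of the intended semantics, assuming sorted inputs:

    import numpy as np

    old = np.array([1, 5, 10], dtype=np.int64)
    new = np.array([0, 2, 5, 7, 12], dtype=np.int64)

    # For each new[j]: position in `old` of the last value <= new[j];
    # searchsorted from the right, minus one, yields -1 below old[0].
    indexer = (np.searchsorted(old, new, side='right') - 1).astype(np.int32)
    # indexer -> array([-1,  0,  1,  1,  2], dtype=int32)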
844 pandas/src/generated.pyx
@@ -109,647 +109,521 @@ cpdef map_indices_bool(ndarray[uint8_t] index):
return result
-@cython.wraparound(False)
@cython.boundscheck(False)
-def merge_indexer_float64(ndarray[float64_t] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef float64_t idx
- cdef ndarray[int32_t] fill_vec
-
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
-
- return fill_vec
-
@cython.wraparound(False)
-@cython.boundscheck(False)
-def merge_indexer_object(ndarray[object] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef object idx
- cdef ndarray[int32_t] fill_vec
+def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef float64_t cur, next
+ cdef int lim
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- return fill_vec
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def merge_indexer_int32(ndarray[int32_t] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef int32_t idx
- cdef ndarray[int32_t] fill_vec
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
+ i = j = 0
- return fill_vec
+ cur = old[0]
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def merge_indexer_int64(ndarray[int64_t] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef int64_t idx
- cdef ndarray[int32_t] fill_vec
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
+ while True:
+ if j == nright - 1:
+ break
- return fill_vec
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
+ break
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def merge_indexer_bool(ndarray[uint8_t] values, dict oldMap):
- cdef Py_ssize_t i, j, length, newLength
- cdef uint8_t idx
- cdef ndarray[int32_t] fill_vec
+ next = old[i + 1]
- newLength = len(values)
- fill_vec = np.empty(newLength, dtype=np.int32)
- for i in range(newLength):
- idx = values[i]
- if idx in oldMap:
- fill_vec[i] = oldMap[idx]
- else:
- fill_vec[i] = -1
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- return fill_vec
+ i += 1
+ cur = next
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_float64(ndarray[float64_t] oldIndex,
- ndarray[float64_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef float64_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def pad_object(ndarray[object] old, ndarray[object] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef object cur, next
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = 0
- newPos = 0
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ i = j = 0
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ cur = old[0]
- curLoc = oldMap[curOld]
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ while True:
+ if j == nright - 1:
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
+ break
- if newPos > newLength - 1:
- done = 1
- break
+ next = old[i + 1]
- if done:
- break
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- oldPos += 1
+ i += 1
+ cur = next
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_object(ndarray[object] oldIndex,
- ndarray[object] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef object prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef int32_t cur, next
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = 0
- newPos = 0
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ i = j = 0
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ cur = old[0]
- curLoc = oldMap[curOld]
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ while True:
+ if j == nright - 1:
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
+ break
- if newPos > newLength - 1:
- done = 1
- break
+ next = old[i + 1]
- if done:
- break
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- oldPos += 1
+ i += 1
+ cur = next
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_int32(ndarray[int32_t] oldIndex,
- ndarray[int32_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef int32_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef int64_t cur, next
+ cdef int lim
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- oldPos = 0
- newPos = 0
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ i = j = 0
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ cur = old[0]
- curLoc = oldMap[curOld]
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ while True:
+ if j == nright - 1:
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
+ break
- if newPos > newLength - 1:
- done = 1
- break
+ next = old[i + 1]
- if done:
- break
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- oldPos += 1
+ i += 1
+ cur = next
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_int64(ndarray[int64_t] oldIndex,
- ndarray[int64_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef int64_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef uint8_t cur, next
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- oldPos = 0
- newPos = 0
+ if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+ return indexer
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ i = j = 0
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ cur = old[0]
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ while j <= nright - 1 and new[j] < cur:
+ j += 1
- curLoc = oldMap[curOld]
+ while True:
+ if j == nright - 1:
+ break
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ if i == nleft - 1:
+ while j < nright and new[j] >= cur:
+ indexer[j] = i
+ j += 1
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
+ next = old[i + 1]
- if newPos > newLength - 1:
- done = 1
- break
+ while j < nright and cur <= new[j] < next:
+ indexer[j] = i
+ j += 1
- if done:
- break
+ i += 1
+ cur = next
- oldPos += 1
+ return indexer
- return fill_vec
@cython.boundscheck(False)
@cython.wraparound(False)
-def pad_bool(ndarray[uint8_t] oldIndex,
- ndarray[uint8_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef uint8_t prevOld, curOld
+def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef float64_t cur, prev
+ cdef int lim
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
-
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = 0
- newPos = 0
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[newLength - 1] < oldIndex[0]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- while newPos < newLength:
- curOld = oldIndex[oldPos]
+ i = nleft - 1
+ j = nright - 1
- while newIndex[newPos] < curOld:
- newPos += 1
- if newPos > newLength - 1:
- break
+ cur = old[nleft - 1]
- curLoc = oldMap[curOld]
+ while j >= 0 and new[j] > cur:
+ j -= 1
- if oldPos == oldLength - 1:
- if newIndex[newPos] >= curOld:
- fill_vec[newPos:] = curLoc
+ while True:
+ if j == 0:
break
- else:
- nextOld = oldIndex[oldPos + 1]
- done = 0
- while newIndex[newPos] < nextOld:
- fill_vec[newPos] = curLoc
- newPos += 1
-
- if newPos > newLength - 1:
- done = 1
- break
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
+ break
- if done:
- break
+ prev = old[i - 1]
- oldPos += 1
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- return fill_vec
+ i -= 1
+ cur = prev
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def backfill_float64(ndarray[float64_t] oldIndex,
- ndarray[float64_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef float64_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+def backfill_object(ndarray[object] old, ndarray[object] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef object cur, prev
+ cdef int lim
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- oldPos = oldLength - 1
- newPos = newLength - 1
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- while newPos >= 0:
- curOld = oldIndex[oldPos]
+ i = nleft - 1
+ j = nright - 1
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
+ cur = old[nleft - 1]
- curLoc = oldMap[curOld]
+ while j >= 0 and new[j] > cur:
+ j -= 1
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ while True:
+ if j == 0:
break
- else:
- prevOld = oldIndex[oldPos - 1]
-
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
-
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
-
- return fill_vec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_object(ndarray[object] oldIndex,
- ndarray[object] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef object prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
-
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
-
- if oldLength == 0 or newLength == 0:
- return fill_vec
-
- oldPos = oldLength - 1
- newPos = newLength - 1
-
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
- while newPos >= 0:
- curOld = oldIndex[oldPos]
-
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
-
- curLoc = oldMap[curOld]
-
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
break
- else:
- prevOld = oldIndex[oldPos - 1]
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
+ i -= 1
+ cur = prev
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def backfill_int32(ndarray[int32_t] oldIndex,
- ndarray[int32_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef int32_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef int32_t cur, prev
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- oldPos = oldLength - 1
- newPos = newLength - 1
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
+ i = nleft - 1
+ j = nright - 1
- while newPos >= 0:
- curOld = oldIndex[oldPos]
+ cur = old[nleft - 1]
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
+ while j >= 0 and new[j] > cur:
+ j -= 1
- curLoc = oldMap[curOld]
+ while True:
+ if j == 0:
+ break
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
break
- else:
- prevOld = oldIndex[oldPos - 1]
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
+ i -= 1
+ cur = prev
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def backfill_int64(ndarray[int64_t] oldIndex,
- ndarray[int64_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef int64_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef int64_t cur, prev
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- oldPos = oldLength - 1
- newPos = newLength - 1
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
+ i = nleft - 1
+ j = nright - 1
- while newPos >= 0:
- curOld = oldIndex[oldPos]
+ cur = old[nleft - 1]
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
+ while j >= 0 and new[j] > cur:
+ j -= 1
- curLoc = oldMap[curOld]
+ while True:
+ if j == 0:
+ break
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
break
- else:
- prevOld = oldIndex[oldPos - 1]
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
+ i -= 1
+ cur = prev
- return fill_vec
+ return indexer
@cython.boundscheck(False)
@cython.wraparound(False)
-def backfill_bool(ndarray[uint8_t] oldIndex,
- ndarray[uint8_t] newIndex,
- dict oldMap, dict newMap):
- cdef Py_ssize_t i, j, oldLength, newLength, curLoc
- cdef ndarray[int32_t, ndim=1] fill_vec
- cdef Py_ssize_t newPos, oldPos
- cdef uint8_t prevOld, curOld
-
- oldLength = len(oldIndex)
- newLength = len(newIndex)
+def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
+ limit=None):
+ cdef Py_ssize_t i, j, nleft, nright
+ cdef ndarray[int32_t, ndim=1] indexer
+ cdef uint8_t cur, prev
+ cdef int lim
- fill_vec = np.empty(len(newIndex), dtype = np.int32)
- fill_vec.fill(-1)
+ nleft = len(old)
+ nright = len(new)
+ indexer = np.empty(nright, dtype=np.int32)
+ indexer.fill(-1)
- if oldLength == 0 or newLength == 0:
- return fill_vec
+ if limit is None:
+ lim = nright
+ else:
+ # TODO: > 0?
+ lim = limit
- oldPos = oldLength - 1
- newPos = newLength - 1
+ if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+ return indexer
- if newIndex[0] > oldIndex[oldLength - 1]:
- return fill_vec
+ i = nleft - 1
+ j = nright - 1
- while newPos >= 0:
- curOld = oldIndex[oldPos]
+ cur = old[nleft - 1]
- while newIndex[newPos] > curOld:
- newPos -= 1
- if newPos < 0:
- break
+ while j >= 0 and new[j] > cur:
+ j -= 1
- curLoc = oldMap[curOld]
+ while True:
+ if j == 0:
+ break
- if oldPos == 0:
- if newIndex[newPos] <= curOld:
- fill_vec[:newPos + 1] = curLoc
+ if i == 0:
+ while j >= 0 and new[j] <= cur:
+ indexer[j] = i
+ j -= 1
break
- else:
- prevOld = oldIndex[oldPos - 1]
- while newIndex[newPos] > prevOld:
- fill_vec[newPos] = curLoc
+ prev = old[i - 1]
+
+ while j >= 0 and prev < new[j] <= cur:
+ indexer[j] = i
+ j -= 1
- newPos -= 1
- if newPos < 0:
- break
- oldPos -= 1
+ i -= 1
+ cur = prev
- return fill_vec
+ return indexer
@cython.boundscheck(False)
6 pandas/src/groupby.pyx
@@ -111,6 +111,7 @@ def is_lexsorted(list list_of_arrays):
int i
Py_ssize_t n, nlevels
int32_t k, cur, pre
+ ndarray arr
nlevels = len(list_of_arrays)
n = len(list_of_arrays[0])
@@ -118,6 +119,9 @@ def is_lexsorted(list list_of_arrays):
cdef int32_t **vecs = <int32_t**> malloc(nlevels * sizeof(int32_t*))
for i from 0 <= i < nlevels:
vecs[i] = <int32_t *> (<ndarray> list_of_arrays[i]).data
+
+ arr = list_of_arrays[i]
+ vecs[i] = <int32_t *> arr.data
# assume uniqueness??
for i from 1 <= i < n:
@@ -443,7 +447,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
# if we've seen some values, mark bin
if vc != 0:
- bins[bc] = j
+ bins[bc] = j
bc += 1
vc = 0
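The is_lexsorted fix routes the buffer-pointer access through a typed ndarray local. For reference, a pure-Python rendering (is_lexsorted_py is a hypothetical name) of what the function checks, namely that the label arrays are sorted lexicographically with the first array as the primary key:

    import numpy as np

    def is_lexsorted_py(list_of_arrays):
        # True when the rows (a0[i], a1[i], ...) appear in non-decreasing
        # lexicographic order; mirrors the pointer loop in groupby.pyx.
        vecs = [np.asarray(a, dtype=np.int32) for a in list_of_arrays]
        n = len(vecs[0])
        for i in range(1, n):
            for vec in vecs:
                if vec[i] > vec[i - 1]:
                    break                # decided at this level: in order
                elif vec[i] < vec[i - 1]:
                    return False         # out of order at the deciding level
                # equal at this level: compare the next array
        return True

    is_lexsorted_py([np.array([1, 1, 2]), np.array([3, 4, 0])])   # True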
51 pandas/src/hashtable.pxd
@@ -0,0 +1,51 @@
+from khash cimport *
+
+# prototypes for sharing
+
+# cdef class StringHashTable:
+# cdef kh_str_t *table
+
+# cdef inline int check_type(self, object)
+# cpdef get_item(self, object)
+# cpdef set_item(self, object, Py_ssize_t)
+
+# cdef class Int32HashTable:
+# cdef kh_int32_t *table
+
+# cdef inline int check_type(self, object)
+# cpdef get_item(self, int32_t)
+# cpdef set_item(self, int32_t, Py_ssize_t)
+
+# cdef class Int64HashTable:
+# cdef kh_int64_t *table
+
+# cdef inline bint has_key(self, int64_t)
+# cpdef get_item(self, int64_t)
+# cpdef set_item(self, int64_t, Py_ssize_t)
+
+
+# cdef class Float64HashTable:
+# cdef kh_float64_t *table
+
+# cpdef get_labels(self, ndarray, list, Py_ssize_t, int32_t)
+
+
+# cdef class PyObjectHashTable:
+# cdef kh_pymap_t *table
+
+# cdef destroy(self)
+# cpdef get_item(self, object)
+# cpdef set_item(self, object, Py_ssize_t)
+# cpdef get_labels(self, ndarray, list, Py_ssize_t, int32_t)
+
+
+# cdef class Factorizer:
+# cdef public PyObjectHashTable table
+# cdef public uniques
+# cdef public Py_ssize_t count
+
+
+# cdef class Int64Factorizer:
+# cdef public Int64HashTable table
+# cdef public list uniques
+# cdef public Py_ssize_t count
253 pandas/src/hashtable.pyx
@@ -1,171 +1,50 @@
-from khash cimport *
-
-def test(ndarray arr, Py_ssize_t size_hint):
- cdef:
- kh_pymap_t *table
- int ret = 0
- khiter_t k
- PyObject **data
- Py_ssize_t i, n
- ndarray[Py_ssize_t] indexer
-
- table = kh_init_pymap()
- kh_resize_pymap(table, size_hint)
-
- data = <PyObject**> arr.data
- n = len(arr)
-
- indexer = np.empty(n, dtype=np.int_)
-
- for i in range(n):
- k = kh_put_pymap(table, data[i], &ret)
-
- # if not ret:
- # kh_del_pymap(table, k)
-
- table.vals[k] = i
-
- for i in range(n):
- k = kh_get_pymap(table, data[i])
- indexer[i] = table.vals[k]
-
- kh_destroy_pymap(table)
-
- return indexer
-
-
-def test_str(ndarray arr, Py_ssize_t size_hint):
- cdef:
- kh_str_t *table
- kh_cstr_t val
- int ret = 0
- khiter_t k
- PyObject **data
- Py_ssize_t i, n
- ndarray[Py_ssize_t] indexer
-
- table = kh_init_str()
- kh_resize_str(table, size_hint)
+from cpython cimport PyObject
- data = <PyObject**> arr.data
- n = len(arr)
-
- indexer = np.empty(n, dtype=np.int_)
-
- for i in range(n):
- k = kh_put_str(table, util.get_c_string(<object> data[i]), &ret)
-
- # if not ret:
- # kh_del_str(table, k)
-
- table.vals[k] = i
-
- # for i in range(n):
- # k = kh_get_str(table, PyString_AsString(<object> data[i]))
- # indexer[i] = table.vals[k]
-
- kh_destroy_str(table)
-
- return indexer
-
-# def test2(ndarray[object] arr):
-# cdef:
-# dict table
-# object obj
-# Py_ssize_t i, loc, n
-# ndarray[Py_ssize_t] indexer
+from khash cimport *
+from numpy cimport *
-# n = len(arr)
-# indexer = np.empty(n, dtype=np.int_)
+from util cimport _checknan
+cimport util
-# table = {}
-# for i in range(n):
-# table[arr[i]] = i
+import numpy as np
-# for i in range(n):
-# indexer[i] = table[arr[i]]
+ONAN = np.nan
-# return indexer
-def obj_unique(ndarray[object] arr):
+def list_to_object_array(list obj):
+ '''
+ Convert list to object ndarray. Seriously can't believe I had to write this
+ function
+ '''
cdef:
- kh_pyset_t *table
- # PyObject *obj
- object obj
- PyObject **data
- int ret = 0
- khiter_t k
Py_ssize_t i, n
- list uniques
-
- n = len(arr)
- uniques = []
-
- table = kh_init_pyset()
-
- data = <PyObject**> arr.data
-
- # size hint
- kh_resize_pyset(table, n // 10)
-
- for i in range(n):
- obj = arr[i]
-
- k = kh_get_pyset(table, <PyObject*> obj)
- if not kh_exist_pyset(table, k):
- k = kh_put_pyset(table, <PyObject*> obj, &ret)
- # uniques.append(obj)
- # Py_INCREF(<object> obj)
-
- kh_destroy_pyset(table)
+ ndarray[object] arr
- return None
+ n = len(obj)
+ arr = np.empty(n, dtype=object)
-def int64_unique(ndarray[int64_t] arr):
- cdef:
- kh_int64_t *table
- # PyObject *obj
- int64_t obj
- PyObject **data
- int ret = 0
- khiter_t k
- Py_ssize_t i, j, n
- ndarray[int64_t] uniques
-
- n = len(arr)
- uniques = np.empty(n, dtype='i8')
-
- table = kh_init_int64()
- kh_resize_int64(table, n)
-
- j = 0
+ for i from 0 <= i < n:
+ arr[i] = obj[i]
- for i in range(n):
- obj = arr[i]
+ return arr
- k = kh_get_int64(table, obj)
- if not kh_exist_int64(table, k):
- k = kh_put_int64(table, obj, &ret)
- uniques[j] = obj
- j += 1
- # Py_INCREF(<object> obj)
- kh_destroy_int64(table)
+cdef class HashTable:
+ pass
- return np.sort(uniques[:j])
-cdef class StringHashTable:
+cdef class StringHashTable(HashTable):
+ cdef kh_str_t *table
- cdef:
- kh_str_t *table
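Of the hashtable.pyx rewrite, list_to_object_array is self-contained: it materializes a Python list as a flat 1-d object ndarray, which plain np.array would not do for nested lists, since it infers dimensions instead (the behavior the docstring complains about). A quick illustration of the distinction:

    import numpy as np

    np.array([[1, 2], [3, 4]]).shape   # (2, 2): nested lists become 2-d

    # list_to_object_array's behavior: a flat object array holding the
    # list elements themselves.
    arr = np.empty(2, dtype=object)
    arr[0], arr[1] = [1, 2], [3, 4]
    arr.shape                          # (2,)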