```
we want to change from

lookup = {value: {objs}}

to 

consistent_hash(value) = bucket_id
lookup = {bucket_id: {objs}}
```

In [411]:
from bisect import bisect_left
import random
import time
import numpy as np
from sortedcontainers import SortedDict
from pympler.asizeof import asizeof
import sortednp as snp
from operator import itemgetter

In [183]:
_='''
OK, so the implementation of choice is a 
SortedDict of {min_value: Bucket}
and a heap of {size: min_value} containing the splittable buckets that are big.

Bucket knows the keys it contains as well as their counts. 
If asked for a split point, it will give one that best bisects its keys. That's O(log(n)) probably.
You can custom-write a bisection for that. 
'''

In [346]:
PLANETS = ['mercury']*10000 + ['venus']*100 + ['earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune']
class Item:
    def __init__(self):
        self.s = random.choice(PLANETS)
        self.x = random.random()
    
    def __str__(self):
        return f'{self.s} {round(self.x, 2)}'

In [36]:
class Fieldidx():
    def __init__(self):
        self.n_buckets = 10
        self.idx = dict(zip(range(self.n_buckets), 
                            [set() for _ in range(self.n_buckets)]))
        self.size = 0
        self.objs = dict()
        self.field = 's'
    
    def _get_bucket_for(self, val):
        h = hash(val)
        bucket = h % len(self.idx)
        return bucket
    
    def _resize(self):
        pass
    
    def add(self, item):
        ptr = id(item)
        val = getattr(item, self.field, None)
        bucket = self._get_bucket_for(val)
        self.idx[bucket].add(ptr)
        self.objs[ptr] = item
        self.size += 1
        
    def get(self, val):
        bucket = self._get_bucket_for(val)
        matches = []
        for obj_id in self.idx[bucket]:
            obj = self.objs[obj_id]
            obj_val = getattr(obj, self.field, None)
            if obj_val is val or obj_val == val:
                matches.append(obj)
        return matches

In [38]:
idx = Fieldidx()
for _ in range(100):
    idx.add(Item())
z = idx.get('venus')
for i in z:
    print(i)

venus 0.16
venus 0.93
venus 0.75
venus 0.21
venus 0.69
venus 0.04
venus 0.05
venus 0.56
venus 0.72
venus 0.42
venus 0.86
venus 0.14
venus 0.05
venus 0.21


Best idea is:
    
 - hash values to lie in some big range, like uint64
 - Initially, we have like 10 buckets containing even chunks of that range (pretend we have a good hash function...)
 - Maintain a data structure with easy max and min access (sorted deque? heaps? dict?) sorted by the number of elements stored in the bucket. Maybe have another one for n_keys or something too, we don't wanna keep trying to split one bucket that has a single high-card key in it. Ugh.
 - Anyway, split the biggest bucket when it comes time for adding more buckets. When do we add more buckets? Shit.
 - Wait. When a bucket is unsplittable, we could take it outta the list. It's its own thing now. That would work.
 - We might have to put it back in someday.
 

In [201]:
# initialize with 1 bucket spanning whole range
# split when there are >1000 items in a splittable bucket
n_bits_signed = sys.hash_info.hash_bits - 1  # typically 64 bits
HASH_MIN = -2**n_bits_signed
HASH_MAX = 2**n_bits_signed-1

In [311]:
# These are the mutable version of Bucket and ObjLookup. 

class Bucket:
    def __init__(self):
        self.obj_ids = set()  # uint64
        self.val_hashes = set()  # int64 - which hashes are stored in this bucket
    
    def add(self, val_hash, obj_id):
        self.val_hashes.add(val_hash)
        self.obj_ids.add(obj_id)
    
    def update(self, new_val_hashes, new_obj_ids):
        self.val_hashes = self.val_hashes.union(new_val_hashes)
        self.obj_ids = self.obj_ids.union(new_obj_ids)
    
    def get_matching_objs(self, field, val, obj_lookup):
        # look through all our obj_ids to see what matches this val, and return those.
        objs = [obj_lookup.get(o) for o in self.obj_ids]
        matches = []
        # filter to just the ones that match. could use a filter() here maybe, or comprehension.
        for obj in objs:
            obj_val = getattr(obj, field, None)
            if obj_val is val or obj_val == val:
                matches.append(obj)
        return matches
    
    def get_all_objs(self, obj_lookup):
        return [obj_lookup.get(o) for o in self.obj_ids]
    
    def split(self, field, obj_lookup):
        my_hashes = list(sorted(self.val_hashes))
        # dump out the upper half of our hashes
        half_point = len(my_hashes) // 2
        dumped_hashes = set(my_hashes[half_point:])
        
        # dereference each object 
        # Find the objects with field_vals that hash to any of dumped_hashes
        # we will move their ids to the new bucket
        dumped_obj_ids = set()
        for obj_id in list(self.obj_ids):
            obj = obj_lookup.get(obj_id)
            obj_val = getattr(obj, field, None)
            if hash(obj_val) in dumped_hashes:
                dumped_obj_ids.add(obj_id)
                self.obj_ids.remove(obj_id)
        for dh in dumped_hashes:
            self.val_hashes.remove(dh)
        return dumped_hashes, dumped_obj_ids
        
    def __len__(self):
        return len(self.obj_ids)
    

class ObjLookup:
    
    def __init__(self):
        self.objs = dict()
        
    def get(self, obj_id):
        return self.objs.get(obj_id)
    
    def set(self, obj_id, obj):
        self.objs[obj_id] = obj

In [349]:
SIZE_THRESH_UPPER = 17
SIZE_THRESH_LOWER = 3

class Field:
    # Stores the possible values of this field in a set of buckets
    # Several values may be allocated to the same bucket for space efficiency reasons
    def __init__(self, field):
        self.buckets = SortedDict()  # O(1) add / remove, O(log(n)) find bucket for key
        self.buckets[HASH_MIN] =  Bucket()  # always contains at least one bucket
        self.objs = ObjLookup()
        self.field = field
    
    def get(self, field_value):
        val_hash = hash(field_value)
        k = self._get_bucket_key_for(val_hash)
        return self.buckets[k].get_matching_objs(self.field, field_value, self.objs)
        
    def _get_bucket_key_for(self, val_hash):
        list_idx = self.buckets.bisect_right(val_hash) - 1
        k, _ = self.buckets.peekitem(list_idx)
        return k
        
    def add(self, obj):
        field_value = getattr(obj, self.field, None)
        val_hash = hash(field_value)
        obj_id = id(obj)
        self.objs.set(obj_id, obj)
        k = self._get_bucket_key_for(val_hash)
        self.buckets[k].add(val_hash, obj_id)
        # split bucket if it's big and contains more than one key
        if len(self.buckets[k]) > SIZE_THRESH_UPPER and len(self.buckets[k].val_hashes) >= 2:
            new_hashes, new_obj_ids = self.buckets[k].split(self.field, self.objs)
            new_bucket = Bucket()
            new_bucket.update(new_hashes, new_obj_ids)
            self.buckets[min(new_hashes)] = new_bucket
    
    def remove(self, field_value, obj_id):
        k = self._get_bucket_key_for(field_value)
        self.buckets[k].remove(key, obj_id)
        if self.buckets[k].size < SIZE_THRESH_LOWER and k != HASH_MIN:
            # try and merge it with its neighbor to the left
            idx_j = self.buckets.bisect_left(k-1)
            j, _ = self.buckets.peekitem(idx_j)
            if self.buckets[j].size + self.buckets[i].size < SIZE_THRESH_UPPER:
                self.buckets[j].update(self.buckets[i].stuff)
                del self.buckets[i]
                
    def bucket_report(self):
        ls = []
        for bkey in self.buckets:
            bucket = self.buckets[bkey]
            bset = set()
            for o in bucket.get_all_objs(self.objs):
                bset.add(getattr(o, self.field))
            ls.append((bkey, bset, len(bucket)))
        return ls

In [347]:
idx = Field('s')
n = 10**6
items = [Item() for _ in range(n)]

In [361]:
print('adding', n, 'items')
t0 = time.time()
for item in items:
    idx.add(item)
t1 = time.time()
print('\n', round(t1-t0,3), 'seconds to build this field thing\n')
print(sorted(idx.buckets.keys()))
for b in idx.bucket_report():
    print(b)


adding 1000000 items

 2.618 seconds to build this field thing

[-9223372036854775808, -4657188873304669324, -1783096510027627915, -1390440632094239304, -588565948556953553, 288469824212975131, 924058623898380127, 4070670315453980499]
(-9223372036854775808, {'earth'}, 88)
(-4657188873304669324, {'venus'}, 9742)
(-1783096510027627915, {'uranus'}, 87)
(-1390440632094239304, {'mars'}, 85)
(-588565948556953553, {'mercury'}, 989715)
(288469824212975131, {'saturn'}, 94)
(924058623898380127, {'jupiter'}, 94)
(4070670315453980499, {'neptune'}, 95)


In [368]:
from hashindex import HashIndex

t0 = time.time()
hi = HashIndex(items, on='s')
t1 = time.time()
print(t1-t0, 'seconds to build a HashIndex')

hi.freeze()

0.8671374320983887 seconds to build a HashIndex


In [362]:
t0 = time.time()
d = dict()
for i in items:
    if i.s not in d:
        d[i.s] = list()
    d[i.s].append(i)
t1 = time.time()
print(t1-t0, 'seconds to build a dict')


0.1698291301727295 seconds to build a dict


In [425]:
planet = 'mercury'

In [426]:
%%timeit -n 5 -r 5
v = hi.find(match={'s': planet})

25 ms ± 1.34 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [427]:
%%timeit -n 5 -r 5
v = idx.get(planet)

500 ms ± 7.66 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [428]:
%%timeit -n 5 -r 5
v = d.get(planet)
# yikes - how is this 1000x faster? something has gone really wrong here! let's see if it's the deref lookup that's
# costing so much

181 ns ± 99.4 ns per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [417]:
class DerefDict():
    
    def __init__(self, items):
        self.objs = {id(item): item for item in items}
        self.d = dict()
        for i in items:
            if i.s not in self.d:
                self.d[i.s] = list()
            self.d[i.s].append(id(i))
    
    def get(self, val):
        ids = self.d.get(val)
        return [self.objs.get(i) for i in ids]

t0 = time.time()
dd  = DerefDict(items)
t1 = time.time()
print(t1-t0, 'seconds to build a deref dict')

0.41048765182495117 seconds to build a deref dict


In [390]:
%%timeit -n 5 -r 5
v = dd.get(planet)
# the difference is that you are doing len(planets) dict lookups instead of just one dict lookup.
# Can we keep the list literal around during processing instead?
# e.g. - most of the time we will want an entire list (simple lookup, no intersection). Detect that 
# case and we've got something as good as dict().

179 ms ± 13.4 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [419]:
class DerefDictListy():
    def __init__(self, items):
        self.objs = []
        self.d = dict()
        for i in items:
            if i.s not in self.d:
                self.d[i.s] = list()
            self.d[i.s].append(len(self.objs))
            self.objs.append(obj)
    
    def get(self, val):
        ids = self.d.get(val)
        return itemgetter(*ids)(self.objs)

t0 = time.time()
ddl  = DerefDict(items)
t1 = time.time()
print(t1-t0, 'seconds to build a deref dict, listy edition')

0.4190680980682373 seconds to build a deref dict, listy edition


In [420]:
%%timeit -n 5 -r 5
v = ddl.get(planet)
# the difference is that you are doing len(planets) dict lookups instead of just one dict lookup.
# Can we keep the list literal around during processing instead?
# e.g. - most of the time we will want an entire list (simple lookup, no intersection). Detect that 
# case and we've got something as good as dict().

13.2 µs ± 4.89 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [415]:
a = list('abcdefg')
isect_ids = snp.intersect(np.array([2,3,6]), np.array([1,2,3,4,5]), indices=True)[1][1]
itemgetter(*isect_ids)(a)

('b', 'c')

In [424]:
v = hi.find({'s': planet})
v

array([<__main__.Item object at 0x7f1cb7a44af0>,
       <__main__.Item object at 0x7f1cb7aca1f0>,
       <__main__.Item object at 0x7f1cb7ede310>,
       <__main__.Item object at 0x7f1cb83a4790>,
       <__main__.Item object at 0x7f1cb8530af0>,
       <__main__.Item object at 0x7f1cb88c6370>,
       <__main__.Item object at 0x7f1cb88c8df0>,
       <__main__.Item object at 0x7f1cb91a0b50>,
       <__main__.Item object at 0x7f1cb92c80d0>,
       <__main__.Item object at 0x7f1cb93a2cd0>,
       <__main__.Item object at 0x7f1cb94045b0>,
       <__main__.Item object at 0x7f1cb9488790>,
       <__main__.Item object at 0x7f1cb9656190>,
       <__main__.Item object at 0x7f1cb9664190>,
       <__main__.Item object at 0x7f1cb99001f0>,
       <__main__.Item object at 0x7f1cb9952970>,
       <__main__.Item object at 0x7f1cb9b96bb0>,
       <__main__.Item object at 0x7f1cb9bc6790>,
       <__main__.Item object at 0x7f1cb9c4a5b0>,
       <__main__.Item object at 0x7f1cba080610>,
       <__main__.Ite