```
we want to change from

lookup = {value: {objs}}

to 

consistent_hash(value) = bucket_id
lookup = {bucket_id: {objs}}
```

In [184]:
from bisect import bisect_left
import random
import time
import numpy as np
from sortedcontainers import SortedDict
from pympler.asizeof import asizeof


In [183]:
_='''
OK, so the implementation of choice is a 
SortedDict of {min_value: Bucket}
and a heap of {size: min_value} containing the splittable buckets that are big.

Bucket knows the keys it contains as well as their counts. 
If asked for a split point, it will give one that best bisects its keys. That's O(log(n)) probably.
You can custom-write a bisection for that. 
'''

In [35]:
class Item:
    def __init__(self):
        self.s = random.choice(['mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune'])
        self.x = random.random()
    
    def __str__(self):
        return f'{self.s} {round(self.x, 2)}'

In [36]:
class Fieldidx():
    def __init__(self):
        self.n_buckets = 10
        self.idx = dict(zip(range(self.n_buckets), 
                            [set() for _ in range(self.n_buckets)]))
        self.size = 0
        self.objs = dict()
        self.field = 's'
    
    def _get_bucket_for(self, val):
        h = hash(val)
        bucket = h % len(self.idx)
        return bucket
    
    def _resize(self):
        pass
    
    def add(self, item):
        ptr = id(item)
        val = getattr(item, self.field, None)
        bucket = self._get_bucket_for(val)
        self.idx[bucket].add(ptr)
        self.objs[ptr] = item
        self.size += 1
        
    def get(self, val):
        bucket = self._get_bucket_for(val)
        matches = []
        for obj_id in self.idx[bucket]:
            obj = self.objs[obj_id]
            obj_val = getattr(obj, self.field, None)
            if obj_val is val or obj_val == val:
                matches.append(obj)
        return matches

In [38]:
idx = Fieldidx()
for _ in range(100):
    idx.add(Item())
z = idx.get('venus')
for i in z:
    print(i)

venus 0.16
venus 0.93
venus 0.75
venus 0.21
venus 0.69
venus 0.04
venus 0.05
venus 0.56
venus 0.72
venus 0.42
venus 0.86
venus 0.14
venus 0.05
venus 0.21


Best idea is:
    
 - hash values to lie in some big range, like uint64
 - Initially, we have like 10 buckets containing even chunks of that range (pretend we have a good hash function...)
 - Maintain a data structure with easy max and min access (sorted deque? heaps? dict?) sorted by the number of elements stored in the bucket. Maybe have another one for n_keys or something too, we don't wanna keep trying to split one bucket that has a single high-card key in it. Ugh.
 - Anyway, split the biggest bucket when it comes time for adding more buckets. When do we add more buckets? Shit.
 - Wait. When a bucket is unsplittable, we could take it outta the list. It's its own thing now. That would work.
 - We might have to put it back in someday.
 

In [201]:
# initialize with 1 bucket spanning whole range
# split when there are >1000 items in a splittable bucket
n_bits_signed = sys.hash_info.hash_bits - 1  # typically 64 bits
HASH_MIN = -2**n_bits_signed
HASH_MAX = 2**n_bits_signed-1

In [311]:
# These are the mutable version of Bucket and ObjLookup. 

class Bucket:
    def __init__(self):
        self.obj_ids = set()  # uint64
        self.val_hashes = set()  # int64 - which hashes are stored in this bucket
    
    def add(self, val_hash, obj_id):
        self.val_hashes.add(val_hash)
        self.obj_ids.add(obj_id)
    
    def update(self, new_val_hashes, new_obj_ids):
        self.val_hashes = self.val_hashes.union(new_val_hashes)
        self.obj_ids = self.obj_ids.union(new_obj_ids)
    
    def get_matching_objs(self, field, val, obj_lookup):
        # look through all our obj_ids to see what matches this val, and return those.
        objs = [obj_lookup.get(o) for o in self.obj_ids]
        matches = []
        # filter to just the ones that match. could use a filter() here maybe, or comprehension.
        for obj in objs:
            obj_val = getattr(obj, field, None)
            if obj_val is val or obj_val == val:
                matches.append(obj)
        return matches
    
    def get_all_objs(self, obj_lookup):
        return [obj_lookup.get(o) for o in self.obj_ids]
    
    def split(self, field, obj_lookup):
        my_hashes = list(sorted(self.val_hashes))
        # dump out the upper half of our hashes
        half_point = len(my_hashes) // 2
        dumped_hashes = set(my_hashes[half_point:])
        
        # dereference each object 
        # Find the objects with field_vals that hash to any of dumped_hashes
        # we will move their ids to the new bucket
        dumped_obj_ids = set()
        for obj_id in list(self.obj_ids):
            obj = obj_lookup.get(obj_id)
            obj_val = getattr(obj, field, None)
            if hash(obj_val) in dumped_hashes:
                dumped_obj_ids.add(obj_id)
                self.obj_ids.remove(obj_id)
        for dh in dumped_hashes:
            self.val_hashes.remove(dh)
        return dumped_hashes, dumped_obj_ids
        
    def __len__(self):
        return len(self.obj_ids)
    

class ObjLookup:
    
    def __init__(self):
        self.objs = dict()
        
    def get(self, obj_id):
        return self.objs.get(obj_id)
    
    def set(self, obj_id, obj):
        self.objs[obj_id] = obj

In [321]:
SIZE_THRESH_UPPER = 30
SIZE_THRESH_LOWER = 3

class Field:
    # Stores the possible values of this field in a set of buckets
    # Several values may be allocated to the same bucket for space efficiency reasons
    def __init__(self, field):
        self.buckets = SortedDict()  # O(1) add / remove, O(log(n)) find bucket for key
        self.buckets[HASH_MIN] =  Bucket()  # always contains at least one bucket
        self.objs = ObjLookup()
        self.field = field
    
    def get(self, field_value):
        val_hash = hash(field_value)
        k = self._get_bucket_key_for(val_hash)
        return self.buckets[k].get_matching_objs(self.field, field_value, self.objs)
        
    def _get_bucket_key_for(self, val_hash):
        list_idx = self.buckets.bisect_right(val_hash) - 1
        k, _ = self.buckets.peekitem(list_idx)
        return k
        
    def add(self, obj):
        field_value = getattr(obj, self.field, None)
        val_hash = hash(field_value)
        obj_id = id(obj)
        self.objs.set(obj_id, obj)
        k = self._get_bucket_key_for(val_hash)
        self.buckets[k].add(val_hash, obj_id)
        # split bucket if it's big and contains more than one key
        if len(self.buckets[k]) > SIZE_THRESH_UPPER and len(self.buckets[k].val_hashes) >= 2:
            new_hashes, new_obj_ids = self.buckets[k].split(self.field, self.objs)
            new_bucket = Bucket()
            new_bucket.update(new_hashes, new_obj_ids)
            self.buckets[min(new_hashes)] = new_bucket
    
    def remove(self, field_value, obj_id):
        k = self._get_bucket_key_for(field_value)
        self.buckets[k].remove(key, obj_id)
        if self.buckets[k].size < SIZE_THRESH_LOWER and k != HASH_MIN:
            # try and merge it with its neighbor to the left
            idx_j = self.buckets.bisect_left(k-1)
            j, _ = self.buckets.peekitem(idx_j)
            if self.buckets[j].size + self.buckets[i].size < SIZE_THRESH_UPPER:
                self.buckets[j].update(self.buckets[i].stuff)
                del self.buckets[i]
                
    def bucket_report(self):
        ls = []
        for bkey in self.buckets:
            bucket = self.buckets[bkey]
            bset = set()
            for o in bucket.get_all_objs(self.objs):
                bset.add(getattr(o, self.field))
            ls.append((bkey, bset))
        return ls

In [322]:
idx = Field('s')
for _ in range(100):
    idx.add(Item())
z = idx.get('mars')
ct = 0
for i in z:
    print(i)
    ct += 1
    if ct > 10:
        break
print(sorted(idx.buckets.keys()))
for b in idx.bucket_report():
    print(b)


mars 0.43
mars 0.45
mars 0.34
mars 0.98
mars 0.65
mars 0.09
mars 0.7
mars 0.05
mars 0.44
mars 0.91
mars 0.64
[-9223372036854775808, -1783096510027627915, -588565948556953553, 924058623898380127]
(-9223372036854775808, {'earth', 'venus'})
(-1783096510027627915, {'mars', 'uranus'})
(-588565948556953553, {'saturn', 'mercury'})
(924058623898380127, {'neptune', 'jupiter'})


In [283]:
asizeof(Bucket(10))

TypeError: __init__() takes 1 positional argument but 2 were given

In [140]:
import numpy as np

In [141]:
%%timeit -n 5 -r 5
np.zeros((10**6,))

356 µs ± 241 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [142]:
%%timeit -n 5 -r 5
ls = [0 for _ in range(10**6)]

23 ms ± 1.69 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [143]:
ls = [0 for _ in range(10**6)]


In [145]:
%%timeit -n 5 -r 5
del ls[10**5]

727 µs ± 317 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [146]:
# We can't really use lists at the rate we want to add / remove nodes. At 1ms per add / remove, that's 1s just to 
# build a 1M-node bucket list for a single key. Too slow.

In [159]:
%%timeit -n 5 -r 5
s = SortedDict()
for i in range(1000, -1, -1):
    s[i] = i

1.64 ms ± 491 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [191]:
t0 = time.time()
s = SortedDict()
for i in range(10**6, -1, -1):
    s[i] = Bucket(i)
t1 = time.time()
print(t1-t0)

2.850229024887085


In [193]:
# it costs 458 bytes per bucket to maintain this structure.
# That works out to 0.5 bytes per item, figuring 1000 items in each bucket. 
# Not disastrous.
round(asizeof(s) / 10**6)

458

In [173]:
%%timeit -n 5 -r 5
s.bisect_left(random.choice(range(10**3)))

4.97 µs ± 3.1 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [174]:
%%timeit -n 5 -r 5
bisect_left(ls, random.choice(range(10**5)))

2.45 µs ± 1.08 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [177]:
%%timeit -n 5 -r 5
del s[random.choice(range(10**5))]

5.33 µs ± 1.93 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)
