In [61]:
_="""
Gotta make the bucket split / remove logic perfect.

Problems to solve:
- Initialization needs correct buckets
- Buckets must remain correct as they are split and removed
- 

Correct buckets:
- Must span the entire hash space with no gaps
- No 2 buckets should have the same hash
- OK more rules: 
  - A bucket must cover the leftmost value (HASH_MIN)
  - A bucket must cover each hash value that is == (1 + a DictBucket hash value)
  - We could just leave 0-sized buckets alone? nooooo. Bad idea.
  - On item removal, if bucket size < min, merge it with adjacent buckets if possible.
  
Bonus:
Is it worth adding merge() logic as items are removed? Right now it is possible to end up with a lot of
buckets of size 1, if all but 1 element were removed. That would still be correct, just inefficient on RAM.

"""

In [62]:
from sortedcollections import SortedDict
from collections import Counter
from bisect import bisect_left, bisect_right

In [3]:
from sortedcontainers import SortedDict as SD

In [4]:
# Sanity check: the SortedDict imported from sortedcollections is the very same
# class object as sortedcontainers' SortedDict (the recorded output is True).
SD == SortedDict

True

Set up a toy problem - the only complexity is in the part we're trying to solve. 

The bucket contents are intentionally simplified here.

In [30]:
# Key of the initial bucket in BucketList, and the starting point for fix_gaps.
hash_min = 0
# Modulus used to turn a value into its hash (val % hash_max).
hash_max = 10
# NOTE(review): not referenced by any code visible in this notebook.
num_range = 100

# check_max flags a WideBucket once it holds more than this many items.
bucket_max_size=5
# check_min flags any bucket holding fewer than this many items.
bucket_min_size=1  # ?

class TallBucket:
    """Bucket whose values all share one hash; it has no size limit.

    The hash is ``value % hash_max`` and is fixed by the first value added.
    Duplicate values are stored as a count rather than as separate entries.
    """

    def __init__(self):
        self.counter = Counter()  # value -> number of copies held
        self.my_hash = None  # set by the first add()

    def add(self, value):
        """Store one copy of ``value``.

        Raises:
            AssertionError: if ``value`` hashes differently from the values
                already in this bucket.
        """
        value_hash = value % hash_max
        if self.my_hash is None:
            self.my_hash = value_hash
        assert value_hash == self.my_hash, "value does not belong to this bucket"
        self.counter[value] += 1

    def remove(self, value):
        """Drop one copy of ``value``.

        Raises:
            KeyError: if ``value`` is not in the bucket.
        """
        # Counter returns 0 for a missing key without inserting it; decrementing
        # from there would leave a -1 entry and make len() invalid.
        if self.counter[value] <= 0:
            raise KeyError(value)
        self.counter[value] -= 1
        if self.counter[value] == 0:
            del self.counter[value]

    def __len__(self):
        """Total number of stored items, counting duplicates."""
        return sum(self.counter.values())

    
class WideBucket:
    """Bucket that stores values from a range of hashes.

    The bucket does not enforce a size cap itself; BucketList.check_max
    compares its length against ``bucket_max_size``.
    """

    def __init__(self):
        self.counter = Counter()  # value -> number of copies held

    def add(self, value):
        """Store one copy of ``value``."""
        self.counter[value] += 1

    def remove(self, value):
        """Drop one copy of ``value``.

        Raises:
            KeyError: if ``value`` is not in the bucket.
        """
        # Counter returns 0 for a missing key without inserting it; decrementing
        # from there would leave a -1 entry and make len() invalid.
        if self.counter[value] <= 0:
            raise KeyError(value)
        self.counter[value] -= 1
        if self.counter[value] == 0:
            del self.counter[value]

    def __len__(self):
        """Total number of stored items, counting duplicates."""
        return sum(self.counter.values())


class BucketList:
    """Sorted collection of buckets, keyed by the lowest hash each one covers.

    A value is routed to the bucket with the greatest key that is <= the
    value's hash (``val % hash_max``). Splitting oversized buckets and merging
    undersized ones is not implemented yet: ``check_max`` and ``check_min``
    only detect the condition.
    """

    def __init__(self):
        self.buckets = SortedDict()  # bucket start hash -> bucket object
        # Start with a single WideBucket anchored at hash_min.
        self.buckets[hash_min] = WideBucket()

    def _find_bucket_key(self, val):
        """Return the key of the bucket responsible for ``val``.

        Raises:
            KeyError: if the hash of ``val`` is below the smallest bucket key.
        """
        val_hash = val % hash_max
        # bisect_right counts the keys <= val_hash; the last of those owns it.
        idx = self.buckets.bisect_right(val_hash) - 1
        if idx < 0:
            # Without this guard, index -1 would silently wrap to the last bucket.
            raise KeyError(val_hash)
        return self.buckets.keys()[idx]

    def add(self, val):
        """Add ``val`` to its bucket, then run the max-size check on that bucket."""
        bucket_key = self._find_bucket_key(val)
        self.buckets[bucket_key].add(val)
        self.check_max(bucket_key)

    def remove(self, val):
        """Remove ``val`` from its bucket, then run the min-size check on that bucket."""
        bucket_key = self._find_bucket_key(val)
        self.buckets[bucket_key].remove(val)
        self.check_min(bucket_key)

    def check_max(self, bucket_key):
        """Detect a WideBucket that holds more than ``bucket_max_size`` items.

        Other bucket types are never flagged. The fix itself is still a TODO.
        """
        b = self.buckets[bucket_key]
        if isinstance(b, WideBucket) and len(b) > bucket_max_size:
            # TODO: invariant violated. Fix it (no split logic exists yet).
            pass

    def check_min(self, bucket_key):
        """Detect a bucket that holds fewer than ``bucket_min_size`` items.

        Only the neighbor lookup exists so far; the merge plan is sketched in
        the comments below.
        """
        b = self.buckets[bucket_key]
        if len(b) < bucket_min_size:
            # Invariant violated. Planned fix (not implemented yet).
            # NOTE(review): "hash bucket" / "DictBucket" presumably correspond to
            # WideBucket / TallBucket -- confirm.
            left_key, right_key = self.get_neighbors(bucket_key)
            # Case 1: There is a hash bucket to the left.
            # Just move any bucket's items there, and delete this bucket, no rearrange needed.
            # Case 2: The left neighbor is nonexistent, or is a DictBucket. Let's look right.
            #   Case 2a. The right neighbor is nonexistent, or is a DictBucket.
            #       We cannot delete this bucket, even if empty. Done.
            #   Case 2b. The right neighbor is a hash bucket. We can extend it left to cover this space.

    def get_neighbors(self, bucket_key):
        """Return ``(left_key, right_key)``: the bucket keys adjacent to ``bucket_key``.

        Either element is ``None`` when there is no bucket on that side.
        """
        keys = self.buckets.keys()
        # Number of keys strictly below bucket_key; the last of those is the left neighbor.
        left_count = self.buckets.bisect_left(bucket_key)
        left_key = keys[left_count - 1] if left_count > 0 else None
        # Index of the first key strictly above bucket_key.
        right_idx = self.buckets.bisect_right(bucket_key)
        right_key = keys[right_idx] if right_idx < len(keys) else None
        return left_key, right_key

In [60]:
# Sample plan for fix_gaps: (starting hash, bucket type) pairs in ascending hash
# order. Per the comments in fix_gaps, 'd' marks a dictbucket and 'h' a hashbucket.
bucket_plans = [
    (10, 'd'),
    (30, 'h'),
    (50, 'h'),
    (70, 'd'),
    (90, 'd')
]

def fix_gaps(bucket_plans):
    """Return a copy of ``bucket_plans`` with every uncovered hash range filled.

    ``bucket_plans`` is a sequence of ``(start_hash, type)`` pairs, where type
    'd' is a dictbucket and 'h' a hashbucket. Coverage starts at the global
    ``hash_min``. A gap in front of an 'h' entry is closed by moving that entry's
    start to the left; a gap in front of any other entry is closed by inserting
    a new 'h' entry. After each 'd' entry the hash ``start + 1`` is treated as
    needing cover. Each input entry is printed as it is processed.
    """
    result = []
    uncovered_from = hash_min  # first hash still lacking a bucket; None = nothing pending
    for plan in bucket_plans:
        print(plan)
        start, kind = plan
        has_gap = uncovered_from is not None and start > uncovered_from
        if has_gap and kind == 'h':
            # stretch this hashbucket leftwards over the gap
            start = uncovered_from
        elif has_gap:
            # a dictbucket cannot stretch, so put a hashbucket in front of it
            result.append((uncovered_from, 'h'))
        result.append((start, kind))
        # only a dictbucket leaves the hash right after it uncovered
        uncovered_from = start + 1 if kind == 'd' else None
    # a trailing dictbucket still needs a hashbucket after it
    if uncovered_from is not None:
        result.append((uncovered_from, 'h'))
    return result

# Demo run on the sample plan: prints the gap-free plan returned by fix_gaps.
print(fix_gaps(bucket_plans))

(10, 'd')
(30, 'h')
(50, 'h')
(70, 'd')
(90, 'd')
[(0, 'h'), (10, 'd'), (11, 'h'), (50, 'h'), (70, 'd'), (71, 'h'), (90, 'd'), (91, 'h')]


In [None]:
def fix_gaps(bucket_plans):
    """Return a copy of ``bucket_plans`` with every uncovered hash range filled.

    NOTE: this cell redefines the ``fix_gaps`` from the previous cell; once it
    runs, this version is the one in effect.

    Args:
        bucket_plans: sequence of ``(start_hash, type)`` pairs, where type 'd'
            is a dictbucket and 'h' a hashbucket. Assumed to be in ascending
            hash order with no repeated hashes; this is not checked.

    Returns:
        A new list of ``(start_hash, type)`` pairs. Coverage starts at the
        global ``hash_min``, and each 'd' entry is followed by an entry at
        ``start_hash + 1``.
    """
    fixed_plans = []
    # First hash that still needs a bucket, or None when nothing is pending.
    next_needed = hash_min
    for mh, btype in bucket_plans:
        # resolve any gap in front of this bucket
        if next_needed is not None and mh > next_needed:
            if btype == 'h':
                # expand this hashbucket to the left
                mh = next_needed
            else:
                # a dictbucket cannot expand; add a hashbucket to fill the gap
                fixed_plans.append((next_needed, 'h'))
        fixed_plans.append((mh, btype))
        # only a dictbucket leaves the hash right after it uncovered
        next_needed = mh + 1 if btype == 'd' else None
    # a trailing dictbucket still needs a hashbucket after it
    if next_needed is not None:
        fixed_plans.append((next_needed, 'h'))
    return fixed_plans


In [None]:
# Scratch SortedDict for experimenting; nothing visible in this notebook uses `sd` yet.
sd = SortedDict()
