In [95]:
from bisect import bisect_left,bisect_right
import numpy as np
from cykhash import Int64Set
import random
from BTrees.OOBTree import OOBTree
from pympler.asizeof import asizeof

### Numpy array tests

We'll use a sorted numpy array and bisection to mimic a tree.
- Check what happens when we have unsortable types
- Write a quick bisect implementation that gets array indices etc

In [23]:
# Mixed objects raise TypeError: tuples and a str cannot be ordered against
# each other, so sorting the object array fails.
weird_objs = [(i, i + 1) for i in range(10)]
weird_objs.append('astring')
arr = np.array(weird_objs, dtype=object)
try:
    np.argsort(arr)
except TypeError as err:
    print(err)

'<' not supported between instances of 'str' and 'tuple'


In [26]:
# Non-comparable objects raise TypeError
class AnObj:
    """Minimal object that defines no ordering methods.

    Every instance carries ``x = 3`` and hashes to the constant 3, so
    instances are hashable but cannot be compared with ``<``.
    """

    def __init__(self):
        # the only piece of instance state
        self.x = 3

    def __hash__(self):
        # constant hash: all instances collide on purpose
        return 3
# AnObj defines only __init__ and __hash__, so the comparisons that
# argsort performs on the object array fail.
arr = np.array([AnObj() for _ in range(2)])
try:
    np.argsort(arr)
except TypeError as err:
    print(err)

'<' not supported between instances of 'AnObj' and 'AnObj'


### BTree Tests

Check that we can make a reasonable BTree implementation and make sure it performs well enough

In [42]:
# Small smoke test: keys 0..9, each mapped to the string 's<key>'.
bt = OOBTree()

for f in range(10):
    bt.insert(f, f's{f}')

In [45]:
# Range query over keys 4..6; both bounds are included (the output shows s4, s5, s6).
for item in bt.values(4, 6):
    print(item)

s4
s5
s6


In [83]:
# A million records with random keys, then two *separate* records that share
# the key 10 (distinct objects, so their ids differ).
floats = [{'x': random.random()} for _ in range(10**6)]
floats.extend({'x': 10} for _ in range(2))  # a million + 2 items
ids = list(map(id, floats))
bt = OOBTree()

In [84]:
%%time
# Index every record: key is its 'x' value, value is the record's id.
for rec, rec_id in zip(floats, ids):
    bt.insert(rec['x'], rec_id)

CPU times: user 2 s, sys: 0 ns, total: 2 s
Wall time: 2 s


In [85]:
# OK, so keys are expected to be unique. We'll have to handle that.
# 1,000,002 records were inserted but only 1,000,001 keys are stored (see the
# output): the two records with key 10 collapsed into a single entry.
print(bt.get(2))
print(len(bt))

None
1000001


In [86]:
2 in bt

False

In [92]:
def add(bt, key, value):
    """Store ``value`` under ``key`` without losing values for repeated keys.

    The first value seen for a key is stored bare. When a second value
    arrives, the entry is promoted to an ``Int64Set`` holding both; later
    values are added to that set in place.

    Args:
        bt: mapping supporting ``in``, ``.get`` and item assignment
            (the notebook passes an ``OOBTree``).
        key: lookup key.
        value: value to record. Once a key repeats it must be something
            ``Int64Set`` accepts (the notebook passes ``id()`` integers).
    """
    if key not in bt:
        # first time this key is seen: store the bare value
        bt[key] = value
        return

    existing = bt.get(key)
    if type(existing) is not Int64Set:
        # second value for this key: promote the bare value to a set
        bt[key] = Int64Set([existing, value])
        return

    # key already holds a set: extend it in place
    existing.add(value)
        
# Rebuild the index with add() so records sharing a key keep all their ids.
bt = OOBTree()
for rec, rec_id in zip(floats, ids):
    add(bt, rec['x'], rec_id)

In [93]:
bt.get(10)

{140121445528896,140121445528960}

In [99]:
# can't measure ram usage directly, it needs to be external.
asizeof(bt)

128

In [100]:
%%timeit 
bt.get(10)

614 ns ± 10.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [104]:
# Build the dict counterpart of the BTree index with the same add() helper, so
# keys shared by several records (here: 10) keep all of their ids.
# The previous version pre-seeded d[10] with hard-coded ids from an earlier
# session and then overwrote that entry with a single id inside the loop.
d = {}
for f in floats:
    add(d, f['x'], id(f))

In [105]:
%%timeit 
d.get(10)
# dicts are an order of magnitude faster on lookup. Food for thought there.

50 ns ± 0.776 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [129]:
len(list(bt.values(0.11, 0.12)))

9987

In [130]:
%%timeit
_ = list(bt.values(0.11, 0.12))
# getting a range of objects is decently fast... 


356 µs ± 4.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [131]:
# but doing that same number of dict lookups actually isn't much slower!
# 50 ns per dict lookup (timed above) x 9987 keys in the range, converted ns -> µs.
micros = round(50 * 9987 / 1_000)
print(f"that many dict lookups would take {micros}µs")

that many dict lookups would take 499µs


In [143]:
vals = list(set(f['x'] for f in floats))

In [145]:
%%timeit
# let's make it a more fair test -- we do need to prefilter for the range.
r = [v for v in vals if 0.11 < v < 0.12]
_ = [d[v] for v in r]

126 ms ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [147]:
%%timeit
# Variant of the prefilter: apply the upper bound first, then the lower bound
# on the survivors.
r1 = [v for v in vals if v < 0.12]
r2 = [v for v in r1 if v > 0.11]
_ = list(d[v] for v in r2)
# splitting the prefilter into two passes doesn't help (144 ms vs 126 ms for the single pass).

144 ms ± 3.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [148]:
r = [v for v in vals if v < 0.12 and v > 0.11]

In [150]:
%%timeit
_ = list(d[v] for v in r)

2.6 ms ± 637 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [140]:
# `ls` is never defined in this notebook (leftover kernel state); count the
# prefiltered keys held in `r` instead.
len(r)

9987

In [151]:
# yeah, filtering the 1M objects to find the 10k keys that fit the constraints is very slow.
# Range queries should always be btree if possible.
# what about hash queries? How much worse are those on btree?
# The keys are unique in the btree... so it's kinda the same thing...?

In [153]:
# So ok. We're sure we want value-based lookups in frozen. Maybe?
# When do we use them?
# Right... when vals are comparable n stuff
# ok yeah, use the wordle thingy as your perf test on the value-version of Frozen.

In [154]:
# whoa hey, the conversion story from btree <-> dict-of-set is a pretty good one
# they both support [] so that's okay
# how much is 10K conversions, you think?

In [155]:
# 10K random float -> float pairs, used to time a dict -> BTree conversion.
d = dict((random.random(), random.random()) for _ in range(10000))

In [163]:
%%timeit
bt = OOBTree()
for k, v in d.items():
    bt[k] = v

# duuuuuuude. We can *do* this. Changing it over on a single add is easy. 4ms to convert 10K keys is *nothing*!

3.74 ms ± 75.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [162]:
# Proposed design...
# Frozen:
# - it's an array-as-tree-of-set if the values are comparable. 
# - Otherwise fail over to arrays-as-dict-of-set.
# Mutable:
# - Starts life as dict-of-set
# - When 10K unique keys reached, try converting to BTree.
#   - If it fails, never try again; these values are too weird.
#   - If it succeeds, it's now a BTree, even if values are added / removed later.
#   - If it's a BTree, and a new un-comparable value arrives, what happens? Convert back to dict? Or raise TypeError?

In [175]:
_="""
But what does all that *do* exactly?
- Makes range queries good
- Increases code complexity
- Consider: Do we just want BTrees everywhere? 500ns vs 50ns per key lookup may be 10x, but it's still
a small part of our overall cost. Doing intersections and id->obj lookups are what eats the real milliseconds. 
- I am emotionally attached to the hash implementation. Sunk cost. But I bet 0 of the tests would fail on a 
tree-based implementation. 
- Also having a bad brain day
- Also overwhelmed by the idea of replacing so much code at once
  - Maybe needs to be cut up smaller to feel doable
- Is it really so bad to just replace `dict` with `OOBTree`? 
"""

In [165]:
from filterbox import FilterBox

In [169]:
sinks = [{'x': round(z['x']*10)} for z in floats]

In [168]:
%%time
fb = FilterBox(floats, on='x')
# it's slow because so many distincts. Sigh.

CPU times: user 2.3 s, sys: 83.2 ms, total: 2.38 s
Wall time: 2.38 s


In [170]:
%%time
fb = FilterBox(sinks, on='x')
# yeah see that's faster. Also insertion into the BTree is gonna be slow with add().

CPU times: user 1.34 s, sys: 50 ms, total: 1.39 s
Wall time: 1.39 s


In [174]:
%%time
bt = OOBTree()
for s in sinks:
    add(bt, s['x'], id(s))
# what that's not fair how is this so much better

CPU times: user 684 ms, sys: 15 µs, total: 684 ms
Wall time: 683 ms


In [173]:
len(bt[0])

49585

In [186]:
# String keys follow Python's str ordering, where uppercase letters sort before
# lowercase ones: 'dd' and 'e' land after 'Zapdos' and fall outside the range.
bt = OOBTree()
for rank, name in enumerate(('Eevee', 'Pikachu', 'Zapdos', 'dd', 'e'), start=1):
    bt.insert(name, rank)

list(bt.values('Eevee', 'Zapdos'))

[1, 2, 3]