In [1]:
from dataclasses import dataclass
from bisect import bisect_left
import random
import string
import time
import numpy as np
from sortedcontainers import SortedDict
from pympler.asizeof import asizeof
import sortednp as snp
from operator import itemgetter
from typing import List, Optional, Any
from cykhash import Int64Set

### How long does it take to look up 1M objects in a large dictionary?

In [51]:
large = 10**7

# Even at small n_gets, the sortednp lookup is much faster thanks to galloping search.
n_get = 10**6

In [52]:
# make a dict of size 'large'
rand_ints = [int(random.random()*10**12) for _ in range(large)]
d = {rand_ints[i]: 'a' for i in range(large)}
len(d)

9999949

In [53]:
%%timeit -n 5 -r 5
# get 1M of them (list)
_ = [d[ri] for ri in rand_ints[-n_get:]]

352 ms ± 4.51 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [92]:
%%timeit -n 5 -r 5
# get 1M of them (itemgetter)
_ = itemgetter(*rand_ints[-n_get:])(d))

SyntaxError: unexpected EOF while parsing (<unknown>, line 2)

In [82]:
%%timeit -n 5 -r 5
# get 1M of them as np array (itemgetter)
_ = np.array(itemgetter(*rand_ints[-n_get:])(d))

407 ms ± 6.39 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


### How long does it take to get 1M items from a large sorted numpy array?

In [64]:
arr = np.array(range(large))
items_ls = ['a' for _ in range(large)]
items_np = np.array(items_ls)
sub = np.random.choice(arr, size=(n_get,), replace=False)
sub = np.sort(sub)
sub[:10]

array([  4,  16,  30,  37,  43,  69,  73,  97,  98, 107])

In [69]:
%%timeit -n 5 -r 5
# get as numpy array
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = items_np[idxs]

34.4 ms ± 1.9 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [87]:
%%timeit -n 5 -r 5
# get as list, using itemgetter on items_ls
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = list(itemgetter(*idxs)(items_ls))

100 ms ± 2.89 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [88]:
%%timeit -n 5 -r 5
# get as list, using generator on list version
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = list(items_ls[i] for i in idxs)

118 ms ± 4.43 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [72]:
%%timeit -n 5 -r 5
# get as list, converted from numpy array
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = list(items_np[idxs])

239 ms ± 6.23 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [79]:
%%timeit -n 5 -r 5
# get as list, using generator on np version
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = list(items_np[i] for i in idxs)

357 ms ± 10.6 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [76]:
%%timeit -n 5 -r 5
# get as list, using itemgetter on items_np
idxs = snp.intersect(sub, arr, indices=True)[1][1]
_ = list(itemgetter(*idxs)(items_np))

346 ms ± 7.47 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [90]:
# just looking at the output here to confirm it's doing the thing
idxs = snp.intersect(sub, arr, indices=True)[1][1]
arr_found = items_np[idxs]
print(idxs[:10], arr_found[:10])

[  4  16  30  37  43  69  73  97  98 107] ['a' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'a' 'a']


In [91]:
%%timeit -n 5 -r 5
# literally just converting a numpy obj array takes 200ms, ugh
_ = list(arr_found)

203 ms ± 2.74 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


### How long does it take to sort a big numpy array?

In [9]:
some_data = np.array(range(8*10**6))  # we have some sorted data
more_data = np.array(range(2*10**6))  # add a bunch of unsorted at the end
np.random.shuffle(more_data)
big_arr = np.concatenate([some_data, more_data])
len(big_arr)

10000000

In [10]:
%%timeit -n 5 -r 5
# we can sort ~10M items in 1 second. Bit faster if most of them are sorted first, but 1 second generally.
# With a lot of indices, that could get quite expensive in time. A billion-item dataset with 10 indices is
# just not gonna be performant here.
# Doing it on freeze() is OK, doing it on mutation (or deferred to on-query) is not. 
z = np.argsort(big_arr)

514 ms ± 9.03 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


### What's the fastest data structure to run intersect / union on? 
 - set 
 - cykhash set
 - sortednp

In [27]:
exp = 7
sub_exp = exp - 1  # change this to -2 etc. You'll see changes, but relative winners / losers are the same.
ids_1 = list(range(10**exp))
ids_2 = list(range(10**exp-10**sub_exp, 10**exp))

# sorted
np_1 = np.array(ids_1)
np_2 = np.array(ids_2)

# not sorted
random.shuffle(ids_1)
random.shuffle(ids_2)
set_1 = set(ids_1)
set_2 = set(ids_2)
cyk_1 = Int64Set(ids_1)
cyk_2 = Int64Set(ids_2)


#### Intersections

In [28]:
%%timeit -n 5 -r 5
# gotta flip manually - cykhash doesn't do this for you
# note that attempting cyk_1, cyk_2 = cyk_2, cyk_1 breaks in strange ways
if len(cyk_1) < len(cyk_2):
    _ = cyk_1.intersection(cyk_2)
else:
    _ = cyk_2.intersection(cyk_1)

130 ms ± 898 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [29]:
%%timeit -n 5 -r 5
# see, it's slower the other way
if len(cyk_1) > len(cyk_2):
    _ = cyk_1.intersection(cyk_2)
else:
    _ = cyk_2.intersection(cyk_1)

441 ms ± 2.17 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [30]:
%%timeit -n 5 -r 5
_ = set_1.intersection(set_2)

42.5 ms ± 1.18 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [31]:
%%timeit -n 5 -r 5
_ = snp.intersect(np_1, np_2, indices=True)

7.52 ms ± 282 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


#### Unions

In [32]:
%%timeit -n 5 -r 5
# for union, it's better if the bigger set is the "base"
if len(cyk_1) > len(cyk_2):
    _ = cyk_1.union(cyk_2)
else:
    _ = cyk_2.union(cyk_1)

393 ms ± 6.32 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


## Design

There are several configurations. Probably we want two of them. One for mutable case, one for frozen case.

Sorted numpy arrays are great but there's no way to make mutability work with them. On high-cardinality fields where you'd want them the most, they are gonna need re-sorting all the time. On low-card they don't help much. So they're frozen-only.

### Goals:
 - Fast add / remove / update in mutable case
 - Fast lookup (small number of returned items)
 - Fast simple lookup (single-term match, no exclude, 1M+ items)
 - Fast near-simple lookup (get many items, remove a few of them)
 - RAM efficiency (low cardinality)
 - RAM efficiency (high cardinality / in general)
 - RAM efficiency (many indices)

### Mutable case

1. Nothing bucketed
 - Current implementation
 - one cykhash set of obj_ids per field value
 - global dict of {obj_id: obj}
 - Bad for low-cardinality field values
 - Lookups are slow when a simple match hits 1M items

2. Bucketed field values, buckets have obj_id sets. Global dict lookup.
 - one cykhash set of obj_ids per bucket of field values
 - global dict of {obj_id: obj}
 - Good for low-cardinality field values
 - A bit slower to build than current implementation
 - Lookups are slow when a simple match hits 1M items

3. Bucketed field values, buckets have obj_id sets. Uhh, I don't think there's a way to pre-store obj lists in the buckets. Dicts are way too RAM-expensive, and anything else would have bad mutability. So yeah, it's 2 or what we have already.


### Frozen case

1. Nothing bucketed
 - Current implementation
 - one numpy array of obj_ids per field value
 - global dict of {obj_id: obj}
 - Bad for low-cardinality field values
 - Lookups are slow (330ms) when a simple match hits 1M items


2. Bucketed field values, buckets have obj_id arrays. Global np array pair lookup.
 - one numpy array of obj_ids per bucket of field values
 - global sorted parallel arrays of obj_id and obj
 - Good for low-cardinality field values
 - A bit slower to build than current implementation
 - Lookups are tolerable (34ms) when a simple match hits 1M items


3. Bucketed field values, buckets have obj_id arrays and obj arrays. 
 - Each field index contains a full copy of all obj_ids and all obj_id arrays and obj arrays (16 bytes / item / index).
 - Good for low-cardinality field values
 - Complex intersect / union logic
 - Same build time as (2)
 - Lookups are basically instant (0ms) when a simple match hits 1M items


In [33]:
%%timeit -n 5 -r 5
# see? this is the wrong way
if len(cyk_1) < len(cyk_2):
    _ = cyk_1.union(cyk_2)
else:
    _ = cyk_2.union(cyk_1)

2.88 s ± 177 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [34]:
%%timeit -n 5 -r 5
_ = set_1.union(set_2)

295 ms ± 12.9 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [35]:
%%timeit -n 5 -r 5
_ = snp.merge(np_1, np_2, duplicates=snp.DROP)

37.7 ms ± 628 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)
