In [1]:
_="""
Can we make a HashBox on 10,000 fields? Like, reverse index on doc set?
POC: make "documents" of 3-letter random lowercase strings (17576 possibilities)
and make a HashBox for counts of each word.

Answer: It's possible but bad. A sparse matrix representation is far better in this case.
A basic attempt to sparse-ify this did not improve performance much. Just the wrong tool for that job. 
"""

In [2]:
from hashbox import FrozenHashBox
import random
import string
from functools import partial

In [3]:
def rand_word():
    return ''.join([random.choice(string.ascii_lowercase) for _ in range(3)])


def rand_doc():
    return [rand_word() for _ in range(10)]

docs = [rand_doc() for _ in range(1000)]
docs[0]

['prb', 'xzs', 'xlk', 'fwm', 'rcf', 'szf', 'dfh', 'kck', 'gls', 'kry']

In [4]:
all_words = set()
for d in docs:
    for w in d:
        all_words.add(w)

In [5]:
def word_count(w, doc):
    return doc.count(w)

def make_word_count_function(w):
    p = partial(word_count, w)
    p.__name__ = f'count_{w}'
    return p

w_count_fns = {word: make_word_count_function(word) for word in all_words}

In [6]:
import time
t0 = time.time()
hi = FrozenHashBox(docs, on=list(w_count_fns.values()))
t1 = time.time()
print(t1-t0)

7.700803518295288


In [7]:
len(all_words)

7683

In [13]:
s = next(iter(all_words))

In [14]:
hi.find({w_count_fns[s]:1})

array([list(['pkn', 'uqz', 'wox', 'jgs', 'pfr', 'avb', 'ydl', 'awe', 'qil', 'btn'])],
      dtype=object)