In [8]:
import numpy as np
from typing import Tuple, Union, Callable, Any, Iterable
from hashindex.utils import get_field
import random
from hashindex import FrozenHashIndex
from cykhash import Int64Set
import time
from bisect import bisect_left


def get_field(obj, field):
    """Look up ``field`` on ``obj`` and return its value.

    Note: this definition shadows the ``get_field`` imported from
    ``hashindex.utils`` earlier in this cell.

    Args:
        obj: The object to inspect; may be a dict or any other object.
        field: Either a callable, which is invoked as ``field(obj)``, or a
            key / attribute name.

    Returns:
        The result of ``field(obj)`` when ``field`` is callable; otherwise the
        dict entry (for dicts) or the attribute (for other objects) named
        ``field``. ``None`` is returned when the key / attribute is missing.
    """
    # A callable takes priority over every other lookup strategy.
    if callable(field):
        return field(obj)
    # Dicts are looked up by key, everything else by attribute.
    if isinstance(obj, dict):
        return obj.get(field)
    return getattr(obj, field, None)


# Group-size cutoff used by compute_mutable_dict: a value shared by more than
# this many objects stores its object ids in an Int64Set; smaller groups use a tuple.
SIZE_THRESH = 100

In [24]:

def sort_by_hash(
    objs: Iterable[Any], field: Union[Callable, str]
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Extract ``field`` from every obj, hash it, and sort everything by that hash.

    The input is never modified. All three returned arrays are in the same
    (ascending hash) order, so element ``i`` of each refers to the same obj.

    Args:
        objs: The objects to index. Any iterable is accepted; iterables that
            are not a list or tuple are materialized into a list first.
        field: Attribute name / dict key, or a callable applied to each obj
            (see ``get_field``).

    Returns:
        ``(hash_arr, val_arr, obj_arr)``:
         - hash_arr: int64 array of ``hash(value)``, sorted ascending.
         - val_arr: object array of the field values, in the same order.
         - obj_arr: 1-D object array of the objs, in the same order.

    Raises:
        TypeError: if a field value is unhashable.

    Timing notes recorded for the original implementation (1M objs, numeric
    field; may take longer if field is a Callable or is hard to hash):
     - 100ms to do all the get_field() calls. Cost is the part that inspects
       each obj to see if it's a dict.
     - 220ms to get and hash the field for each obj. No getting around that.
     - 100ms to sort the hashes
     - 30ms of whatever
    """
    # len() and a single pass are both needed, so one-shot iterables
    # (generators etc.) must be materialized first.
    if not isinstance(objs, (list, tuple)):
        objs = list(objs)
    n_objs = len(objs)
    hash_arr = np.empty(n_objs, dtype='int64')
    val_arr = np.empty(n_objs, dtype='O')
    # Fill element by element: np.array(objs, dtype='O') would build a 2-D
    # array if the objs happen to be equal-length sequences.
    obj_arr = np.empty(n_objs, dtype='O')
    for i, o in enumerate(objs):
        val = get_field(o, field)
        val_arr[i] = val
        hash_arr[i] = hash(val)
        obj_arr[i] = o
    sort_order = np.argsort(hash_arr)
    # Apply the same permutation to all three arrays so they stay aligned;
    # downstream grouping relies on hash_arr being sorted.
    return hash_arr[sort_order], val_arr[sort_order], obj_arr[sort_order]


def group_by_val(hash_arr: np.ndarray, val_arr: np.ndarray, obj_arr: np.ndarray):
    """Reorder val_arr and obj_arr in place so that equal values are contiguous.

    Precondition: the three arrays are aligned and hash_arr is sorted, so that
    all elements with the same hash form one contiguous run. Within a run, the
    values are usually all equal; only when distinct values collide on the same
    hash does the run need to be regrouped.

    Does not modify hash_arr, as we won't need it past this point.

    Args:
        hash_arr: sorted array of hashes, one per element.
        val_arr: object array of values; reordered in place.
        obj_arr: object array of objs; reordered in place with the same permutation.

    Returns:
        None. val_arr and obj_arr are modified in place.
    """
    def _group_by_val_same_hash(val_arr, obj_arr, p0, p1):
        """Group the subarray [p0, p1), which shares one hash but holds >=2 distinct values.

        Normal tools for doing group_by fail here.
        - We can't assume values are sortable, so can't just sort the values and find change points.
        - We are grouping values that have the same hash, so dict() will be inefficient.

        So keep one index list per distinct value and append each index to its list.
        That is O(n*k), where k = number of distinct values. Few distinct values are
        expected to share a hash; if many do, the user is doing something unusual and
        slow performance is acceptable.
        """
        distinct_vals = []
        val_idx_lists = []  # one list of indices per distinct value, in first-seen order
        for i in range(p0, p1):
            val = val_arr[i]
            try:
                val_idx_lists[distinct_vals.index(val)].append(i)
            except ValueError:
                # first time this value is seen
                distinct_vals.append(val)
                val_idx_lists.append([i])

        # Flatten into one permutation, like argsort output (indices are absolute).
        sort_idxs = [i for idx_list in val_idx_lists for i in idx_list]

        # Fancy indexing copies the right-hand side first, so in-place assignment is safe.
        val_arr[p0:p1] = val_arr[sort_idxs]
        obj_arr[p0:p1] = obj_arr[sort_idxs]

    n = len(hash_arr)
    if n < 2:
        return  # nothing to group

    # Last index of each run of equal hashes.
    run_ends = np.append(np.flatnonzero(hash_arr[1:] != hash_arr[:-1]), n - 1)
    p0 = 0
    for end_i in run_ends:
        p1 = int(end_i) + 1
        # Compare neighbours elementwise (array vs. array) rather than against a
        # scalar: a sequence-valued scalar such as a tuple would be broadcast.
        # Any differing neighbour pair means the run holds a hash collision.
        if p1 - p0 > 1 and np.any(val_arr[p0 + 1:p1] != val_arr[p0:p1 - 1]):
            _group_by_val_same_hash(val_arr, obj_arr, p0, p1)
        p0 = p1


def run_length_encode(val_arr: np.ndarray):
    """
    Find the runs of equal values in val_arr via run-length encoding.

    val_arr is expected to already have equal values stored contiguously
    (sorted / grouped); each maximal stretch of equal neighbours is one run.

    Args:
        val_arr: 1-D array of values.

    Returns:
        ``(starts, counts, vals)``: for each run, its first index, its length,
        and its value. All three are empty when val_arr is empty.

    The original implementation was measured at 10ms for 1M objs.
    """
    n = len(val_arr)
    if n == 0:
        # Indexing an empty array with the sentinel end point would raise IndexError.
        return np.empty(0, dtype='int64'), np.empty(0, dtype='int64'), val_arr[:0].copy()

    # Index of the last element of each run; the final element always ends a run.
    change_pts = np.append(np.flatnonzero(val_arr[1:] != val_arr[:-1]), n - 1)
    # Run length = distance between consecutive run ends (-1 is the virtual end before index 0).
    counts = np.diff(np.append(-1, change_pts))
    starts = change_pts - counts + 1
    return starts, counts, val_arr[change_pts]


def compute_mutable_dict(objs, field):
    """Create a dict of {val: obj_ids}. Used when creating a mutable index.

    Args:
        objs: the objects to index.
        field: attribute name / dict key, or a callable, identifying the value
            to index each obj by.

    Returns:
        A dict mapping each distinct field value to the ``id()`` of every obj
        having that value. Groups with more than SIZE_THRESH members are stored
        as an Int64Set; smaller groups are stored as a tuple.
    """
    sorted_hashes, sorted_vals, sorted_objs = sort_by_hash(objs, field)
    if len(sorted_vals) == 0:
        return dict()  # nothing to index
    # Make equal values contiguous (handles hash collisions), then find the runs.
    group_by_val(sorted_hashes, sorted_vals, sorted_objs)
    starts, counts, unique_vals = run_length_encode(sorted_vals)
    d = dict()
    for val, start, count in zip(unique_vals, starts, counts):
        obj_ids = (id(obj) for obj in sorted_objs[start:start + count])
        if count > SIZE_THRESH:
            d[val] = Int64Set(obj_ids)
        else:
            d[val] = tuple(obj_ids)
    return d


In [25]:

class Collider:
    """Test object whose hash deliberately collides.

    Each instance holds a random ``n`` drawn from ``VALS``. The hash is
    ``n % 2``, so instances fall into just two hash buckets, while equality
    compares ``n`` itself.
    """

    VALS = list(range(10))

    def __init__(self):
        self.n = random.choice(self.VALS)

    def __hash__(self):
        # Only two possible hashes, forcing distinct values to collide.
        return self.n % 2

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of raising
        # AttributeError on a missing ``n``.
        if not isinstance(other, Collider):
            return NotImplemented
        return self.n == other.n

    def __str__(self):
        return str(self.n)


In [28]:
# Build 20 Collider objects and sort them by the hash of their integer attribute 'n'.
# NOTE(review): with field = 'n' the values being hashed are plain ints, not the
# Collider instances, so Collider.__hash__ collisions are not exercised here --
# confirm whether a callable field (e.g. one returning the object itself) was intended.
# NOTE(review): no random seed is set, so the values differ from run to run.
objs = [Collider() for _ in range(20)]
# objs = [{'n': i%2} for i in range(10)]
field = 'n'

sorted_hashes, sorted_vals, sorted_objs = sort_by_hash(objs, field)


In [29]:
sorted_vals

array([0, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9, 9],
      dtype=object)

In [30]:
# group_by_val reorders sorted_vals / sorted_objs in place and returns nothing,
# so display sorted_vals again to inspect the result.
group_by_val(sorted_hashes, sorted_vals, sorted_objs)
sorted_vals

array([0, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 9, 9, 9],
      dtype=object)

In [24]:
# Benchmark fixture: one million random floats, as a Python list and as an ndarray.
# Note that this rebinds `objs`.
objs = [random.random() for _ in range(10**6)]
rarr = np.array(objs)

In [25]:
%%timeit -n 3 -r 3
np.argsort(rarr)

109 ms ± 2.1 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [26]:
sort_order = np.argsort(rarr)

In [27]:
%%timeit -n 3 -r 3
x = rarr[sort_order]

6.58 ms ± 1.8 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [2]:
# Benchmark fixture: one million dicts, each holding a random float under key 'a'.
objs = [{'a': random.random()} for _ in range(10**6)]

In [60]:
from hashindex import FrozenHashIndex

# Time how long it takes to build a FrozenHashIndex on field 'a' over objs.
t0 = time.time()
fh = FrozenHashIndex(objs, ['a'])
print(time.time()-t0)
# Build-time notes, in seconds (presumably for this same build -- confirm):
#  - 4.4 was observed at one point, which is far too slow. Suspected cause: a
#    separate lookup is made for each value instead of ranging over the values.
#  - For comparison with earlier versions (presumably hashindex releases):
#    2.0 at 0.5.2, 2.0 at 0.5.1 (unconfirmed), and 1.8 at 0.5.0.
#  - The current idea is slow overall; look for a simpler way to do it.

1.2211081981658936


In [20]:
# Convert each float to an integer key via round(x * 10**6).
# NOTE(review): this list is not sorted here; bisect_left only returns a meaningful
# position on sorted input, so the bisect timing cell may be measuring cost on
# unsorted data -- confirm the execution order.
a = [round(obj['a']*10**6) for obj in objs]

In [9]:
%%timeit
bisect_left(a, 0.5)

274 ns ± 1.52 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [10]:
# ok that is a GREAT idea.
# don't prebuild the tinies, just bisect them out of the big array.


In [58]:
# just doing a big dict-of-set takes > 1 second
# are we adding value on top of that that's worth slightly more time?
# I think so, the RAM usage is pretty good. Don't despair.
# 2 seconds is really fast here. You are doing great.
t0 = time.time()
a = sorted(a)  # equal values become contiguous runs
d = dict()
run_start = 0
for i in range(1, len(a) + 1):
    # A run ends at the end of the list or where the value changes.
    if i == len(a) or a[i] != a[run_start]:
        run = a[run_start:i]
        # Small runs are stored as tuples, large ones as Int64Set.
        if len(run) < SIZE_THRESH:
            d[a[run_start]] = tuple(run)
        else:
            d[a[run_start]] = Int64Set(run)
        run_start = i
t1 = time.time()
print(t1-t0)
# Sanity check: every element must land in exactly one group.
print(sum(len(v) for v in d.values()))

1.0137348175048828
1000000


In [59]:
from pympler.asizeof import asizeof
# Total size of the dict and everything it references, divided by 10**6
# (i.e. MB, assuming asizeof reports bytes -- confirm against the pympler docs).
asizeof(d) / 10**6

86.2484