In [1]:
from __future__ import print_function
import numpy as np
import numba
import awkward

# Seed NumPy's global RNG so the randomly generated test data below is reproducible.
np.random.seed(4)
# Number of rows (sub-lists) in the jagged test arrays built later in the notebook.
nrows = 50000

In [2]:
def comb(counts, content):
    """Sum every unordered pair of distinct elements within each jagged row.

    Parameters
    ----------
    counts : 1-D integer array
        Number of elements in each row; rows are stored back to back along
        the first axis of ``content``.
    content : array
        Flat element storage. Any axes after the first are carried through
        to the output unchanged.

    Returns
    -------
    (out_counts, out) : tuple of arrays
        ``out_counts[k] == counts[k] * (counts[k] - 1) // 2`` is the number
        of pairs produced for row ``k``. ``out`` holds
        ``content[i1] + content[i2]`` for every ``i1 < i2`` inside the same
        row, in row order, and has the same dtype as ``content``.

    The body uses only plain loops and basic NumPy calls so that the same
    function can be handed to ``numba.njit``.
    """
    out_counts = counts*(counts-1)//2
    # Allocate with the input's dtype: np.empty() without a dtype is always
    # float64, which would silently convert integer or float32 content.
    out = np.empty((np.sum(out_counts), ) + content.shape[1:], content.dtype)
    row_start = 0   # index in content of the first element of the current row
    n_written = 0   # next free slot in out
    for c in counts:
        row_stop = row_start + c
        for i1 in range(row_start, row_stop):
            # i2 starts after i1, so each unordered pair is visited once and
            # no element is paired with itself.
            for i2 in range(i1+1, row_stop):
                out[n_written] = content[i1]+content[i2]
                n_written += 1
        row_start = row_stop
    return (out_counts, out)

# numba-compiled (nopython) variant of comb; the pure-Python comb stays bound
# under its own name so both can be benchmarked side by side.
comb_fast = numba.njit(comb)

def awk_comb(awk):
    """Pair-sum using the array's own ``pairs`` method.

    Calls ``awk.pairs(same=False)`` and returns the sum of the resulting
    ``"0"`` and ``"1"`` columns.
    """
    pair_table = awk.pairs(same=False)
    left = pair_table["0"]
    right = pair_table["1"]
    return left + right

def py_comb(awk):
    """Pair-sum via the pure-Python ``comb`` loop.

    Feeds ``awk.counts`` and ``awk.content`` to ``comb`` and wraps the
    returned counts/content in a new ``awkward.JaggedArray``.
    """
    pair_counts, pair_sums = comb(awk.counts, awk.content)
    jagged = awkward.JaggedArray.fromcounts(pair_counts, pair_sums)
    return jagged

def py_comb_fast(awk):
    """Pair-sum via the numba-compiled ``comb_fast`` kernel.

    Feeds ``awk.counts`` and ``awk.content`` to ``comb_fast`` and wraps the
    returned counts/content in a new ``awkward.JaggedArray``.
    """
    pair_counts, pair_sums = comb_fast(awk.counts, awk.content)
    jagged = awkward.JaggedArray.fromcounts(pair_counts, pair_sums)
    return jagged

In [3]:
# Width of each element: every entry of the jagged array is a length-nwide vector.
nwide = 4
# Per-row element counts: exponential draws (scale 2) truncated to int, capped at 20.
counts = np.minimum(np.random.exponential(2, size=nrows).astype(int), 20)
# Flat content: sum(counts) rows of nwide normal samples each.
content = np.random.normal(size=np.sum(counts)*nwide).reshape((-1, nwide))
print(content.shape)

# Jagged array built from the per-row counts and the flat 2-D content.
awk_vector = awkward.JaggedArray.fromcounts(counts, content)

(77794, 4)


In [4]:
# Warm-up call: force numba to compile comb_fast for this input before timing,
# so the %timeit numbers below do not include JIT compilation.
_ = py_comb_fast(awk_vector)

# Benchmarks on the vector-valued array, in order: awkward's pairs(), the
# pure-Python loop, the numba loop wrapped in a JaggedArray, and the bare
# numba kernel without the JaggedArray construction.
%timeit awk_comb(awk_vector)
%timeit py_comb(awk_vector)
%timeit py_comb_fast(awk_vector)
%timeit comb_fast(awk_vector.counts, awk_vector.content)

10 loops, best of 3: 19.9 ms per loop
10 loops, best of 3: 143 ms per loop
100 loops, best of 3: 13.2 ms per loop
100 loops, best of 3: 13.1 ms per loop


In [5]:
# Scalar variant: one normal sample per element, reusing the same counts.
# NOTE: this rebinds the global `content`, replacing the 2-D array used to
# build awk_vector; re-running the vector cell afterwards needs a fresh draw.
content = np.random.normal(size=np.sum(counts))
print(content.shape)

# Jagged array with the same row structure as awk_vector but 1-D content.
awk_scalar = awkward.JaggedArray.fromcounts(counts, content)

(77794,)


In [6]:
# Warm-up call: force numba to compile comb_fast for the 1-D content before
# timing (presumably a separate specialization from the 2-D case above).
_ = py_comb_fast(awk_scalar)

# Same four benchmarks as for the vector case, now on scalar-valued content.
%timeit awk_comb(awk_scalar)
%timeit py_comb(awk_scalar)
%timeit py_comb_fast(awk_scalar)
%timeit comb_fast(awk_scalar.counts, awk_scalar.content)

10 loops, best of 3: 17.6 ms per loop
10 loops, best of 3: 79.1 ms per loop
1000 loops, best of 3: 732 µs per loop
1000 loops, best of 3: 568 µs per loop


In [29]:
# Keep only the vector elements whose matching scalar exceeds 2, then build
# pairs within each row (same=False presumably excludes self-pairs, matching
# comb's i1 < i2 loop -- TODO confirm against the awkward version in use).
pairs = awk_vector[awk_scalar>2].pairs(same=False)
# Add the two members of each pair, flatten away the row structure, and sum
# along axis 1 to get one number per pair.
psum = np.sum((pairs["0"]+pairs["1"]).flatten(), axis=1)
psum.shape

(70,)