An interesting and quite simple problem: counting the 1 bits in a buffer.

I think you might get different results if your data is not long enough. I got these from some links and then modified to use numpy and bytearray. These are generally untested.

http://www.valuedlessons.com/2009/01/popcount-in-python-with-benchmarks.html

http://stackoverflow.com/questions/9829578/fast-way-of-counting-bits-in-python

http://blog.philippklaus.de/2014/10/counting-bits-set-to-1-in-bytes-with-python-popcount-or-hamming-weight/

https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation

http://www.expobrain.net/2013/07/29/hamming-weights-python-implementation/

http://stackoverflow.com/questions/8220801/how-to-use-timeit-module

http://graphics.stanford.edu/~seander/bithacks.html

Clearly the best algorithm depends on the data and on the CPU you currently have. Is the data 32-bit words, 64-bit words, or one big blob in a bytearray?
I think it clearly matters how the data is arranged. Can you use big numpy ndarrays, only small arrays, or no arrays at all?

In [1]:
import random, struct
import numpy as np
import gmpy2
from gmpy2 import mpz
# Fixed seed: the buffer is not meant to be random, it must be the same
# on every run so the printed counts are reproducible.
random.seed(1)
d = bytearray(random.randint(0, 255) for _ in range(4096))
print(len(d))

# View the same bytes as 32-bit words ...
v = np.frombuffer(d, dtype=np.uint32)
print(v.shape, v[0])

# ... and as 64-bit words; multiplying by mpz(1) turns the elements into
# gmpy2 integers so that gmpy2.popcount() can be applied to them.
v = np.frombuffer(d, dtype=np.uint64) * mpz(1)
first_word = v[0]
print(v.shape, first_word, gmpy2.popcount(first_word))

def count1s64(d):
    """Count the set bits in buffer ``d``, processed as 64-bit words.

    The buffer is viewed as an array of ``uint64`` words (so its length must
    be a multiple of 8 bytes).  Neighbouring bit fields of each word are then
    summed pairwise, doubling the field width at every step (1, 2, 4, 8, 16
    and finally 32 bits), until each word holds its own bit count.  The
    per-word counts are added up and returned as a NumPy integer.
    """
    # (mask of the low fields, mask of the high fields, field width)
    folds = (
        (0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1),
        (0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2),
        (0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4),
        (0x00FF00FF00FF00FF, 0xFF00FF00FF00FF00, 8),
        (0x0000FFFF0000FFFF, 0xFFFF0000FFFF0000, 16),
    )
    words = np.frombuffer(d, dtype=np.uint64)
    for low_mask, high_mask, width in folds:
        words = (words & low_mask) + ((words & high_mask) >> width)
    # Last fold: the upper half needs no mask, the shift already clears it.
    words = (words & 0x00000000FFFFFFFF) + (words >> 32)
    return words.sum()
# Keep a 64-bit view of the buffer around in `v`, then print the total
# number of set bits in the buffer.
v = np.frombuffer(d, dtype=np.uint64)
total_bits = count1s64(d)
print(total_bits)

4096
(1024,) 1015160900
(512,) 14047262688061562948 29
16203


In [2]:
import numpy as np
import gmpy2
from gmpy2 import mpz

import random, struct

class popcount:
    """Bit-counting (popcount) strategies over raw byte buffers.

    Every ``count1s_*`` function takes a bytes-like object ``d``, views it
    with ``np.frombuffer`` as unsigned 32-bit (``_32`` suffix) or 64-bit
    (``_64`` suffix) words and returns the total number of set bits.  The
    buffer length must therefore be a multiple of 4 or 8 bytes respectively.

    The class is only a namespace: all functions are static and are meant to
    be called through the class, e.g. ``popcount.count1s_bw3_64(buf)``.
    """

    # 16-bit lookup table: TABLE16[i] is the number of set bits in i.
    # It has to be a NumPy array so that it can be indexed with a whole
    # array of words at once.  Built incrementally from
    # popcount(i) = (lowest bit of i) + popcount(i >> 1).
    TABLE16 = np.zeros(2**16, dtype=int)
    for index in range(len(TABLE16)):
        TABLE16[index] = (index & 1) + TABLE16[index >> 1]

    # Name under which the lookup helpers below read the table.
    POPCOUNT_TABLE16 = TABLE16

    @staticmethod
    def popcount32_table16(v):
        """Per-word bit counts of 32-bit words via two 16-bit table lookups."""
        return (popcount.POPCOUNT_TABLE16[ v        & 0xffff] +
                popcount.POPCOUNT_TABLE16[(v >> 16) & 0xffff])

    @staticmethod
    def popcount64_table16(v):
        """Per-word bit counts of 64-bit words via four 16-bit table lookups."""
        return (popcount.POPCOUNT_TABLE16[ v        & 0xffff] +
                popcount.POPCOUNT_TABLE16[(v >> 16) & 0xffff] +
                popcount.POPCOUNT_TABLE16[(v >> 32) & 0xffff] +
                popcount.POPCOUNT_TABLE16[(v >> 48) & 0xffff])

    @staticmethod
    def count1s_lut16_32(d):
        """Count set bits in ``d`` using the int table on 32-bit words."""
        v = np.frombuffer(d, dtype=np.uint32)
        return popcount.popcount32_table16(v).sum()

    @staticmethod
    def count1s_lut16_64(d):
        """Count set bits in ``d`` using the int table on 64-bit words."""
        v = np.frombuffer(d, dtype=np.uint64)
        return popcount.popcount64_table16(v).sum()

    # Same table stored as unsigned bytes (an entry is at most 16), which
    # makes the table itself much smaller.  Has to be an array as well.
    POPCOUNT_TABLE16b = np.zeros(2**16, dtype=np.ubyte)
    for index in range(len(POPCOUNT_TABLE16b)):
        POPCOUNT_TABLE16b[index] = ((index & 1) + POPCOUNT_TABLE16b[index >> 1]) & 0xff

    # The loop variable would otherwise stay behind as a class attribute.
    del index

    @staticmethod
    def popcount32_table16b(v):
        """Per-word bit counts of 32-bit words via the byte-sized table."""
        return (popcount.POPCOUNT_TABLE16b[ v        & 0xffff] +
                popcount.POPCOUNT_TABLE16b[(v >> 16) & 0xffff])

    @staticmethod
    def popcount64_table16b(v):
        """Per-word bit counts of 64-bit words via the byte-sized table."""
        return (popcount.POPCOUNT_TABLE16b[ v        & 0xffff] +
                popcount.POPCOUNT_TABLE16b[(v >> 16) & 0xffff] +
                popcount.POPCOUNT_TABLE16b[(v >> 32) & 0xffff] +
                popcount.POPCOUNT_TABLE16b[(v >> 48) & 0xffff])

    @staticmethod
    def count1s_lut16b_32(d):
        """Count set bits in ``d`` using the byte table on 32-bit words."""
        v = np.frombuffer(d, dtype=np.uint32)
        return popcount.popcount32_table16b(v).sum()

    @staticmethod
    def count1s_lut16b_64(d):
        """Count set bits in ``d`` using the byte table on 64-bit words."""
        v = np.frombuffer(d, dtype=np.uint64)
        return popcount.popcount64_table16b(v).sum()

    # Masks for the bitwise methods: mN selects the low half of every
    # 2N-bit field; h01 has the lowest bit of every byte set.
    m1  = 0x5555555555555555
    m2  = 0x3333333333333333
    m4  = 0x0f0f0f0f0f0f0f0f
    m8  = 0x00ff00ff00ff00ff
    m16 = 0x0000ffff0000ffff
    m32 = 0x00000000ffffffff
    h01 = 0x0101010101010101

    @staticmethod
    def count1s_bw_64(d):
        """Count set bits by summing neighbouring bit fields pairwise.

        The field width doubles at every step (1, 2, 4, 8, 16, 32 bits)
        until each 64-bit word holds its own bit count.
        """
        v = np.frombuffer(d, dtype=np.uint64)
        v = np.bitwise_and(v, popcount.m1) + np.right_shift(np.bitwise_and(v, 0xAAAAAAAAAAAAAAAA), 1)
        v = np.bitwise_and(v, popcount.m2) + np.right_shift(np.bitwise_and(v, 0xCCCCCCCCCCCCCCCC), 2)
        v = np.bitwise_and(v, popcount.m4) + np.right_shift(np.bitwise_and(v, 0xF0F0F0F0F0F0F0F0), 4)
        v = np.bitwise_and(v, popcount.m8) + np.right_shift(np.bitwise_and(v, 0xFF00FF00FF00FF00), 8)
        v = np.bitwise_and(v, popcount.m16) + np.right_shift(np.bitwise_and(v, 0xFFFF0000FFFF0000), 16)
        v = np.bitwise_and(v, popcount.m32) + np.right_shift(v, 32)
        return v.sum()

    @staticmethod
    def count1s_bw3_64(d):
        """Count set bits with three folding steps and one multiplication.

        After the folds every byte holds its own bit count; multiplying by
        ``h01`` accumulates all eight byte counts in the top byte (the
        multiplication wraps modulo 2**64), which the shift by 56 extracts.
        """
        v = np.frombuffer(d, dtype=np.uint64)
        v = v - np.bitwise_and(np.right_shift(v, 1), popcount.m1)
        v = np.bitwise_and(v, popcount.m2) + np.bitwise_and(np.right_shift(v, 2), popcount.m2)
        v = np.bitwise_and(v + np.right_shift(v, 4), popcount.m4)
        v = np.right_shift(v*popcount.h01, 56)
        return v.sum()

    # Constants for counting the bits of a 12-bit value with 64-bit
    # arithmetic: multiply by ma, mask with mb, reduce modulo 0x1f.
    ma = 0x01001001001001
    mb = 0x84210842108421

    @staticmethod
    def count1s_bw64_32(d):
        """Count set bits of 32-bit words, 12 bits at a time (12 + 12 + 8).

        The words are widened to ``uint64`` first: the products with ``ma``
        need up to 60 bits and ``ma`` itself does not fit into ``uint32``,
        so multiplying the ``uint32`` array directly overflows (and raises
        under NumPy 2 scalar promotion rules).
        """
        v = np.frombuffer(d, dtype=np.uint32).astype(np.uint64)
        c = (  np.mod(np.bitwise_and(np.bitwise_and(v, 0xfff) * popcount.ma, popcount.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(np.bitwise_and(v, 0xfff000), 12) * popcount.ma, popcount.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(v, 24) * popcount.ma, popcount.mb), 0x1f)
             )
        return c.sum()

    @staticmethod
    def count1s_gmpy2_64(d):
        """Count set bits by calling ``gmpy2.popcount`` on every 64-bit word.

        Multiplying the array by ``mpz(1)`` turns its elements into gmpy2
        integers, which is what ``gmpy2.popcount`` is applied to.
        """
        v = np.frombuffer(d, dtype=np.uint64)*mpz(1)
        return sum(gmpy2.popcount(a) for a in v)

# Every count1s_* entry point of the class, in alphabetical order.
popcount_methods = sorted(name for name in dir(popcount) if name.startswith('count1s'))

# Sanity check on three 4096-byte buffers with a known answer:
# no bit set, one bit per byte, and all bits set.
for fill in (0, 1, 255):
    d = bytearray([fill]) * 4096
    print([getattr(popcount, name)(d) for name in popcount_methods])

[0, 0, 0, 0, 0, 0, 0, 0]
[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
[32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768]


In [3]:
import timeit
from functools import partial
import random

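count1s_bw3_64 (earlier called count1s64_3) is my current winner for big buffers of random bytes. The byte-table variant (count1s_lut16b_32, earlier count1sb) is faster here than the plain table example, presumably because its lookup array is a lot smaller and easier to handle.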

In [4]:
# Fixed seed: the buffer is not meant to be random, it must be the same
# on every run.
random.seed(1)
d = bytearray(random.randint(0, 255) for _ in range(409600))

# timeit settings used by the timing loop.
number, repeat = 2000, 3

# All methods have to agree on the count before they are timed.
print([getattr(popcount, name)(d) for name in popcount_methods])

[1638868, 1638868, 1638868, 1638868, 1638868, 1638868, 1638868, 1638868]


In [5]:
for a in popcount_methods:
    # The gmpy2 variant is left out: is >*10 lut16_32
    if 'gmpy' in a:
        continue
    timings = timeit.repeat(partial(getattr(popcount, a), d), number=number, repeat=repeat)
    print(a, timings)

count1s_bw3_64 [1.4683719020104036, 1.4148216699977638, 1.4016192650015]
count1s_bw64_32 [8.37341797100089, 7.9708313930023, 8.074154420988634]
count1s_bw_64 [2.4289934129919857, 2.5376295519963605, 2.4208389060077025]
count1s_lut16_32 [2.9401849119894905, 2.9334342200018, 2.944310881997808]
count1s_lut16_64 [3.282487571006641, 3.0218097100005252, 3.066016472002957]
count1s_lut16b_32 [2.5898498320020735, 2.573576871000114, 2.5618909079930745]
count1s_lut16b_64 [3.2609805679967394, 3.2522625860001426, 3.174878998994245]


The situation is a lot tighter when the bytearray is small. It seems to be about as fast to initialize a couple of 32-bit arrays as a couple of 8-bit arrays.

In [6]:
# Fixed seed so the 16-byte buffer is the same on every run.
random.seed(1)
d = bytearray(random.randint(0, 255) for _ in range(16))

# timeit settings used by the timing loop.  The second name was misspelled
# "repeate", so the loop silently kept whatever `repeat` an earlier cell
# had set.
number = 200000
repeat = 3

# All methods have to agree on the count before they are timed.
print([getattr(popcount, a)(d) for a in popcount_methods])

[61, 61, 61, 61, 61, 61, 61, 61]


In [7]:
# Time every method except the gmpy2 one on the current buffer `d`.
for a in popcount_methods:
    if 'gmpy' in a:
        continue
    bound_call = partial(getattr(popcount, a), d)
    print(a, timeit.repeat(bound_call, number=number, repeat=repeat))

count1s_bw3_64 [6.049276787991403, 5.891717788006645, 5.826870687000337]
count1s_bw64_32 [7.775227784004528, 7.900869042001432, 7.896300934007741]
count1s_bw_64 [9.971313382004155, 9.747144970999216, 10.274215188008384]
count1s_lut16_32 [5.353110508003738, 5.32542279700283, 5.0078755549911875]
count1s_lut16_64 [8.285028084996156, 8.521008486000937, 8.355133912991732]
count1s_lut16b_32 [5.082212816996616, 5.206349861997296, 5.399853406008333]
count1s_lut16b_64 [9.021002337991376, 9.078560581998318, 9.17489140899852]


Another variant, where the NumPy vector is created beforehand, outside the timed call.

In [48]:
class popcount_v:
    """Popcount strategies operating on an already-built NumPy word vector.

    Every ``count1s_*`` function takes a one-dimensional array of unsigned
    words and returns the total number of set bits in it.  Functions with a
    ``_32`` suffix expect ``uint32`` words, those with a ``_64`` suffix
    expect ``uint64`` words; the remaining ones work with either.

    Functions whose name contains ``np`` process the whole array with NumPy
    operations; the others convert the array to a list of Python integers
    and loop over it.

    The class is only a namespace: all functions are static and are meant to
    be called through the class, e.g. ``popcount_v.count1s_npbw3_64(vec)``.
    """

    @staticmethod
    def count1s_naive(ve):
        """Test the lowest bit of every word, shifting until it is zero."""
        c = 0
        for v in ve.tolist():
            while v:
                c += v & 1
                v >>= 1
        return c

    @staticmethod
    def count1s_pythonic(ve):
        """Count the "1" characters in the binary representation."""
        c = 0
        for v in ve.tolist():
            c += bin(v).count("1")
        return c

    # 8-bit lookup table as a plain list: TABLE8[i] is the number of set
    # bits in i, built from popcount(i) = (i & 1) + popcount(i >> 1).
    TABLE8 = [0] * 2**8
    for index in range(len(TABLE8)):
        TABLE8[index] = (index & 1) + TABLE8[index >> 1]

    @staticmethod
    def popcount32_table8(v):
        """Bit count of one 32-bit integer via four 8-bit lookups."""
        return (popcount_v.TABLE8[ v        & 0xff ] +
                popcount_v.TABLE8[(v >>  8) & 0xff ] +
                popcount_v.TABLE8[(v >> 16) & 0xff ] +
                popcount_v.TABLE8[(v >> 24)        ])

    @staticmethod
    def popcount64_table8(v):
        """Bit count of one 64-bit integer via eight 8-bit lookups."""
        return (popcount_v.TABLE8[ v        & 0xff ] +
                popcount_v.TABLE8[(v >>  8) & 0xff ] +
                popcount_v.TABLE8[(v >> 16) & 0xff ] +
                popcount_v.TABLE8[(v >> 24) & 0xff ] +
                popcount_v.TABLE8[(v >> 32) & 0xff ] +
                popcount_v.TABLE8[(v >> 40) & 0xff ] +
                popcount_v.TABLE8[(v >> 48) & 0xff ] +
                popcount_v.TABLE8[(v >> 56)        ])

    @staticmethod
    def count1s_lut8_32(ve):
        """Loop over 32-bit words, 8-bit list table."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount32_table8(v)
        return c

    @staticmethod
    def count1s_lut8_64(ve):
        """Loop over 64-bit words, 8-bit list table."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount64_table8(v)
        return c

    # 16-bit lookup table as a plain list.
    TABLE16 = [0] * 2**16
    for index in range(len(TABLE16)):
        TABLE16[index] = (index & 1) + TABLE16[index >> 1]

    @staticmethod
    def popcount32_table16(v):
        """Bit count of one 32-bit integer via two 16-bit lookups."""
        return (popcount_v.TABLE16[ v & 0xffff ] +
                popcount_v.TABLE16[ v >> 16    ])

    @staticmethod
    def popcount64_table16(v):
        """Bit count of one 64-bit integer via four 16-bit lookups."""
        return (popcount_v.TABLE16[ v        & 0xffff] +
                popcount_v.TABLE16[(v >> 16) & 0xffff] +
                popcount_v.TABLE16[(v >> 32) & 0xffff] +
                popcount_v.TABLE16[(v >> 48)         ])

    @staticmethod
    def count1s_lut16_32(ve):
        """Loop over 32-bit words, 16-bit list table."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount32_table16(v)
        return c

    @staticmethod
    def count1s_lut16_64(ve):
        """Loop over 64-bit words, 16-bit list table."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount64_table16(v)
        return c

    # 16-bit lookup table as a NumPy int array; has to be an array so that
    # it can be indexed with a whole vector of words at once.
    TABLE16_npi = np.zeros(2**16, dtype=int)
    for index in range(len(TABLE16_npi)):
        TABLE16_npi[index] = (index & 1) + TABLE16_npi[index >> 1]

    @staticmethod
    def popcount32_table16_npi(v):
        """Bit count(s) of 32-bit word(s) via the NumPy int table."""
        return (popcount_v.TABLE16_npi[ v & 0xffff ] +
                popcount_v.TABLE16_npi[ v >> 16    ])

    @staticmethod
    def popcount64_table16_npi(v):
        """Bit count(s) of 64-bit word(s) via the NumPy int table."""
        return (popcount_v.TABLE16_npi[ v        & 0xffff ] +
                popcount_v.TABLE16_npi[(v >> 16) & 0xffff ] +
                popcount_v.TABLE16_npi[(v >> 32) & 0xffff ] +
                popcount_v.TABLE16_npi[(v >> 48)          ])

    @staticmethod
    def count1s_nplut16i_32(v):
        """Whole-array lookup of 32-bit words in the NumPy int table."""
        return popcount_v.popcount32_table16_npi(v).sum()

    @staticmethod
    def count1s_nplut16i_64(v):
        """Whole-array lookup of 64-bit words in the NumPy int table."""
        return popcount_v.popcount64_table16_npi(v).sum()

    @staticmethod
    def count1s_lut16i_32(ve):
        """Loop over 32-bit words, one NumPy int table lookup per word."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount32_table16_npi(v)
        return c

    @staticmethod
    def count1s_lut16i_64(ve):
        """Loop over 64-bit words, one NumPy int table lookup per word."""
        c = 0
        for v in ve.tolist():
            c += popcount_v.popcount64_table16_npi(v)
        return c

    # 16-bit lookup table as a NumPy array of signed bytes (an entry is at
    # most 16); has to be an array for whole-vector indexing.
    TABLE16_npb = np.zeros(2**16, dtype=np.byte)
    for index in range(len(TABLE16_npb)):
        TABLE16_npb[index] = (index & 1) + TABLE16_npb[index >> 1]

    # The loop variable would otherwise stay behind as a class attribute.
    del index

    @staticmethod
    def popcount32_table16_npb(v):
        """Bit count(s) of 32-bit word(s) via the NumPy byte table."""
        return (popcount_v.TABLE16_npb[ v & 0xffff ] +
                popcount_v.TABLE16_npb[ v >> 16    ])

    @staticmethod
    def popcount64_table16_npb(v):
        """Bit count(s) of 64-bit word(s) via the NumPy byte table."""
        return (popcount_v.TABLE16_npb[ v        & 0xffff ] +
                popcount_v.TABLE16_npb[(v >> 16) & 0xffff ] +
                popcount_v.TABLE16_npb[(v >> 32) & 0xffff ] +
                popcount_v.TABLE16_npb[(v >> 48)          ])

    @staticmethod
    def count1s_nplut16b_32(v):
        """Whole-array lookup of 32-bit words in the NumPy byte table."""
        return popcount_v.popcount32_table16_npb(v).sum()

    @staticmethod
    def count1s_nplut16b_64(v):
        """Whole-array lookup of 64-bit words in the NumPy byte table."""
        return popcount_v.popcount64_table16_npb(v).sum()

    @staticmethod
    def count1s_lut16b_32(ve):
        """Loop over 32-bit words, one NumPy byte table lookup per word."""
        c = 0
        for v in ve.tolist():
            # The lookup yields an 8-bit NumPy scalar.  Convert it to a
            # Python int before accumulating: otherwise the running total
            # can itself become an 8-bit scalar and wrap around at 127.
            c += int(popcount_v.popcount32_table16_npb(v))
        return c

    @staticmethod
    def count1s_lut16b_64(ve):
        """Loop over 64-bit words, one NumPy byte table lookup per word."""
        c = 0
        for v in ve.tolist():
            # See count1s_lut16b_32: keep the running total a Python int.
            c += int(popcount_v.popcount64_table16_npb(v))
        return c

    # Masks for the bitwise methods: mN selects the low half of every
    # 2N-bit field, mNb the high half; h01 has the lowest bit of every
    # byte set.
    m1   = 0x5555555555555555
    m1b  = 0xAAAAAAAAAAAAAAAA
    m2   = 0x3333333333333333
    m2b  = 0xCCCCCCCCCCCCCCCC
    m4   = 0x0f0f0f0f0f0f0f0f
    m4b  = 0xf0f0f0f0f0f0f0f0
    m8   = 0x00ff00ff00ff00ff
    m8b  = 0xff00ff00ff00ff00
    m16  = 0x0000ffff0000ffff
    m16b = 0xffff0000ffff0000
    m32  = 0x00000000ffffffff
    h01  = 0x0101010101010101

    @staticmethod
    def count1s_bw1a_64(ve):
        """Pairwise field sums, masking the high half before shifting."""
        c = 0
        for v in ve.tolist():
            v = (v & popcount_v.m1 ) + ((v & popcount_v.m1b ) >> 1 )
            v = (v & popcount_v.m2 ) + ((v & popcount_v.m2b ) >> 2 )
            v = (v & popcount_v.m4 ) + ((v & popcount_v.m4b ) >> 4 )
            v = (v & popcount_v.m8 ) + ((v & popcount_v.m8b ) >> 8 )
            v = (v & popcount_v.m16) + ((v & popcount_v.m16b) >> 16)
            v = (v & popcount_v.m32) + (v >> 32)
            c += v
        return c

    @staticmethod
    def count1s_npbw1a_64(v):
        """Whole-array version of :func:`count1s_bw1a_64`."""
        v = np.bitwise_and(v, popcount_v.m1 ) + np.right_shift(np.bitwise_and(v, popcount_v.m1b ),  1)
        v = np.bitwise_and(v, popcount_v.m2 ) + np.right_shift(np.bitwise_and(v, popcount_v.m2b ),  2)
        v = np.bitwise_and(v, popcount_v.m4 ) + np.right_shift(np.bitwise_and(v, popcount_v.m4b ),  4)
        v = np.bitwise_and(v, popcount_v.m8 ) + np.right_shift(np.bitwise_and(v, popcount_v.m8b ),  8)
        v = np.bitwise_and(v, popcount_v.m16) + np.right_shift(np.bitwise_and(v, popcount_v.m16b), 16)
        v = np.bitwise_and(v, popcount_v.m32) + np.right_shift(v, 32)
        return v.sum()

    @staticmethod
    def count1s_bw1b_64(ve):
        """Pairwise field sums, shifting first and reusing the low mask."""
        c = 0
        for v in ve.tolist():
            v = (v & popcount_v.m1 ) + ((v >>  1) & popcount_v.m1 )
            v = (v & popcount_v.m2 ) + ((v >>  2) & popcount_v.m2 )
            v = (v & popcount_v.m4 ) + ((v >>  4) & popcount_v.m4 )
            v = (v & popcount_v.m8 ) + ((v >>  8) & popcount_v.m8 )
            v = (v & popcount_v.m16) + ((v >> 16) & popcount_v.m16)
            v = (v & popcount_v.m32) + ((v >> 32) & popcount_v.m32)
            c += v
        return c

    @staticmethod
    def count1s_npbw1b_64(v):
        """Whole-array version of :func:`count1s_bw1b_64`."""
        v = np.bitwise_and(v, popcount_v.m1 ) + np.bitwise_and(np.right_shift(v,  1), popcount_v.m1 )
        v = np.bitwise_and(v, popcount_v.m2 ) + np.bitwise_and(np.right_shift(v,  2), popcount_v.m2 )
        v = np.bitwise_and(v, popcount_v.m4 ) + np.bitwise_and(np.right_shift(v,  4), popcount_v.m4 )
        v = np.bitwise_and(v, popcount_v.m8 ) + np.bitwise_and(np.right_shift(v,  8), popcount_v.m8 )
        v = np.bitwise_and(v, popcount_v.m16) + np.bitwise_and(np.right_shift(v, 16), popcount_v.m16)
        v = np.bitwise_and(v, popcount_v.m32) + np.bitwise_and(np.right_shift(v, 32), popcount_v.m32)
        return v.sum()

    @staticmethod
    def count1s_bw2_64(ve):
        """Three folding steps to per-byte counts, then shift-and-add.

        The byte counts are accumulated into the lowest byte without any
        masking in between; only the final ``& 0x7f`` discards the rest.
        """
        c = 0
        for v in ve.tolist():
            v -= (v >> 1) & popcount_v.m1
            v = (v & popcount_v.m2) + ((v >> 2) & popcount_v.m2)
            v = (v + (v >> 4)) & popcount_v.m4
            v += v >> 8
            v += v >> 16
            v += v >> 32
            c += (v & 0x7f)
        return c

    @staticmethod
    def count1s_npbw2_64(v):
        """Whole-array version of :func:`count1s_bw2_64`."""
        # The subtraction creates a new array, so the in-place additions
        # below never modify the caller's vector.
        v = v - np.bitwise_and(np.right_shift(v, 1), popcount_v.m1)
        v = np.bitwise_and(v, popcount_v.m2) + np.bitwise_and(np.right_shift(v, 2), popcount_v.m2)
        v = np.bitwise_and(v + np.right_shift(v, 4), popcount_v.m4)
        v += np.right_shift(v, 8)
        v += np.right_shift(v, 16)
        v = np.bitwise_and(v + np.right_shift(v, 32), 0x7f)
        return v.sum()

    @staticmethod
    def count1s_bw3_64(ve):
        """Three folding steps to per-byte counts, then one multiplication.

        Multiplying by ``h01`` accumulates all byte counts in the top byte
        of the 64-bit result; Python integers do not wrap, so the product
        is truncated to 64 bits explicitly before the shift.
        """
        c = 0
        for v in ve.tolist():
            v -= (v >> 1) & popcount_v.m1
            v = (v & popcount_v.m2) + ((v >> 2) & popcount_v.m2)
            v = (v + (v >> 4)) & popcount_v.m4
            v = (v * popcount_v.h01 & 0xffffffffffffffff) >> 56
            c += v
        return c

    @staticmethod
    def count1s_npbw3_64(v):
        """Whole-array version of :func:`count1s_bw3_64`.

        Relies on the ``uint64`` multiplication wrapping modulo 2**64.
        """
        v = v - np.bitwise_and(np.right_shift(v, 1), popcount_v.m1)
        v = np.bitwise_and(v, popcount_v.m2) + np.bitwise_and(np.right_shift(v, 2), popcount_v.m2)
        v = np.bitwise_and(v + np.right_shift(v, 4), popcount_v.m4)
        v = np.right_shift(v * popcount_v.h01, 56)
        return v.sum()

    @staticmethod
    def count1s_zero_64(ve):
        """Clear the lowest set bit until the word is zero, counting steps."""
        c = 0
        for x in ve.tolist():
            while x:
                x &= x - 1
                c += 1
        return c

    # Constants for counting the bits of a 12-bit value with 64-bit
    # arithmetic: multiply by ma, mask with mb, reduce modulo 0x1f.
    ma = 0x01001001001001
    mb = 0x84210842108421

    @staticmethod
    def count1s_bw64_32(ve):
        """Loop over 32-bit words, counting 12 + 12 + 8 bits at a time."""
        c = 0
        for v in ve.tolist():
            c += ((((v & 0xfff) * popcount_v.ma & popcount_v.mb) % 0x1f)
                  + ((((v & 0xfff000) >> 12) * popcount_v.ma & popcount_v.mb) % 0x1f)
                  + (((v >> 24) * popcount_v.ma & popcount_v.mb) % 0x1f))
        return c

    @staticmethod
    def count1s_npbw64_32(v):
        """Whole-array version of :func:`count1s_bw64_32`.

        The words are widened to ``uint64`` first: the products with ``ma``
        need up to 60 bits and ``ma`` itself does not fit into ``uint32``,
        so multiplying a ``uint32`` array directly overflows (and raises
        under NumPy 2 scalar promotion rules).
        """
        v = np.asarray(v, dtype=np.uint64)
        c = (  np.mod(np.bitwise_and(np.bitwise_and(v, 0xfff) * popcount_v.ma, popcount_v.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(np.bitwise_and(v, 0xfff000), 12) * popcount_v.ma, popcount_v.mb), 0x1f)
             + np.mod(np.bitwise_and(np.right_shift(v, 24) * popcount_v.ma, popcount_v.mb), 0x1f)
             )
        return c.sum()

    @staticmethod
    def count1s_gmpy2(v):
        """Count set bits by calling ``gmpy2.popcount`` on every word.

        Multiplying the array by ``mpz(1)`` turns its elements into gmpy2
        integers, which is what ``gmpy2.popcount`` is applied to.
        """
        v2 = v*mpz(1)
        return sum(gmpy2.popcount(a) for a in v2)

# Every count1s_* entry point of popcount_v, in alphabetical order.
popcount_v_methods = sorted(name for name in dir(popcount_v) if name.startswith('count1s'))
# Bare expression: displays the list when run as a notebook cell.
popcount_v_methods

['count1s_bw1a_64',
 'count1s_bw1b_64',
 'count1s_bw2_64',
 'count1s_bw3_64',
 'count1s_bw64_32',
 'count1s_gmpy2',
 'count1s_lut16_32',
 'count1s_lut16_64',
 'count1s_lut16b_32',
 'count1s_lut16b_64',
 'count1s_lut16i_32',
 'count1s_lut16i_64',
 'count1s_lut8_32',
 'count1s_lut8_64',
 'count1s_naive',
 'count1s_npbw1a_64',
 'count1s_npbw1b_64',
 'count1s_npbw2_64',
 'count1s_npbw3_64',
 'count1s_npbw64_32',
 'count1s_nplut16b_32',
 'count1s_nplut16b_64',
 'count1s_nplut16i_32',
 'count1s_nplut16i_64',
 'count1s_pythonic',
 'count1s_zero_64']

In [49]:
# Check every method against buffers with a known answer: `size` bytes that
# all hold the same value (0, 1 or 255).
da = (4, 8, 16)
numa = (0, 1, 255)
for size in da:
    # Expected totals for the three fill values: 0, 1 and 8 bits per byte.
    ra = (0 * size, 1 * size, 8 * size)
    for num, expected in zip(numa, ra):
        d = bytearray([num]) * size
        l = []
        for a in popcount_v_methods:
            if size < 8:
                # Too short for a single 64-bit word: the _64 methods are
                # skipped and everything else gets 32-bit words.
                if a.endswith('_64'):
                    continue
                word_type = np.uint32
            elif a.endswith('_32'):
                word_type = np.uint32
            else:
                word_type = np.uint64
            v = np.frombuffer(d, dtype=word_type)
            r = getattr(popcount_v, a)(v)
            l.append({a: r})
            if r != expected:
                print("ERROR:", a, num, r, "!=", expected)
        #print(num, l)

Run through different vector sizes.

In [50]:
# Buffer sizes in bytes and, for each size, the number of timeit calls
# per measurement.
da = (4,      8,      16,     1024,  4096,  409600)
na = (500000, 500000, 100000, 10000, 10000, 1000)
for i, size in enumerate(da):
    # Fixed seed so every size gets the same pseudo-random bytes on each run.
    random.seed(1)
    d = bytearray(random.randint(0, 255) for _ in range(size))
    number = na[i]
    repeat = 3
    l = []
    # Result of the first method that ran; all others must agree with it.
    # None (not 0) marks "nothing recorded yet", so that a genuine count
    # of zero is also accepted as the reference value.
    res = None
    for a in popcount_v_methods:
        # Buffers shorter than 8 bytes cannot form a 64-bit word.
        if a.endswith('_64') and size < 8:
            continue
        if a.endswith('_32') or size < 8:
            v = np.frombuffer(d, dtype=np.uint32)
        else:
            v = np.frombuffer(d, dtype=np.uint64)
        r = getattr(popcount_v, a)(v)
        if res is None:
            res = r
        elif res != r:
            print("ERROR:", a, i, r, "!=", res)
        l.append({a: r})
    print(size, l)

4 [{'count1s_bw64_32': 9}, {'count1s_gmpy2': 9}, {'count1s_lut16_32': 9}, {'count1s_lut16b_32': 9}, {'count1s_lut16i_32': 9}, {'count1s_lut8_32': 9}, {'count1s_naive': 9}, {'count1s_npbw64_32': 9}, {'count1s_nplut16b_32': 9}, {'count1s_nplut16i_32': 9}, {'count1s_pythonic': 9}]
8 [{'count1s_bw1a_64': 29}, {'count1s_bw1b_64': 29}, {'count1s_bw2_64': 29}, {'count1s_bw3_64': 29}, {'count1s_bw64_32': 29}, {'count1s_gmpy2': 29}, {'count1s_lut16_32': 29}, {'count1s_lut16_64': 29}, {'count1s_lut16b_32': 29}, {'count1s_lut16b_64': 29}, {'count1s_lut16i_32': 29}, {'count1s_lut16i_64': 29}, {'count1s_lut8_32': 29}, {'count1s_lut8_64': 29}, {'count1s_naive': 29}, {'count1s_npbw1a_64': 29}, {'count1s_npbw1b_64': 29}, {'count1s_npbw2_64': 29}, {'count1s_npbw3_64': 29}, {'count1s_npbw64_32': 29}, {'count1s_nplut16b_32': 29}, {'count1s_nplut16b_64': 29}, {'count1s_nplut16i_32': 29}, {'count1s_nplut16i_64': 29}, {'count1s_pythonic': 29}, {'count1s_zero_64': 29}]
16 [{'count1s_bw1a_64': 61}, {'count1s_

In [None]:
# Time every method (except the gmpy2 one) for each buffer size and report
# the best throughput of the repeated measurements.
for size, number in zip(da, na):
    # Fixed seed so every size gets the same pseudo-random bytes on each run.
    random.seed(1)
    d = bytearray(random.randint(0, 255) for _ in range(size))
    repeat = 3
    for a in popcount_v_methods:
        if 'gmpy' in a:
            continue
        # Buffers shorter than 8 bytes cannot form a 64-bit word.
        if a.endswith('_64') and size < 8:
            continue
        if a.endswith('_32') or size < 8:
            v = np.frombuffer(d, dtype=np.uint32)
        else:
            v = np.frombuffer(d, dtype=np.uint64)
        r = timeit.repeat(partial(getattr(popcount_v, a), v), number=number, repeat=repeat)
        # Bytes processed per second, taken from the fastest repetition.
        ra = max(size * number / np.array(r))
        if ra > 1024*1024*1.5:
            ra = "{0:.2f}MiB/s".format(ra/1024/1024)
        elif ra > 1024*1.5:
            ra = "{0:.2f}kiB/s".format(ra/1024)
        else:
            # Also covers very slow rates: the value must always end up as
            # a string, because the "{:>12s}" field below rejects a float.
            ra = "{0:.2f}B/s".format(ra)
        print("{:20s} {:6d} {:6d} {:>12s}".format(a, size, number, ra), r)


count1s_bw64_32           4 500000    1.88MiB/s [1.2127806330099702, 1.015080700017279, 1.064274738979293]
count1s_lut16_32          4 500000    3.02MiB/s