In [1]:
import math
import struct
import random


In [2]:
def float_to_int_bits(f):
    """Convert a floating-point number to its bit representation as an integer."""
    return int.from_bytes(bytearray(struct.pack('>f', f)), byteorder='big')

# h

In [3]:
def h(value, t, a, b):
    """Hash function for float numbers that maps uniformly to the range {1, 2, ..., t}."""
    
    int_value = float_to_int_bits(value)
    
    hashed_value = (a * int_value + b) % t 

    return hashed_value

# g

In [4]:
def g(value, p, a, b, c, d):
    """Polynomial hash function for float numbers that maps uniformly to {-1, +1}."""

    int_value = float_to_int_bits(value)

    result = (
        a * pow(int_value, 3, p) + b * pow(int_value, 2, p) + c * int_value + d
    ) % p
    return 2 * (result % 2) - 1

In [12]:

def ams_sketch(vector, t, d):
    # d rows of length t
    sketch = [[0 for _ in range(t)] for _ in range(d)]
    
    hash_funcs = []
    p = 70379 # one p for all g hash funcs
    
    # create d (h, g) hash function pairs.
    for _ in range(d):
        a_h = random.randint(1, 2 ** 32 - 1)
        b_h = random.randint(0, 2 ** 32 - 1)
        
        a_g = random.randint(1, p - 1)
        b_g = random.randint(1, p - 1)
        c_g = random.randint(1, p - 1)
        d_g = random.randint(1, p - 1)
               
        hash_funcs.append((
            lambda value: h(value, t, a_h, b_h), 
            lambda value: g(value, p, a_g, b_g, c_g, d_g)
        ))
    
    
    for v_i in vector:
        for i, (h_hash, g_hash) in enumerate(hash_funcs): # for each hash row
            sketch[i][h_hash(v_i)] += g_hash(v_i) * v_i
    
    return sketch, hash_funcs

In [13]:
sketch, hash_funcs = ams_sketch([0.3, 0.4, 0.5, 1.11], 10, 4)

20829
11845
2609
13724


In [14]:
for i, (h_hash, g_hash) in enumerate(hash_funcs): # for each hash row
    print(h_hash[i*0.3])

TypeError: 'function' object is not subscriptable

# VSAM

In [180]:
# refactor

from math import sqrt
import numpy as np
from collections import Counter as sparse


class hash_family:
    def __init__(self, depth):
        # number of hash functions
        self.depth = depth
        
        # F[0] : a1 , F[1] : b1 , F[2] : a1 , F[3] : b1 , F[4] : a3 , F[5] : b3 
        self.F = np.random.randint(0, 1 << 63 - 1, size=(6, depth), dtype=np.int64)

    @staticmethod
    def hash31(a, b, x):
        r = a * x + b
        
        # int divide by 2^31 (Shift 31) + combine higher order bits with lower
        fold = ((r >> 31) ^ r)
        
        # 2147483647 = 0111...1
        return fold & 2147483647

    def hash(self, x):
        F = self.F
        return self.hash31(F[0], F[1], x)

    def fourwise(self, x):
        F = self.F
        return 2*(((self.hash31(self.hash31(self.hash31(x,F[2],F[3]),x,F[4]),x,F[5])) & 32768)>>15)-1

class sketch:
    def __init__(self, width, depth, hf):
        self.width = width
        self.depth = depth
        self.hf = hf
        self.vec = np.zeros((depth, width))

    def update(self, key, freq=1):
        pos = self.hf.hash(key) % self.width
        delta = self.hf.fourwise(key) * freq
        self.vec[range(self.depth), pos] += delta

    def inner(self, other):
        return np.median(np.einsum('ij,ij->i', self.vec, other.vec))


def make_stream(nkeys, length):
    return np.random.randint(nkeys, size=length)


def make_sparse(S):
    return sparse(S)


def sparse_inner(s1, s2):
    return sum(s1[k] * s2[k] for k in s1 if k in s2)


def create_sketch(width, depth, hf, sp):
    sk = sketch(width, depth, hf)
    for x in sp:
        sk.update(x, sp[x])
    return sk


def test_sketch_accuracy():
    width = 1500
    depth = 7

    S1 = make_stream(10000, 10000)
    S2 = make_stream(10000, 10000)

    sp1 = make_sparse(S1)
    sp2 = make_sparse(S2)

    hf = hash_family(depth)
    sk1 = create_sketch(width, depth, hf, sp1)
    sk2 = create_sketch(width, depth, hf, sp2)

    inner_product_true = sparse_inner(sp1, sp2)
    inner_product_estimated = sk1.inner(sk2)

    error = abs((inner_product_true - inner_product_estimated) / inner_product_true)
    accuracy = 4 / np.sqrt(width)
    
    print(f"Accuracy: {accuracy}")
    print(f"Error: {error}, True Inner Product: {inner_product_true}, Estimated Inner Product: {inner_product_estimated}")
    assert error < accuracy, "Accuracy not sufficient"
    


test_sketch_accuracy()

Accuracy: 0.10327955589886445
Error: 0.0086104513064133, True Inner Product: 10104, Estimated Inner Product: 10017.0


In [198]:
def create_sketch_for_vector(width, depth, hf, v):
    sk = sketch(width, depth, hf)
    for i, x in enumerate(v):
        sk.update(i, x)
    return sk


width = 1500
depth = 7

#v = make_stream(10000, 5000)
v = np.random.rand(1000) # rand vector 0..1 

hf = hash_family(depth)
sk = create_sketch_for_vector(width, depth, hf, v)

inner_product_true = np.inner(v, v)
inner_product_estimated = sk.inner(sk)

error = abs((inner_product_true - inner_product_estimated) / inner_product_true)
accuracy = 4 / np.sqrt(width)

print(f"Accuracy: {accuracy}")
print(f"Error: {error}, True Inner Product: {inner_product_true}, Estimated Inner Product: {inner_product_estimated}")
assert error < accuracy, "Accuracy not sufficient"

Accuracy: 0.10327955589886445
Error: 0.006632860733486576, True Inner Product: 319.988024655266, Estimated Inner Product: 317.8655886513441


In [187]:
v

array([8])

In [186]:
sk.vec

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0., -1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,
         0.,  0.]])

In [176]:
v = make_stream(10000, 1)
np.inner(v, v)

54213769

In [179]:
np.inner(v, v)

54213769

In [193]:
random_vector = np.random.rand(10)

In [194]:
random_vector

array([0.44613459, 0.85034146, 0.13618654, 0.11335239, 0.50935281,
       0.66621787, 0.377427  , 0.85301939, 0.66664649, 0.76869197])