In [None]:
# Personally I had to add the root folder of the repo to the sys.path.  If certain imports do not work you should uncomment and set the following.
# import sys
# sys.path.append('/root/of/repo/folder/')

# Hash Experiment

In this experiment we evaluate what kind of uniformity we can expect when using the built-in SipHash in Python.  To use the hash function as a load balancer, we require it to distribute our requests uniformly across the different nodes.

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from palettable.colorbrewer.sequential import YlGnBu_9
# Plot palette: the YlGnBu_9 matplotlib colours in reversed order.
# NOTE: this rebinds the name `colors`, shadowing the `matplotlib.colors`
# module imported above; from here on `colors` is a plain list.
colors = list(reversed(YlGnBu_9.mpl_colors))

import experiments.plotter.neat_plotter
from random import random

In [None]:
from experiments.utils import read_resource_map

# Relative path of the CSV holding the per-resource statistics.
resource_file = "../dataset/out/dataset-resources-stats.csv"

# Mapping keyed by resource identifier (the values are presumably resource
# sizes in bytes -- confirm against `read_resource_map`).
resource_map = read_resource_map(resource_file)

# Snapshot of the keys as a list, used as the population for sampling below.
identifiers = [*resource_map.keys()]

In [None]:
def take_fraction_of_random_subset(s, fraction=0.5):
    """Sample ``s``, keeping each element independently with probability ``fraction``.

    Every element gets its own ``random()`` draw, so the size of the result
    is only approximately ``fraction * len(s)``.

    Args:
        s: Iterable of elements to sample from.
        fraction: Probability in ``[0, 1]`` of keeping an element.

    Returns:
        A list with the kept elements, in their original order.
    """
    def keep(_element):
        # The element itself is irrelevant; only the random draw decides.
        return random() < fraction

    return take_random_subset(s, keep)

def take_random_subset(s, condition=lambda x: random() < 0.5):
    """Return the elements of ``s`` for which ``condition`` is truthy.

    Args:
        s: Iterable of elements to filter.
        condition: Callable taking one element and returning a truthy value
            when the element should be kept.  The default ignores the
            element and keeps it when a fresh ``random()`` draw is below 0.5.

    Returns:
        A list with the kept elements, in their original order.
    """
    return [element for element in s if condition(element)]

def hash_set(s):
    """Return the built-in hash of every element of ``s``, in iteration order.

    Uses the ``hash()`` built-in rather than calling ``__hash__`` directly,
    which is the supported way to obtain an object's hash.

    Note: ``str`` hashes are salted per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so the values are only comparable within a
    single run.

    Args:
        s: Iterable of hashable elements.

    Returns:
        A list of integer hash values, one per element.

    Raises:
        TypeError: If an element is not hashable.
    """
    return [hash(element) for element in s]

def select_bin(identifier, no_bins=14):
    """Map ``identifier`` to a bin index using its built-in hash.

    Uses the ``hash()`` built-in rather than calling ``__hash__`` directly.
    For a positive ``no_bins`` the result always lies in ``range(no_bins)``,
    because Python's ``%`` takes the sign of the divisor.

    Args:
        identifier: Any hashable object.
        no_bins: Number of bins to distribute over.

    Returns:
        The integer bin index ``hash(identifier) % no_bins``.

    Raises:
        TypeError: If ``identifier`` is not hashable.
        ZeroDivisionError: If ``no_bins`` is 0.
    """
    return hash(identifier) % no_bins

## Collisions
First we see if we can find any collisions between the hashes for our content.

In [None]:
# Group every identifier under its hash value.  A hash whose group holds
# more than one identifier is a collision.
#
# Fix: identifiers used to be added to their group only the first time a
# hash was seen, so no group could ever grow beyond one member and a
# collision could never be reported.
hashes = {}
print("Testing for collisions...")
for identifier in identifiers:
    hashes.setdefault(hash(identifier), set()).add(identifier)

# Report each colliding hash once, together with all identifiers sharing it.
for h, colliding in hashes.items():
    if len(colliding) > 1:
        print(f"{h}: {colliding}")
print("Done")

## Distribution
We then generate a number of subsets of increasing size and apply the hash function to each of them.  By including subsets that cover only a small fraction of the identifiers, we can evaluate whether smaller subsets have a less uniform distribution.

In [None]:
# Number of random subsets to draw and number of bins used in the plots.
no_subsets = 8
no_bins = 14

# Subset k keeps each identifier with probability (k + 1) / no_subsets, so
# the sampling probability grows from 1/no_subsets up to 1.
subsets = [
    take_fraction_of_random_subset(identifiers, fraction=(k + 1)/no_subsets)
    for k in range(no_subsets)
]

# Hash values of every subset, in the same order as `subsets`.
hashed_subsets = [hash_set(members) for members in subsets]

We then plot these subsets in bins using a histogram.

In [None]:
# A single wide axes holds all overlaid histograms; dashed vertical grid
# lines are drawn at both the major and the minor x ticks.
fig, axs = plt.subplots(1, 1, tight_layout=True, figsize=(14,4))
axs.xaxis.grid(which='both', linestyle='dashed')

# Walk the subsets in reverse of their creation order, one colour per subset.
for idx, s in enumerate(hashed_subsets[::-1]):
    subset_label = f"{(no_subsets - idx)/no_subsets * 100}%"
    _, bins, _ = axs.hist(s, bins=no_bins, label=subset_label, color=colors[idx])

# `bins` holds the edges returned by the last histogram call: every edge gets
# an unlabelled minor tick, while only the outer edges and zero are labelled.
axs.set_xticks(bins, minor=True)
axs.set_xticklabels(["" for _ in bins], minor=True)
axs.set_xticks([bins[0], 0, bins[-1]])

# Show the counts on the y axis as a percentage of the number of distinct hashes.
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(hashes)))

axs.set_title("Distribution of hashed item identifiers.")
axs.set_ylabel("Percentage of items in bin")
axs.legend(title="Subset Size", loc='lower right')
axs.set_xlabel("Hashed Value")

## Hash Byte Distribution

We then look at the number of bytes assigned to each bin, rather than the number of items, to see whether this also follows a uniform distribution.  The hash function does not guarantee this, so we expect some differences between bins.

In [None]:
# For every subset, accumulate the `resource_map` value of each resource in
# the bin selected by the resource's hash (values are presumably sizes in
# bytes -- confirm against the dataset).
subset_bytes = []

for subset in subsets:
    byte_distribution = [0 for _ in range(no_bins)]
    for resource in subset:
        bin_index = select_bin(resource, no_bins)
        # Resources missing from the map contribute nothing.
        byte_distribution[bin_index] += resource_map.get(resource, 0)
    subset_bytes.append(byte_distribution)

In [None]:
# High-resolution single axes for the overlaid per-bin bar charts.
fig, axs = plt.subplots(1, 1, tight_layout=True, figsize=(14,4), dpi=300)
# axs.xaxis.grid(which='both', linestyle='dashed')

# Walk the subsets in reverse of their creation order, one colour per subset.
for idx, s in enumerate(subset_bytes[::-1]):
    subset_label = f"{(no_subsets - idx)/no_subsets * 100}%"
    bars = axs.bar(range(no_bins), s, width=1, label=subset_label, color=colors[idx])
    # A thin light outline keeps adjacent full-width bars visually separate.
    for bar in bars:
        bar.set(edgecolor='#EEEEEE', linewidth=1)

# axs.set_xticks(bins, minor=False)
# axs.set_xticklabels([""] * len(bins), minor=True)
# One major tick per bin index.
axs.set_xticks(range(no_bins))

# axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(hashes)))

axs.set_title("Distribution of bytes by hashed identifiers.")
axs.set_ylabel("Bytes assigned to bin")
axs.legend(title="Subset Size", loc='lower right')
axs.set_xlabel("Bin Identifier")