## Set Up Dependencies and Data


In [None]:
import random

import more_itertools as mit

import joblib
import pandas as pd
from pylib._make_hamming_distance_matrix import make_hamming_distance_matrix


In [None]:
# Download the dataset (a CSV served from osf.io) into a DataFrame.
data_url = "https://osf.io/mgky2/download"
df = pd.read_csv(data_url)


## Reproducibility


In [None]:
# Load the `watermark` IPython extension, then emit its environment report
# for this run (see the watermark documentation for the meaning of each flag).
%load_ext watermark
%watermark -iwbmuvg -iv


In [None]:
# Preview the first rows of the freshly loaded data.
df.head()


In [None]:
# Print the column listing with non-null counts and dtypes.
df.info()


In [None]:
# Show pandas' default summary statistics for the loaded data.
df.describe()


In [None]:
# Display a joblib hash of the loaded DataFrame, giving a fingerprint of the
# exact input data used by this run.
joblib.hash(df)


## Data Prep


In [None]:
# Parse every bitfield into an arbitrary-precision Python int.
df["bitfield"] = df["bitfield"].apply(int)

# Significant bits per value, rounded up to a whole number of 32-bit words.
bits_per_word = 32
value_bitlengths = df["bitfield"].apply(int.bit_length)
df["bitfield value bitlengths"] = value_bitlengths
wordlengths = (value_bitlengths + (bits_per_word - 1)) // bits_per_word
df["bitfield wordlengths"] = wordlengths

# Sanity check: every row must occupy the same width, exactly three words.
assert mit.one(wordlengths.unique()) == 3

# Full field width in bits, and the width left after dropping 16 bits
# (the same count the following cell masks off as `exclude_leading`).
df["bitfield bitlengths"] = wordlengths * bits_per_word
df["driftbit bitlengths"] = df["bitfield bitlengths"] - 16
df


In [None]:
# Keep only the drift bits: everything below the leading `exclude_leading`
# bits of the (uniform-width) bitfield.
exclude_leading = 16
bitfield_bitlength = int(mit.one(df["bitfield bitlengths"].unique()))
num_driftbits = bitfield_bitlength - exclude_leading

# A run of `num_driftbits` one-bits; AND-ing with it masks off the leading bits.
driftbit_mask = (1 << num_driftbits) - 1
assert driftbit_mask.bit_count() == num_driftbits

df["bitfield driftbits"] = df["bitfield"].values & driftbit_mask

df


In [None]:
# Split the drift bits into two equal-width halves.
driftbit_bitlength = int(mit.one(df["driftbit bitlengths"].unique()))
half_bitlength = driftbit_bitlength // 2

# The divisor must be exactly 2 ** half_bitlength so the remainder keeps the
# low half and the floor quotient keeps the high half (40 bits each for the
# 80-bit drift field). Shifting it any further would move the split point
# below the midpoint and yield uneven halves.
driftbit_quotient = 1 << half_bitlength
df["lower driftbits"] = df["bitfield driftbits"] % driftbit_quotient
df["upper driftbits"] = df["bitfield driftbits"] // driftbit_quotient

df


In [None]:
# Inspect column dtypes now that the derived columns have been added.
df.dtypes


## Bitdrift simulation tree


In [None]:
# Distance matrix over the full drift-bit values -> UPGMA tree -> ASCII plot
# drawn using branch lengths.
full_matrix = make_hamming_distance_matrix(df["bitfield driftbits"])
full_tree = full_matrix.upgma_tree()
print(full_tree.as_ascii_plot(plot_metric="length"))


## Bitdrift simulation tree (lower 40 bits)


In [None]:
# Same pipeline as the full tree, restricted to the `lower driftbits` column.
lower_matrix = make_hamming_distance_matrix(df["lower driftbits"])
lower_tree = lower_matrix.upgma_tree()
print(lower_tree.as_ascii_plot(plot_metric="length"))


## Bitdrift simulation tree (upper 40 bits)


In [None]:
# Same pipeline as the full tree, restricted to the `upper driftbits` column.
upper_matrix = make_hamming_distance_matrix(df["upper driftbits"])
upper_tree = upper_matrix.upgma_tree()
print(upper_tree.as_ascii_plot(plot_metric="length"))


## Random tree


In [None]:
# Tree over 9 uniformly random 80-bit values.
# getrandbits(80) draws from [0, 2**80), so every value fits in 80 bits;
# randint(0, 2**80) has an inclusive upper bound and could return the 81-bit
# value 2**80 itself.
dummy = [random.getrandbits(80) for _ in range(9)]
print(make_hamming_distance_matrix(dummy).upgma_tree().as_ascii_plot())
