## Set Up Dependencies and Data


In [1]:
import random

import more_itertools as mit

import joblib
import pandas as pd
from pylib._make_hamming_distance_matrix import make_hamming_distance_matrix


In [2]:
df = pd.read_csv("https://osf.io/mgky2/download")


## Reproducibility


In [3]:
%load_ext watermark
%watermark -iwbmuvg -iv


Last updated: 2024-10-06T19:29:49.937717+00:00

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.20.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1014-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

Git hash: 632aac23ecbd6f15f354b00da8a9a3552b367e45

Git branch: HEAD

pandas        : 1.5.3
joblib        : 1.3.2
more_itertools: 10.2.0

Watermark: 2.4.3



In [4]:
df.head()


Unnamed: 0,bitfield
0,4124697448505068214874120741
1,51785361206533278547496676558
2,78708697932479583268160285174
3,43738926030594400371889261476
4,28264638053419403218638195684


In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bitfield  9 non-null      object
dtypes: object(1)
memory usage: 204.0+ bytes


In [6]:
df.describe()


Unnamed: 0,bitfield
count,9
unique,9
top,4124697448505068214874120741
freq,1


In [7]:
joblib.hash(df)


'898222e7af51ac0b533ce339b8929eff'

## Data Prep


In [8]:
df["bitfield"] = df["bitfield"].apply(int)
df["bitfield value bitlengths"] = df["bitfield"].apply(int.bit_length)
df["bitfield wordlengths"] = (df["bitfield value bitlengths"] + 31) // 32
assert mit.one(df["bitfield wordlengths"].unique()) == 3
df["bitfield bitlengths"] = df["bitfield wordlengths"] * 32
df["driftbit bitlengths"] = df["bitfield bitlengths"] - 16
df


Unnamed: 0,bitfield,bitfield value bitlengths,bitfield wordlengths,bitfield bitlengths,driftbit bitlengths
0,4124697448505068214874120741,92,3,96,80
1,51785361206533278547496676558,96,3,96,80
2,78708697932479583268160285174,96,3,96,80
3,43738926030594400371889261476,96,3,96,80
4,28264638053419403218638195684,95,3,96,80
5,18979682958906860473440177007,94,3,96,80
6,54260633048023384810208253716,96,3,96,80
7,30740298850804297830921279380,95,3,96,80
8,11551323908690312484395030486,94,3,96,80


In [9]:
exclude_leading = 16
bitfield_bitlength = int(mit.one(df["bitfield bitlengths"].unique()))
driftbit_mask = (  # mask off leading 16 bit
    1 << (bitfield_bitlength - exclude_leading)
) - 1
assert driftbit_mask.bit_count() == bitfield_bitlength - exclude_leading
df["bitfield driftbits"] = df["bitfield"].values & driftbit_mask

df


Unnamed: 0,bitfield,bitfield value bitlengths,bitfield wordlengths,bitfield bitlengths,driftbit bitlengths,bitfield driftbits
0,4124697448505068214874120741,92,3,96,80,1051477799568099951354405
1,51785361206533278547496676558,96,3,96,80,1023723340637848957627598
2,78708697932479583268160285174,96,3,96,80,373520649536219739990518
3,43738926030594400371889261476,96,3,96,80,1198802756731460194519972
4,28264638053419403218638195684,95,3,96,80,1161316648987743182506980
5,18979682958906860473440177007,94,3,96,80,756516776797059727919983
6,54260633048023384810208253716,96,3,96,80,415486259983561870956308
7,30740298850804297830921279380,95,3,96,80,942035463121805667342228
8,11551323908690312484395030486,94,3,96,80,37702272530720077518806


In [10]:
driftbit_bitlength = int(mit.one(df["driftbit bitlengths"].unique()))
driftbit_quotient = (1 << (driftbit_bitlength // 2)) >> 1
df["lower driftbits"] = df["bitfield driftbits"] % driftbit_quotient
df["upper driftbits"] = df["bitfield driftbits"] // driftbit_quotient

df


Unnamed: 0,bitfield,bitfield value bitlengths,bitfield wordlengths,bitfield bitlengths,driftbit bitlengths,bitfield driftbits,lower driftbits,upper driftbits
0,4124697448505068214874120741,92,3,96,80,1051477799568099951354405,48243515941,1912626975478
1,51785361206533278547496676558,96,3,96,80,1023723340637848957627598,446692863182,1862141908782
2,78708697932479583268160285174,96,3,96,80,373520649536219739990518,521848629750,679430103511
3,43738926030594400371889261476,96,3,96,80,1198802756731460194519972,47970896804,2180609511436
4,28264638053419403218638195684,95,3,96,80,1161316648987743182506980,48235138020,2112422678670
5,18979682958906860473440177007,94,3,96,80,756516776797059727919983,284503214959,1376096000598
6,54260633048023384810208253716,96,3,96,80,415486259983561870956308,548581037844,755765104228
7,30740298850804297830921279380,95,3,96,80,942035463121805667342228,537642014612,1713552525182
8,11551323908690312484395030486,94,3,96,80,37702272530720077518806,537574643670,68580034222


In [11]:
df.dtypes


bitfield                     object
bitfield value bitlengths     int64
bitfield wordlengths          int64
bitfield bitlengths           int64
driftbit bitlengths           int64
bitfield driftbits           object
lower driftbits              object
upper driftbits              object
dtype: object

## Bitdrift simulation tree


In [12]:
print(
    make_hamming_distance_matrix(df["bitfield driftbits"])
    .upgma_tree()
    .as_ascii_plot(plot_metric="length")
)


     /----------------------------------------------------------------------- 5
/----+                                                                         
|    |                  /---------------------------------------------------- 0
|    \------------------+                                                      
|                       |                                /------------------- 4
+                       \--------------------------------+                     
|                                                        \------------------- 3
|                                                                              
|---------------------------------------------------------------------------- 1
+                                                                              
|        /------------------------------------------------------------------- 2
\--------+                                                                     
         |                           /--

## Bitdrift simulation tree (first 40 bits)


In [13]:
print(
    make_hamming_distance_matrix(df["lower driftbits"])
    .upgma_tree()
    .as_ascii_plot(plot_metric="length")
)


                                                /---------------------------- 6
             /----------------------------------+                              
             |                                  |             /-------------- 7
             |                                  \-------------+                
/------------+                                                \-------------- 8
|            |                                                                 
|            |                       /--------------------------------------- 2
+            \-----------------------+                                         
|                                    \--------------------------------------- 1
|                                                                              
|          /----------------------------------------------------------------- 5
\----------+                                                                   
           |                            

## Bitdrift simulation tree (last 40 bits)


In [14]:
print(
    make_hamming_distance_matrix(df["upper driftbits"])
    .upgma_tree()
    .as_ascii_plot(plot_metric="length")
)


    /------------------------------------------------------------------------ 5
/---+                                                                          
|   |             /---------------------------------------------------------- 0
|   \-------------+                                                            
|                 \---------------------------------------------------------- 2
+                                                                              
|                          /------------------------------------------------- 6
|  /-----------------------+                                                   
|  |                       |             /----------------------------------- 7
|  |                       \-------------+                                     
\--+                                     \----------------------------------- 8
   |                                                                           
   |        /---------------------------

## Random tree


In [15]:
dummy = [random.randint(0, 2**80) for _ in range(9)]
print(make_hamming_distance_matrix(dummy).upgma_tree().as_ascii_plot())


/---------------------------------------------------------------------------- 0
|                                                                              
|                                             /------------------------------ 1
+              /------------------------------+                                
|              |                              |              /--------------- 3
|              |                              \--------------+                 
|              |                                             \--------------- 6
\--------------+                                                               
               |                                             /--------------- 2
               |              /------------------------------+                 
               |              |                              \--------------- 7
               \--------------+                                                
                              |         