# Encoding Comparison Generation

This notebook generates the dataset for a side-by-side evaluation of different bit-to-tree encodings.

In [1]:
import sys
sys.path.append('../')

import numpy as np
import math
import plotly.express as px
import pandas as pd
from func_timeout import func_timeout, FunctionTimedOut
import traceback

from tree_lib.tree import TreeNode, tower
from tree_lib.encodings import two_choices, two_choices_one_two, counting_ones, counting_ones_leaves, fibonaccio, tuples, square, cube, entropy, power_digits, power_digits_2, gauss, cantor2d, simplex, one_child, abe1993
import tree_lib.util as util

#### 1. Define encodings to compare

In [2]:

# Candidate encodings for the comparison. Each entry is a 4-item list:
#   [display name, bits-to-tree function, tree-to-bits function (or None), max bits (or None)]
# Entries that are commented out are excluded from the run.
# NOTE(review): the filters applied later in this notebook keep strings with
# len(s) < max_bits, so the limit currently behaves as an exclusive bound -- confirm
# whether strings of exactly max_bits bits are meant to be included.
ENCODINGS_UNFILTERED = [
    #### Display  | bits to tree | tree to bits | max bits encodable (None if infinite)
    # ["n*(n+1)/2", gauss.bits_to_tree, gauss.tree_to_bits,None],
    # ["two_choices", two_choices.bits_to_tree, two_choices.tree_to_bits,None],
    # ["two_choices_one_two", two_choices_one_two.bits_to_tree, two_choices_one_two.tree_to_bits,None],
    # ["two_choices_wrap", two_choices.bits_to_tree_wrap, None,None],
    # ["two_choices_mark", two_choices.bits_to_tree_mark, None,None],
    # ["two_choices_k=9", lambda s: two_choices.bits_to_tree_k(s, 9), None,None],
    # ["counting_ones_leaves", counting_ones_leaves.bits_to_tree, counting_ones_leaves.tree_to_bits,None],
    # ["fibonaccio", fibonaccio.bits_to_tree, fibonaccio.tree_to_bits,None],
    # ["power_digits", power_digits.bits_to_tree, power_digits.tree_to_bits,None],
    # ["power_digits_2", power_digits_2.bits_to_tree, power_digits_2.tree_to_bits,None],
    # ["tuples", tuples.bits_to_tree, tuples.tree_to_bits,None],
    ["Y.Abe, 1994", abe1993.bits_to_tree, abe1993.tree_to_bits,31],
    ["Squares", square.bits_to_tree, square.tree_to_bits,None],
    ["Cubes", cube.bits_to_tree, cube.tree_to_bits,None],
    # ["entropy", entropy.bits_to_tree, entropy.tree_to_bits,None],
    # ["cantor2d", cantor2d.bits_to_tree, cantor2d.tree_to_bits,None],
    # ["cantor2d_snake", lambda s: cantor2d.bits_to_tree(s, snake=True), lambda t: cantor2d.tree_to_bits(t, snake=True),None],
    # ["simplex2d", lambda s: simplex.bits_to_tree(s, 2), lambda t: simplex.tree_to_bits(t, 2),None],
    # ["simplex3d", lambda s: simplex.bits_to_tree(s, 3), lambda t: simplex.tree_to_bits(t, 3),None],
    # ["one_child", one_child.bits_to_tree, one_child.tree_to_bits,None],
    # ["simplex4d", lambda s: simplex.bits_to_tree(s, 4), lambda t: simplex.tree_to_bits(t, 4),None],
    # ["simplex5d", lambda s: simplex.bits_to_tree(s, 5), lambda t: simplex.tree_to_bits(t, 5),None],
    # ["simplex10d", lambda s: simplex.bits_to_tree(s, 10), lambda t: simplex.tree_to_bits(t, 10),None],
    ]

#### 2. Check for correctness

In [3]:


from func_timeout import func_timeout, FunctionTimedOut

# Hand-picked inputs for the quick sanity check.
BASIC_STRINGS = (
    "0 1 01 10 11 100 101 110 111 "              # basic
    "010101010101 0000000000 1111111111"         # something more complex
).split()

# One generated bit string per argument value 1..399.
LONG_STRINGS = list(map(util.gen_bit_string, range(1, 400)))

def string_test(strings, name, btt, ttb, max_bits, shuffle=False):
    """Run a correctness check of one encoding over a list of bit strings.

    Args:
        strings: bit strings to test.
        name: display name of the encoding (used only in the log line).
        btt: bits-to-tree function.
        ttb: tree-to-bits function, or None when the encoding has no decoder.
        max_bits: length limit of the encoding, or None for no limit. Only
            strings with ``len(s) < max_bits`` are tested.
        shuffle: when True, ``util.shuffle_tree`` is applied to each non-None
            tree before it is decoded.

    Returns:
        A short status string:
        - "✅" when every string survives the encode/decode round trip,
        - "🐖 <string>" with the first string that did not round-trip,
        - "🤷" when ``ttb`` is None and all strings could be encoded,
        - "⏰" when the run exceeded the 10 second budget,
        - "🧨 <error>" when an exception was raised (traceback is printed).
    """
    def encode_decode(s):
        t = btt(s)
        if t is not None and shuffle:
            util.shuffle_tree(t)
        return ttb(t)

    # Filter out strings for encodings that have limited capabilities
    if max_bits is not None:
        orig_strings_len = len(strings)
        strings = [s for s in strings if len(s) < max_bits]
        # The message states the bound that the filter above actually applies.
        print(f"{name}: Keeping only {len(strings)}/{orig_strings_len} strings -- length < {max_bits}")

    try:
        if ttb is None:
            # No decoder available: only check that encoding succeeds. This runs
            # under the same time budget as the round-trip check so a slow
            # encoder cannot stall the notebook.
            func_timeout(10., lambda: [btt(s) for s in strings])
            return "🤷️️️️️️"
        errors = func_timeout(10., lambda: [s for s in strings if encode_decode(s) != s])
        return "✅" if len(errors) == 0 else f"🐖️️️️️️ {errors[0]}"
    except FunctionTimedOut:
        return "⏰"
    except Exception as e:
        print(traceback.format_exc())
        return f"🧨️️️️️️ {e}"

# For every candidate encoding run three checks: the basic strings, the long
# strings, and both sets together with shuffling enabled.
results = [
    [
        label,
        string_test(BASIC_STRINGS, label, encode, decode, bit_limit),
        string_test(LONG_STRINGS, label, encode, decode, bit_limit),
        string_test(BASIC_STRINGS + LONG_STRINGS, label, encode, decode, bit_limit, shuffle=True),
    ]
    for label, encode, decode, bit_limit in ENCODINGS_UNFILTERED
]

# One row per encoding, one column per check.
df = pd.DataFrame(data=results, columns=['Name', 'Basic Test', "Long Strings", "Unordered"])

# Discard encodings that do not work
def is_acceptable_result(test_basic,test_long,test_unordered):
    """Return True when both the basic and the long-string outcome are passing.

    An outcome counts as passing when it contains the success mark or the
    "no decoder" mark. ``test_unordered`` is accepted but not inspected.
    """
    passing_marks = ("✅", "🤷️️️️️")

    def passed(outcome):
        return any(mark in outcome for mark in passing_marks)

    return passed(test_basic) and passed(test_long)

# Names of the encodings whose outcomes were not acceptable.
to_discard = [row[0] for row in results if not is_acceptable_result(*row[1:])]
if len(to_discard) > 0:
    print(f"\nDiscarding {to_discard}")

# Keep every encoding whose name was not discarded.
ENCODINGS = [encoding for encoding in ENCODINGS_UNFILTERED if encoding[0] not in to_discard]

df

Y.Abe, 1994: Keeping only 12/12 strings -- length <= 31
Y.Abe, 1994: Keeping only 30/399 strings -- length <= 31
Y.Abe, 1994: Keeping only 42/411 strings -- length <= 31


Unnamed: 0,Name,Basic Test,Long Strings,Unordered
0,"Y.Abe, 1994",✅,✅,✅
1,Squares,✅,✅,✅
2,Cubes,✅,✅,✅


#### 3. Generate dataset


In [4]:
# Dataset parameters
BIT_STRING_LENGTHS = list(range(10, 101, 10))      # 10, 20, ..., 100
PROBABILITY_OF_ONE = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
AUTOCORRELATION = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
SAMPLES_PER_STRING_TYPE = 700
SAMPLES_PER_STRING_TYPE_AC = 300

# Step 1: generic random strings, one batch per (length, probability of one).
dataset_generic = []
dataset_generic.extend(
    util.gen_bit_string(n_bits, p_one)
    for n_bits in BIT_STRING_LENGTHS
    for p_one in PROBABILITY_OF_ONE
    for _ in range(SAMPLES_PER_STRING_TYPE)
)
print(len(dataset_generic))

# Step 2: edge cases (all zeros, all ones), one string per length.
dataset_generic.extend(
    util.gen_bit_string(n_bits, p_one)
    for n_bits in BIT_STRING_LENGTHS
    for p_one in (0.0, 1.0)
)
print(len(dataset_generic))

# Step 3: strings generated with a given autocorrelation parameter.
dataset_generic.extend(
    util.gen_bit_string_with_autocorrelation(n_bits, ac_level)
    for n_bits in BIT_STRING_LENGTHS
    for ac_level in AUTOCORRELATION
    for _ in range(SAMPLES_PER_STRING_TYPE_AC)
)
print(len(dataset_generic))

def dataset_for_encoding (encoding_name, bit_to_tree_fun, max_bits):
    """Build the data points of one encoding over the module-level ``dataset_generic``.

    Args:
        encoding_name: label stored in the first field of every data point.
        bit_to_tree_fun: function turning a bit string into a tree.
        max_bits: length limit of the encoding, or None for no limit. Only
            strings with ``len(s) < max_bits`` are used.

    Returns:
        A list with one entry per retained bit string:
        [encoding name, string length, fraction of ones, autocorrelation,
         tree.n_descendants / string length, tree.get_total_footprint()].
    """
    filtered_dataset = dataset_generic
    if max_bits is not None:
        orig_strings_len = len(filtered_dataset)
        filtered_dataset = [s for s in filtered_dataset if len(s) < max_bits]
        # The message states the bound that the filter above actually applies.
        print(f"{encoding_name}: Keeping only {len(filtered_dataset)}/{orig_strings_len} strings -- length < {max_bits}")

    def data_point(bit_str):
        # Encode one bit string and collect its measurements.
        tree = bit_to_tree_fun(bit_str)
        str_len = len(bit_str)
        perc_of_ones = bit_str.count('1') / str_len
        autocorrelation = util.autocorrelation(bit_str)
        return [encoding_name, str_len, perc_of_ones, autocorrelation, tree.n_descendants/str_len, tree.get_total_footprint()]

    return [data_point(bit_str) for bit_str in filtered_dataset]

# Collect the data points of every encoding that passed the checks.
dataset = []
for encoding_name, btt, _, max_bits in ENCODINGS:
    dataset.extend(dataset_for_encoding(encoding_name, btt, max_bits))

df = pd.DataFrame(
    data=dataset,
    columns=['Encoding', 'Bits', 'PercOfOnes', 'Autocorrelation', 'Nodes/Bit', 'Footprint'],
)
df


63000
63020
90020
Y.Abe, 1994: Keeping only 27006/90020 strings -- length <= 31


Unnamed: 0,Encoding,Bits,PercOfOnes,Autocorrelation,Nodes/Bit,Footprint
0,"Y.Abe, 1994",10,0.10,0.80,1.10,28
1,"Y.Abe, 1994",10,0.00,1.00,1.10,21
2,"Y.Abe, 1994",10,0.10,0.90,1.20,24
3,"Y.Abe, 1994",10,0.00,1.00,1.10,21
4,"Y.Abe, 1994",10,0.10,0.90,1.20,37
...,...,...,...,...,...,...
207041,Cubes,100,0.72,0.92,1.63,620
207042,Cubes,100,0.79,0.93,1.62,619
207043,Cubes,100,0.42,0.89,1.86,704
207044,Cubes,100,0.43,0.92,1.99,746


#### 4. Save to CSV

In [5]:
# Write the generated dataset to a CSV file, using a path relative to the
# current working directory. The DataFrame index is written as the first
# column (pandas default for to_csv).
df.to_csv("encoding_comparison.csv")