# Encoding comparison

This notebook contains a side-by-side evaluation of different bit-to-tree encodings.

In [1]:
import sys
sys.path.append('../')
sys.setrecursionlimit(10000) # Croissant

import numpy as np
import math
import plotly.express as px
import pandas as pd
from func_timeout import func_timeout, FunctionTimedOut
import traceback

from tree_lib.tree import TreeNode
from tree_lib.encodings import two_choices, two_choices_one_two, counting_ones, counting_ones_leaves, fibonaccio, tuples, square, cube, entropy, power_digits, power_digits_2
import tree_lib.util as util

#### 1. Define encodings to compare

In [2]:
# Registry of candidate encodings. Each entry is
#   [display name, bits_to_tree function, tree_to_bits function or None]
# A None decoder marks an encode-only variant: the correctness check then only
# verifies that encoding runs. Commented-out entries are excluded from the run.
ENCODINGS_UNFILTERED = [
    ["two_choices", two_choices.bits_to_tree, two_choices.tree_to_bits],
    # ["two_choices_one_two", two_choices_one_two.bits_to_tree, two_choices_one_two.tree_to_bits],
    ["two_choices_wrap", two_choices.bits_to_tree_wrap, None],
    ["two_choices_mark", two_choices.bits_to_tree_mark, None],
    # ["two_choices_k=9", lambda s: two_choices.bits_to_tree_k(s, 9), None],
    # ["counting_ones_leaves", counting_ones_leaves.bits_to_tree, counting_ones_leaves.tree_to_bits],
    # ["fibonaccio", fibonaccio.bits_to_tree, fibonaccio.tree_to_bits],
    # ["power_digits", power_digits.bits_to_tree, power_digits.tree_to_bits],
    # ["power_digits_2", power_digits_2.bits_to_tree, power_digits_2.tree_to_bits],
    ["tuples", tuples.bits_to_tree, tuples.tree_to_bits],
    ["square", square.bits_to_tree, square.tree_to_bits],
    ["cube", cube.bits_to_tree, cube.tree_to_bits],
    ["entropy", entropy.bits_to_tree, entropy.tree_to_bits],
    ]

#### 2. Check for correctness

In [3]:

from func_timeout import func_timeout, FunctionTimedOut

# Hand-picked short inputs: the smallest bit strings plus a few regular patterns.
BASIC_STRINGS = [
    "0","1","01","10","11", "100", "101", "110", "111",     # basic
    "010101010101", "0000000000", "1111111111",             # something more complex
]
# 300 strings, each produced by util.gen_bit_string(300).
# NOTE(review): presumably random 300-bit strings; no seed is set in this
# notebook, so the exact strings may differ between runs -- confirm against util.
LONG_STRINGS = [util.gen_bit_string(300) for _ in range(300)]

def string_test(strings, name, btt, ttb, shuffle=False, timeout=5.):
    """Check an encoding against a collection of bit strings.

    Args:
        strings: bit strings to test.
        name: encoding name. Kept for call-site compatibility; not used here.
        btt: bits-to-tree function.
        ttb: tree-to-bits function, or None for an encode-only encoding.
        shuffle: when True, every non-None tree is passed through
            util.shuffle_tree before being decoded.
        timeout: seconds granted to the full encode/decode pass over
            ``strings`` (defaults to the previously hard-coded 5 seconds).

    Returns:
        A status string starting with an emoji:
        shrug      - no decoder available, encoding ran without raising;
        check mark - every string round-tripped;
        pig        - followed by the first string that did not round-trip;
        clock      - the round-trip pass exceeded ``timeout``;
        firecracker - followed by the message of an unexpected exception
                      (its traceback is printed as well).
    """
    def encode_decode(s):
        t = btt(s)
        if t is not None and shuffle:
            util.shuffle_tree(t)
        return ttb(t)

    try:
        if ttb is None:
            # No decoder: only make sure encoding does not raise.
            for s in strings:
                btt(s)
            return "🤷️️️️️️"

        errors = func_timeout(timeout, lambda: [s for s in strings if encode_decode(s) != s])
        return "✅" if not errors else f"🐖️️️️️️ {errors[0]}"
    except FunctionTimedOut:
        return "⏰"
    except Exception as e:
        print(traceback.format_exc())
        return f"🧨️️️️️️ {e}"

# One row per encoding: [name, basic check, long-string check, shuffled-children check].
results = [
    [
        name,
        string_test(BASIC_STRINGS, name, btt, ttb),
        string_test(LONG_STRINGS, name, btt, ttb),
        string_test(BASIC_STRINGS + LONG_STRINGS, name, btt, ttb, shuffle=True),
    ]
    for name, btt, ttb in ENCODINGS_UNFILTERED
]

df = pd.DataFrame(
    results,
    columns=['Name', 'Basic Test', "Long Strings", "Unordered"],
)

# Discard encodings that do not work
def is_acceptable_result(test_basic, test_long, test_unordered):
    """Decide whether an encoding is kept for the rest of the notebook.

    An encoding is acceptable when both the basic and the long-string status
    contain a pass marker (check mark) or an encode-only marker (shrug).

    Args:
        test_basic: status string of the basic test.
        test_long: status string of the long-strings test.
        test_unordered: status string of the shuffled-children test. It is
            accepted for signature symmetry but does not influence the result.

    Returns:
        True if the encoding should be kept, False otherwise.
    """
    # Match on the base code points only. The status strings carry trailing
    # invisible variation selectors (U+FE0F); comparing against a literal with
    # an exact number of them is easy to break when either side is edited.
    ok_markers = ("\u2705", "\U0001F937")  # check mark, shrug

    def passed(status):
        return any(marker in status for marker in ok_markers)

    return passed(test_basic) and passed(test_long)

# Names of the encodings whose result row is not acceptable.
to_discard = [row[0] for row in results if not is_acceptable_result(*row[1:])]
if len(to_discard) > 0:
    print(f"\nDiscarding {to_discard}")

# Surviving encodings, in registry order; later cells iterate over these.
ENCODINGS = [entry for entry in ENCODINGS_UNFILTERED if entry[0] not in to_discard]

df



Discarding ['power_digits']


Unnamed: 0,Name,Basic Test,Long Strings,Unordered
0,two_choices,✅,✅,🐖️️️️️️ 0
1,two_choices_wrap,🤷️️️️️️,🤷️️️️️️,🤷️️️️️️
2,two_choices_mark,🤷️️️️️️,🤷️️️️️️,🤷️️️️️️
3,power_digits,✅,⏰,⏰
4,power_digits_2,✅,✅,✅
5,tuples,✅,✅,✅
6,square,✅,✅,✅
7,cube,✅,✅,✅
8,entropy,✅,✅,✅


#### 3. Generate dataset


In [4]:
# Parameter grid for the benchmark inputs.
BIT_STRING_LENGTHS = [100, 300, 500, 700, 900, 1100]
# Second argument passed to util.gen_bit_string for the generic strings.
PROBABILITY_OF_ONE = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# Second argument passed to util.gen_bit_string_with_autocorrelation.
AUTOCORRELATION = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# Number of strings generated per (length, probability) combination.
SAMPLES_PER_STRING_TYPE = 10
# Number of strings generated per (length, autocorrelation) combination.
SAMPLES_PER_STRING_TYPE_AC = 3

# Build the generic data set: 6 lengths x 9 probabilities x 10 samples = 540 strings.
# NOTE(review): no seed is set in this notebook, so the strings presumably differ
# between runs -- confirm how util generates them.
dataset_generic = [util.gen_bit_string(length, prob_of_one) 
                        for length in BIT_STRING_LENGTHS 
                        for prob_of_one in PROBABILITY_OF_ONE
                        for _ in range(SAMPLES_PER_STRING_TYPE)]

# Inject some edge cases (all zeros, all ones): one string per length and extreme
# probability, 12 strings in total.
dataset_generic += [util.gen_bit_string(length, prob_of_one) 
                        for length in BIT_STRING_LENGTHS 
                        for prob_of_one in [0., 1.]]    


# Add strings generated for a target autocorrelation:
# 6 lengths x 9 values x 3 samples = 162 strings (714 strings overall).
dataset_generic += [util.gen_bit_string_with_autocorrelation(length, ac) 
                        for length in BIT_STRING_LENGTHS 
                        for ac in AUTOCORRELATION
                        for _ in range(SAMPLES_PER_STRING_TYPE_AC)]              

def dataset_for_encoding(encoding_name, bit_to_tree_fun):
    """Encode every string of ``dataset_generic`` and describe the outcome.

    Args:
        encoding_name: label stored in the first column of every row.
        bit_to_tree_fun: function turning a bit string into a tree.

    Returns:
        A list with one row per input string:
        [encoding name, number of bits, fraction of '1' characters,
         util.autocorrelation of the string, tree.n_descendants, tree.weight].
    """
    rows = []
    for bits in dataset_generic:
        encoded = bit_to_tree_fun(bits)
        n_bits = len(bits)
        rows.append([
            encoding_name,
            n_bits,
            bits.count('1') / n_bits,
            util.autocorrelation(bits),
            encoded.n_descendants,
            encoded.weight,
        ])
    return rows

# Collect the rows of all surviving encodings into one flat list.
dataset = []
for encoding_name, btt, _ in ENCODINGS:
    dataset.extend(dataset_for_encoding(encoding_name, btt))

df = pd.DataFrame(
    dataset,
    columns=['Encoding', 'Bits', 'PercOfOnes', 'Autocorrelation', 'Nodes', 'Weight'],
)
df


Unnamed: 0,Encoding,Bits,PercOfOnes,Autocorrelation,Nodes,Weight
0,two_choices,100,0.100000,0.820000,98,1179
1,two_choices,100,0.060000,0.900000,104,1329
2,two_choices,100,0.070000,0.880000,100,1276
3,two_choices,100,0.060000,0.880000,101,1277
4,two_choices,100,0.130000,0.790000,98,1089
...,...,...,...,...,...,...
5707,entropy,1100,0.456364,0.800000,1603,607153
5708,entropy,1100,0.465455,0.776364,1613,607163
5709,entropy,1100,0.545455,0.902727,1701,607251
5710,entropy,1100,0.540000,0.884545,1695,607245


#### 4. Reports

In [5]:
# Node count per input length, one box per encoding.
fig = px.box(df, x="Bits", y="Nodes", color="Encoding", 
             title="Bits vs Number Nodes, Entire Dataset")
fig.show()

# Tree weight per input length, one box per encoding.
# The title names the plotted axes (Bits on x, Weight on y).
fig = px.box(df, x="Bits", y="Weight", color="Encoding", 
             title="Bits vs Weight, Entire Dataset")
fig.show()

# Restrict to a single input length so the remaining plots isolate the
# effect of the string's content rather than its size.
df_500 = df[df['Bits'] == 500] 
fig = px.scatter(df_500, x="PercOfOnes", y="Nodes", color="Encoding", 
             title="Percentage of Ones vs Num Nodes, 500 Bits")
fig.show()

fig = px.scatter(df_500, x="Autocorrelation", y="Nodes", color="Encoding", 
             title="Autocorrelation vs Num Nodes, 500 Bits")
fig.show()


#### [EXPERIMENTAL] Check optimality

In [6]:
GEN_TREE_MAX_NODES = 20

# One bookkeeping record per surviving encoding; try_decode updates them in place.
decode_data = []
for encoding_name, _, ttb in ENCODINGS:
    decode_data.append(dict(
        encoding_name=encoding_name,
        decode_fun=ttb,           # None for encodings registered without a decoder
        decoded_strings=set(),    # distinct decoder outputs seen so far
        failed_decodings=0,
        duplicate_decodings=0,
        attempted_decodings=0,
    ))

def try_decode(root, decode_data):
    """Run every registered decoder on ``root`` and update its counters in place.

    Args:
        root: tree handed to each decoder.
        decode_data: list of record dicts with the keys "decode_fun",
            "decoded_strings", "failed_decodings", "duplicate_decodings"
            and "attempted_decodings".

    For each record, "attempted_decodings" is incremented. If decoding (or
    storing the result) raises an ordinary exception, "failed_decodings" is
    incremented; this includes records whose "decode_fun" is None. Otherwise
    the result is added to "decoded_strings", and "duplicate_decodings" is
    incremented when it was already present.
    """
    for data in decode_data:
        data["attempted_decodings"] += 1
        try:
            decoded_string = data["decode_fun"](root)

            if decoded_string in data["decoded_strings"]:
                data["duplicate_decodings"] += 1
            data["decoded_strings"].add(decoded_string)
        except Exception:
            # Only ordinary errors count as a failed decoding; KeyboardInterrupt
            # and SystemExit propagate so a long enumeration can be interrupted.
            data["failed_decodings"] += 1

def trees_with_n_nodes(n):
    """Return C(0) + C(1) + ... + C(n), where C(k) is the k-th Catalan number.

    C(k) is computed as comb(2k, k) // (k + 1). For negative ``n`` the sum is
    empty and the result is 0.
    """
    total = 0
    for k in range(n + 1):
        total += math.comb(2 * k, k) // (k + 1)
    return total


In [7]:
# def decode_all_trees(target_count, decode_data):
#     root = TreeNode()
#     n_tree_generated = 0

#     def gen_tree(node, curr_node_count):
#         nonlocal n_tree_generated
#         n_tree_generated += 1

#         try_decode(root, decode_data) # Try all decoding strategies

#         # Max possible children
#         new_children = []
#         for i in range(target_count - curr_node_count):
#             new_child = TreeNode()
#             new_children.append(new_child)
#             node.children = new_children
#             gen_tree(new_child, curr_node_count + i + 1)

#         node.children = [] # Pop all children to backtrack

#     gen_tree(root, 0)
#     # TODO Fix algorithm
#     # assert n_tree_generated == trees_with_n_nodes(target_count)

# decode_all_trees(GEN_TREE_MAX_NODES, decode_data)

# df_dec = pd.DataFrame(decode_data, columns=["encoding_name", "failed_decodings", "duplicate_decodings", "attempted_decodings"])
# df_dec["duplicate_rate"] = df_dec["duplicate_decodings"] / df_dec["attempted_decodings"]
# df_dec["failure_rate"] = df_dec["failed_decodings"] / df_dec["attempted_decodings"]
# df_dec
