In [176]:
from collections import Counter
import numpy as np
import sys
import json
import ast
import random

In [83]:
# The JSON keys are tuples written out as strings, e.g. "(0, 1, 1, 0)" (the key format bdm_1d builds)
def build_lookup_table(filename):
    """Read the JSON file at ``filename`` and return the decoded object."""
    with open(filename) as handle:
        return json.load(handle)

In [93]:
def array_to_tuple(matrix):
    """Recursively convert a nested iterable into nested tuples.

    Every iterable level (list, tuple, array, ...) becomes a tuple; anything
    that cannot be iterated is kept as a leaf value.  Strings and bytes found
    inside the structure are treated as leaves, because iterating a
    one-character string yields the same string again and would recurse
    without end.

    Parameters
    ----------
    matrix : iterable
        The (possibly nested, possibly ragged) structure to convert.

    Returns
    -------
    tuple
        ``matrix`` with every nested iterable replaced by a tuple.

    Raises
    ------
    TypeError
        If ``matrix`` itself is not iterable.
    """
    def _freeze(item):
        # Convert one element: tuples for iterables, the value itself otherwise.
        if isinstance(item, (str, bytes)):
            return item
        try:
            members = iter(item)
        except TypeError:
            # Not iterable, so this is a scalar leaf.
            return item
        return tuple(_freeze(member) for member in members)

    if isinstance(matrix, (str, bytes)):
        # Top-level text behaves like tuple(): one entry per character/byte.
        return tuple(matrix)
    # Building the generator calls iter(matrix), so a non-iterable argument
    # raises TypeError here.
    return tuple(_freeze(member) for member in matrix)

In [123]:
def bdm_1d(string, lookup, d=10, verbose=False):
    """Estimate the complexity of a 1-D string by block decomposition.

    ``string`` is cut into consecutive, non-overlapping blocks of ``d``
    characters; a trailing remainder shorter than ``d`` is ignored.  Each
    distinct block contributes ``lookup[key] + log2(count)``, where ``key`` is
    the block written as ``"(c1, c2, ..., cd)"`` and ``count`` is how many
    times that block occurs.

    Parameters
    ----------
    string : str
        Sequence of symbols to analyse.
    lookup : mapping
        Maps block keys of the form ``"(c1, c2, ...)"`` to their values.
    d : int, optional
        Block length (default 10).
    verbose : bool, optional
        When true, print every block key as it is produced.

    Returns
    -------
    number
        Sum of the per-block contributions, or the single looked-up value
        when ``string`` is shorter than ``d``.

    Raises
    ------
    KeyError
        If a required key is missing from ``lookup``.
    """
    def _block_key(block):
        # Render a block in the "(c1, c2, ...)" form used for lookups.
        return "(" + ", ".join(block) + ")"

    if len(string) < d:
        # Shorter than one block: look the whole string up as a single block.
        # Use the same key format as full blocks; fall back to the raw string
        # so tables keyed by the raw string keep working.
        try:
            return lookup[_block_key(string)]
        except KeyError:
            return lookup[string]

    block_keys = []
    for index in range(len(string) // d):
        key = _block_key(string[index * d:(index + 1) * d])
        block_keys.append(key)

        if verbose:
            print("Substring: ", key)

    # Characters left over after the last full block are deliberately ignored.
    counts = Counter(block_keys)
    return sum(lookup[key] + np.log2(n) for key, n in counts.items())

In [229]:
# Load the lookup table from K-10.json; it is passed as `lookup` to the bdm_1d calls (with d=10) further down.
lookup = build_lookup_table('K-10.json')

In [302]:
def convert_num_to_binary(num):
    """Return the base-2 digits of an integer as a string, without the ``0b`` prefix.

    Negative numbers keep a leading minus sign, e.g. ``-5`` gives ``'-101'``;
    zero gives ``'0'``.

    Parameters
    ----------
    num : int
        Integer to convert.

    Returns
    -------
    str
        Binary digits of ``num``.
    """
    # bin() yields '0b...' or '-0b...'; removing the first '0b' drops the
    # prefix and leaves any minus sign in place.
    return bin(num).replace('0b', '', 1)

In [303]:
# Quick check of the helper on a known value.
convert_num_to_binary(50)

'110010'

In [324]:
#block to generate fibonacci sequence
# Produces the 30 terms 1, 2, 3, 5, 8, ... and concatenates their decimal and
# binary digits (no separators) into base10_st and base2_st.

a, b = 0, 1
terms = []
for i in range(30):
    num = a + b
    terms.append(num)
    # advance the pair: (a, b) -> (b, a + b)
    a, b = b, num

base10_st = ''.join(str(term) for term in terms)
base2_st = ''.join(convert_num_to_binary(term) for term in terms)
    
    

In [320]:
#block to generate trivial case of 00000s
# NOTE: this rebinds base10_st / base2_st, the same names the fibonacci block
# assigns, so whichever of the two ran last decides what later cells see.

repeats = 100
num = 0
base10_st = str(num) * repeats
base2_st = convert_num_to_binary(num) * repeats

In [318]:
#function to return random sequence of numbers

def return_random_sequence(length):
    """Draw ``length`` random digits (0-9) and return them as two strings.

    Returns a pair ``(base2_randst, base10_randst)``: the binary expansions of
    the digits concatenated without separators, and the digits themselves
    concatenated.
    """
    digits = [random.randint(0, 9) for _ in range(length)]

    base10_randst = ''.join(str(digit) for digit in digits)
    base2_randst = ''.join(convert_num_to_binary(digit) for digit in digits)

    return base2_randst, base10_randst

In [325]:
# Structured sequence: decimal digits, binary digits, BDM estimate (d=10) and
# gzip-compressed size in bytes.
# NOTE(review): gzip_str is defined in a cell further down this notebook, so
# that cell has to be executed before this one.
print(base10_st, base2_st, round(bdm_1d(base2_st, lookup, d=10),2), len(gzip_str(base2_st)))

# Baseline: average the same two measures over N_RUNS random digit sequences
# that have as many decimal digits as base10_st (their binary strings are not
# length-matched to base2_st).
N_RUNS = 100

bdm_total = 0.0
gzip_total = 0
for i in range(N_RUNS):

    base2_randst, base10_randst = return_random_sequence(len(base10_st))
    bdm_total += bdm_1d(base2_randst, lookup, d=10)
    gzip_total += len(gzip_str(base2_randst))

print("Random (average of {} runs of same length - BDM: ".format(N_RUNS), round(bdm_total/N_RUNS, 2), " GZIP: ", round(gzip_total/N_RUNS, 2))


123581321345589144233377610987159725844181676510946177112865746368750251213931964183178115142298320401346269 1101110110001101101011000101101111011001100100001110100110111100110011000101111011011110001111011010000110001000001010101110100110110110101011000010100010100101111110111111110001101101010010000010010010100010001111011010001100011011111111010000101001101100101110011111110110001011010111001011001000101000101001000101011011101 843.53 101
Random (average of 100 runs of same length - BDM:  734.75  GZIP:  89.45


In [260]:
import codecs

In [263]:
import gzip
import io

def gzip_str(string_):
    """Compress a text string with gzip and return the compressed bytes.

    The string is encoded with ``str.encode()`` (UTF-8) before compression.
    The header timestamp is fixed at 0 so the same input always produces the
    same bytes; the compressed length is unaffected.

    Parameters
    ----------
    string_ : str
        Text to compress.

    Returns
    -------
    bytes
        A complete gzip stream containing the encoded text.
    """
    out = io.BytesIO()

    # Without mtime, GzipFile stamps the current time into the header, which
    # makes the output differ from run to run.
    with gzip.GzipFile(fileobj=out, mode='w', mtime=0) as fo:
        fo.write(string_.encode())

    return out.getvalue()

def gunzip_bytes_obj(bytes_obj):
    """Decompress a gzip byte stream and return its content decoded to ``str``."""
    # Stage the compressed bytes in an in-memory file and rewind it for reading.
    buffer = io.BytesIO()
    buffer.write(bytes_obj)
    buffer.seek(0)

    with gzip.GzipFile(fileobj=buffer, mode='rb') as reader:
        return reader.read().decode()
    
    
# Round-trip check: compressing and then decompressing must give back the input text.
string_ = 'hello there!'

gzipped_bytes = gzip_str(string_)

original_string = gunzip_bytes_obj(gzipped_bytes)

assert string_ == original_string