In [1]:
from math import log2, prod
import tokenize
import io
from collections import Counter

In [48]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities (a generator is fine; it is
            consumed once). The values are not checked to sum to 1.

    Returns:
        The sum of ``-p * log2(p)`` over the non-zero entries. An empty
        iterable yields ``0``.

    Raises:
        ValueError: If any probability is negative (``log2`` domain error).
    """
    # Outcomes with probability 0 contribute nothing (p*log2(p) -> 0 as p -> 0),
    # so they are skipped instead of letting log2(0) raise ValueError.
    # Negating each term, rather than the finished sum, yields the same value
    # but reports a zero entropy as 0.0 instead of -0.0.
    return sum(-p * log2(p) for p in probs if p != 0)

def expand_corpus(corpus_with_counts):
    """Flatten a ``{item: count}`` mapping into a list with repeats.

    Each key appears ``count`` times, consecutively, in the mapping's
    iteration order. Counts of zero or less contribute no entries.
    """
    return [
        entry
        for entry, repeats in corpus_with_counts.items()
        for _ in range(repeats)
    ]

def counter_for_corpus(corpus, window_size):
    """Count sliding windows of Python tokens across a corpus of strings.

    Every string in ``corpus`` is tokenized with ``tokenize.generate_tokens``.
    ENDMARKER and NEWLINE tokens are dropped; the text of the remaining
    tokens is fed through a window holding the last ``window_size`` tokens.
    Each time the window is full, its contents joined with ``"@"`` are
    counted. The window starts empty for every string, so no window spans
    two corpus entries.

    Args:
        corpus: Iterable of source strings.
        window_size: Number of consecutive tokens per counted window.

    Returns:
        A ``Counter`` mapping ``"@"``-joined token windows to occurrences.
    """
    counts = Counter()
    # Structural tokens that carry no content of their own.
    skipped_types = (tokenize.ENDMARKER, tokenize.NEWLINE)
    for source in corpus:
        recent = []
        read_line = io.StringIO(source).readline
        for tok in tokenize.generate_tokens(read_line):
            if tok.type in skipped_types:
                continue
            recent.append(tok.string)
            if len(recent) > window_size:
                # Drop the oldest token so the window never exceeds its size.
                del recent[0]
            if len(recent) == window_size:
                counts["@".join(recent)] += 1
    return counts

def entropy_for_corpus(corpus_with_counts, window_size):
    """Entropy of the token-window distribution of a weighted corpus.

    The ``{string: count}`` mapping is expanded so each string appears
    ``count`` times, token windows of ``window_size`` are counted over the
    expanded corpus, and the entropy of the resulting relative frequencies
    is returned.
    """
    window_counts = counter_for_corpus(expand_corpus(corpus_with_counts), window_size)
    total = sum(window_counts.values())
    # The generator is lazy: with no windows at all, no division is performed.
    return entropy(count / total for count in window_counts.values())

In [75]:
import itertools

# Pool of two-letter names in lexicographic order: 'aa', 'ab', ..., 'zz'.
alphabet = ["".join(pair) for pair in itertools.product('abcdefghijklmnopqrstuvwxyz', repeat=2)]

def functions(n):
    """Build the "enumerated" API: one distinct function call per combination.

    Args:
        n: Iterable of option counts; their product is the number of
            calls generated.

    Returns:
        A list of strings ``"<name>(df)"`` where the names are the first
        ``prod(n)`` entries of the module-level ``alphabet``.

    Raises:
        IndexError: If ``prod(n)`` exceeds ``len(alphabet)``.
    """
    return [alphabet[index] + "(df)" for index in range(prod(n))]

def compact(values):
    """Build the "structured" API: a single ``plot`` call with keyword options.

    Args:
        values: Sequence of option counts, one per keyword argument. The
            i-th keyword is named after the i-th entry of the module-level
            ``alphabet``.

    Returns:
        One call string per combination of option indices. Option index 0
        means "keyword omitted"; any other index ``k`` renders as
        ``, name='name<k>'``. The list is ordered by string length, with
        ties keeping combination order.
    """
    names = [alphabet[position] for position in range(len(values))]
    option_ranges = [range(count) for count in values]

    calls = []
    for choice in itertools.product(*option_ranges):
        keyword_part = "".join(
            f", {name}='{name}{option}'"
            for name, option in zip(names, choice)
            if option > 0
        )
        calls.append("plot(df" + keyword_part + ")")

    # list.sort is stable, so equal-length calls keep their generation order.
    calls.sort(key=len)
    return calls

def weighted(strings):
    """Map each string to a descending weight based on its position.

    The first entry gets weight ``len(strings)``, the last gets 1. If a
    string occurs more than once, the weight of its last occurrence wins.
    """
    size = len(strings)
    return {text: size - position for position, text in enumerate(strings)}

In [None]:
# Smallest example: two keyword arguments with two options each,
# shown first as the structured API and then as the enumerated one.
values = [2, 2]
print(compact(values), functions(values), sep="\n")

['plot(df)', "plot(df, 'ab1')", "plot(df, 'aa1')", "plot(df, 'aa1', 'ab1')"]
['aa(df)', 'ab(df)', 'ac(df)', 'ad(df)']


In [71]:
# Compare the gzip-compressed byte size of both API listings,
# each written out with one call per line.
from gzip import compress

values = [10, 10, 2]
nl = "\n"
print(
    "\n"
    f"structured: {len(compress(nl.join(compact(values)).encode()))}\n"
    f"enumerated: {len(compress(nl.join(functions(values)).encode()))}\n"
)


structured: 484
enumerated: 346



In [52]:
# Token statistics for both API styles with 10 x 10 options.
values = [10, 10]

# weighted() gives earlier entries a larger usage count.
a = weighted(functions(values))
b = weighted(compact(values))

# Distinct-token counts iterate the dict keys, so each call string is seen
# once; entropy_for_corpus expands every string by its weight instead.
report = [
    "",
    f"token_count(functions)   {len(counter_for_corpus(a, 1))}",
    f"token_count(compact)     {len(counter_for_corpus(b, 1))}",
    "",
]
for window_size in (1, 2, 3):
    report += [
        f"entropy_for_corpus(functions, {window_size})  {entropy_for_corpus(a, window_size):.3f}",
        f"entropy_for_corpus(compact, {window_size})    {entropy_for_corpus(b, window_size):.3f}",
        "",
    ]
print("\n".join(report))


token_count(functions)   102
token_count(compact)     26

entropy_for_corpus(functions, 1)  3.585
entropy_for_corpus(compact, 1)    3.747

entropy_for_corpus(functions, 2)  3.709
entropy_for_corpus(compact, 2)    4.582

entropy_for_corpus(functions, 3)  4.186
entropy_for_corpus(compact, 3)    4.871

