# Huffman codes

In [1]:
import os
import time
from collections import Counter
from heapq import heappush, heappop

from bitarray import bitarray

## Algorithm for generating Huffman's code dictionary

In [2]:
def huffman(letter_counts):
    """Build a Huffman code for the given letter frequencies.

    Args:
        letter_counts: Iterable of ``(letter, weight)`` pairs. It is not
            modified.

    Returns:
        A ``(codes, root)`` tuple. ``codes`` maps each letter to its code as
        a ``bitarray``; ``root`` is the root of the code tree. Tree nodes
        expose ``weight``, ``left``, ``right`` and ``letter`` (``letter`` is
        ``None`` on internal nodes).

    Raises:
        IndexError: If ``letter_counts`` is empty.

    Note:
        With a single distinct letter the root itself is a leaf, so that
        letter is assigned an empty code.
    """

    class Node:
        """Tree node; leaves carry a letter, internal nodes carry children."""

        def __init__(self, weight, left=None, right=None, letter=None):
            self.weight = weight
            self.left = left
            self.right = right
            self.letter = letter

        def __lt__(self, other):
            # heapq compares its items with ``<``; define it explicitly
            # instead of relying on the reflected ``__gt__``.
            return self.weight < other.weight

        def __gt__(self, other):
            return self.weight > other.weight

        def __ge__(self, other):
            return self.weight >= other.weight

    # Sort a copy by letter so the resulting tree does not depend on the
    # order of the input pairs and the caller's sequence is left untouched.
    heap = []
    for letter, weight in sorted(letter_counts, key=lambda pair: pair[0]):
        heappush(heap, Node(weight, letter=letter))

    # Repeatedly merge the two lightest subtrees until one tree remains.
    while len(heap) > 1:
        first = heappop(heap)
        second = heappop(heap)
        heappush(heap, Node(first.weight + second.weight, first, second))

    root = heap[0]
    codes = {}

    def assign(node, code):
        """Store the code of every leaf below ``node`` (0 = left, 1 = right)."""
        if node.letter is not None:
            codes[node.letter] = code
            return
        left_code, right_code = code.copy(), code.copy()
        left_code.append(0)
        right_code.append(1)
        assign(node.left, left_code)
        assign(node.right, right_code)

    assign(root, bitarray())

    return codes, root

## Algorithm utilising Huffman's code dictionary to encode a file

In [3]:
def encode(string):
    """Compress ``string`` with a static Huffman code.

    The output starts with a header from which ``decode`` rebuilds the code:
    one byte holding the number of distinct letters, then for each letter
    (in sorted order) its single UTF-8 byte followed by its occurrence count
    as a 4-byte big-endian unsigned integer. The Huffman codes of the
    letters of ``string`` follow, in order.

    Args:
        string: Text to compress. Every character must encode to exactly one
            UTF-8 byte, because the header stores one byte per letter.

    Returns:
        A ``bitarray`` holding the header followed by the encoded text.

    Raises:
        ValueError: If a character does not encode to exactly one UTF-8 byte.
        OverflowError: If there are more than 255 distinct letters, or a
            count does not fit in 4 bytes.
        IndexError: If ``string`` is empty (raised by ``huffman``).
    """
    letter_counts = sorted(Counter(string).items())

    result = bitarray()
    result.frombytes(len(letter_counts).to_bytes(1, byteorder="big", signed=False))
    for letter, weight in letter_counts:
        raw = letter.encode("utf-8")
        if len(raw) != 1:
            # decode() reads exactly 8 bits per letter; a longer sequence
            # would shift every following header field.
            raise ValueError(
                f"encode() supports only single-byte characters, got {letter!r}"
            )
        result.frombytes(raw)
        result.frombytes(weight.to_bytes(4, byteorder="big", signed=False))

    huffman_dict, _ = huffman(letter_counts)
    for letter in string:
        result += huffman_dict[letter]
    return result

## Algorithm decoding a file encoded with Huffman's code dictionary

In [4]:
def decode(bit_arr):
    """Decompress a ``bitarray`` produced by ``encode``.

    Header layout read here: one byte with the number of distinct letters,
    then per letter one UTF-8 byte and a 4-byte big-endian unsigned count.
    The code tree is rebuilt from those counts with ``huffman``.

    Exactly ``sum(counts)`` letters are decoded. Bits that follow the
    payload (for example padding added when the stream is stored in whole
    bytes) are therefore ignored, and a text consisting of one repeated
    letter, whose code is empty, is still restored.

    Args:
        bit_arr: The encoded stream, header included.

    Returns:
        The decoded text. An empty string if the header lists no letters.

    Raises:
        IndexError: If the payload ends before all letters are decoded.
    """
    n = int.from_bytes(bit_arr[:8].tobytes(), byteorder="big", signed=False)
    if n == 0:
        return ""

    # Walk the header with an offset instead of re-slicing the whole stream.
    pos = 8
    letter_counts = []
    for _ in range(n):
        letter = bit_arr[pos : pos + 8].tobytes().decode("utf-8")
        weight = int.from_bytes(
            bit_arr[pos + 8 : pos + 40].tobytes(), byteorder="big", signed=False
        )
        pos += 40
        letter_counts.append([letter, weight])
    _, root = huffman(letter_counts)

    total = sum(weight for _, weight in letter_counts)
    letters = []
    for _ in range(total):
        node = root
        # Internal nodes have no letter; follow bits until a leaf is reached.
        while node.letter is None:
            node = node.right if bit_arr[pos] else node.left
            pos += 1
        letters.append(node.letter)
    return "".join(letters)

## Functions and classes for adaptive Huffman encoding

In [5]:
class Node:
    """Node of the adaptive Huffman tree.

    Plain fields: ``weight``, the ``left`` / ``right`` children, the
    ``letter`` stored in the node and a ``parent`` back-reference. All
    default to ``None`` except ``weight``, which defaults to 0. Nodes are
    compared by ``weight``.
    """

    def __init__(self, weight=0, left=None, right=None, letter=None, parent=None):
        self.weight = weight
        self.left, self.right = left, right
        self.letter = letter
        self.parent = parent

    def __gt__(self, other):
        """Return True when this node is strictly heavier than ``other``."""
        return other.weight < self.weight

    def __ge__(self, other):
        """Return True when this node is at least as heavy as ``other``."""
        return other.weight <= self.weight


def create_dict(root):
    """Map every leaf letter of the tree under ``root`` to its bit code.

    A step to the left child appends a 0 bit and a step to the right child
    appends a 1 bit. A node counts as a leaf when it has neither child.

    Args:
        root: Root node of the tree.

    Returns:
        Dict from ``node.letter`` to a ``bitarray`` code. A tree that is a
        single leaf maps its letter to an empty code.
    """
    codes = {}
    pending = [(root, bitarray())]
    while pending:
        node, code = pending.pop()
        if node.left is None and node.right is None:
            codes[node.letter] = code
            continue
        zero_branch, one_branch = code.copy(), code.copy()
        zero_branch.append(0)
        one_branch.append(1)
        # Push the right child first so the left subtree is visited first.
        pending.append((node.right, one_branch))
        pending.append((node.left, zero_branch))
    return codes


def swap(node1, node2):
    """Exchange the positions of ``node1`` and ``node2`` in the tree.

    Siblings simply trade sides under their common parent. Otherwise each
    parent's child slot that held one node is pointed at the other, and the
    two ``parent`` references are exchanged.
    """
    parent1, parent2 = node1.parent, node2.parent
    if parent1 == parent2:
        parent1.left, parent1.right = parent1.right, parent1.left
    else:
        # Re-point the slot of parent1 that held node1 ...
        if parent1.left == node1:
            parent1.left = node2
        else:
            parent1.right = node2
        # ... and the slot of parent2 that held node2.
        if parent2.left == node2:
            parent2.left = node1
        else:
            parent2.right = node1
    node1.parent, node2.parent = parent2, parent1


def increment(node):
    """Add 1 to the weight of ``node`` and of every ancestor up to the root.

    After each increment, if the visited node has two children and the left
    one is heavier than the right one, the children are exchanged with
    ``swap``.

    Args:
        node: Node to start from; ``None`` makes the call a no-op.

    Returns:
        True if at least one pair of children was swapped, else False.
    """
    swapped = False
    current = node
    while current is not None:
        current.weight += 1
        lower, upper = current.left, current.right
        has_both_children = not (lower is None or upper is None)
        if has_both_children and lower.weight > upper.weight:
            swap(lower, upper)
            swapped = True
        current = current.parent
    return swapped

## Adaptive Huffman encoding

In [6]:
def adaptive_encode(string):
    """Compress ``string`` in one pass with an adaptive Huffman tree.

    The tree starts as a single "not yet seen" marker leaf labelled ``"#"``.
    For a letter already in the tree, its current code is emitted and its
    weight is incremented. For a new letter, the marker's code is emitted
    followed by the letter's single UTF-8 byte; the marker leaf then becomes
    an internal node with a fresh marker as its left child and the new
    letter as its right child.

    Args:
        string: Text to compress. It must not contain ``"#"`` and every
            character must encode to exactly one UTF-8 byte.

    Returns:
        A ``bitarray`` with the encoded stream.

    Raises:
        ValueError: If ``string`` contains ``"#"``, which shares its key
            with the marker, or a character that does not encode to exactly
            one UTF-8 byte (``adaptive_decode`` reads 8 bits per new letter).
    """
    marker = "#"
    nodes = {marker: Node(weight=0, letter=marker)}
    root = nodes[marker]
    dict_now = create_dict(root)
    result = bitarray()
    for letter in string:
        if letter == marker:
            # A literal "#" would be looked up as the marker leaf and the
            # decoder could not tell the two apart.
            raise ValueError(
                'adaptive_encode() cannot encode "#": it is reserved as the '
                "not-yet-seen marker"
            )

        known = nodes.get(letter)
        if known is not None:
            result += dict_now[letter]
            # Codes only change when the update swapped two subtrees.
            if increment(known):
                dict_now = create_dict(root)
            continue

        raw = letter.encode("utf-8")
        if len(raw) != 1:
            raise ValueError(
                "adaptive_encode() supports only single-byte characters, "
                f"got {letter!r}"
            )

        # Announce a new letter: marker code, then the raw byte.
        updated_node = nodes[marker]
        result += dict_now[marker]
        literal = bitarray()
        literal.frombytes(raw)
        result += literal

        # Split the old marker leaf into (new marker, new letter).
        node = Node(weight=1, letter=letter, parent=updated_node)
        zero_node = Node(weight=0, letter=marker, parent=updated_node)
        updated_node.left, updated_node.right = zero_node, node
        nodes[letter] = node
        nodes[marker] = zero_node

        increment(updated_node)
        dict_now = create_dict(root)
    return result

## Adaptive Huffman decoding

In [7]:
def adaptive_decode(bit_arr):
    """Decompress a stream produced by ``adaptive_encode``.

    The tree starts as a single marker leaf labelled ``"#"`` and is grown
    while decoding. Bits are followed from the root to a leaf (1 = right,
    0 = left). A letter leaf yields its letter and has its weight
    incremented. The marker leaf means the next 8 bits hold a new letter,
    which is added to the tree next to a fresh marker.

    The loop runs until every bit of ``bit_arr`` has been consumed; the
    stream carries no length field.

    Args:
        bit_arr: The encoded ``bitarray``.

    Returns:
        The decoded text.
    """
    marker_leaf = Node(weight=0, letter="#")
    nodes = {"#": marker_leaf}
    root = marker_leaf

    pos = 0
    pieces = []
    while pos != len(bit_arr):
        current = root
        # Descend until a node without two children (a leaf) is reached.
        while not (current.left is None or current.right is None):
            current = current.right if bit_arr[pos] == 1 else current.left
            pos += 1

        if current.letter == "#":
            # New letter: its raw byte follows the marker code.
            letter = bit_arr[pos : pos + 8].tobytes().decode()
            pos += 8
            pieces.append(letter)

            # Split the marker leaf into (new marker, new letter).
            split = nodes["#"]
            fresh_leaf = Node(weight=1, letter=letter, parent=split)
            fresh_marker = Node(weight=0, letter="#", parent=split)
            split.left, split.right = fresh_marker, fresh_leaf
            nodes[letter] = fresh_leaf
            nodes["#"] = fresh_marker

            increment(split)
        else:
            letter = current.letter
            pieces.append(letter)
            increment(nodes[letter])

    return "".join(pieces)

In [8]:
# Round-trip check of the static codec: the printed text should equal the input.
print(decode(encode("Trudne sysopy w tym tygodniu bardzo")))

Trudne sysopy w tym tygodniu bardzo


In [9]:
# Round-trip check of the adaptive codec: the printed text should equal the input.
print(
    adaptive_decode(adaptive_encode("Utrudzony jestem przez studia bardzo oj bardzo"))
)

Utrudzony jestem przez studia bardzo oj bardzo


In [10]:
def _benchmark_codec(text, source_size, encoder, decoder, label=""):
    """Time one encoder/decoder pair on ``text`` and print the results.

    Prints the encoded size as a percentage of ``source_size`` and the
    encoding and decoding times, each prefixed with ``label``. A warning is
    printed when the decoded text differs from ``text``.
    """
    tmp_path = "tmp.rrr"

    t1 = time.perf_counter()
    encoded = encoder(text)
    t2 = time.perf_counter()
    decoded = decoder(encoded)
    t3 = time.perf_counter()

    # The encoded size is measured on disk; always clean up the scratch file.
    try:
        with open(tmp_path, "wb") as out:
            encoded.tofile(out)
        encoded_size = os.path.getsize(tmp_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(round(100 * encoded_size / source_size, 2), "%", sep="")
    print(label + "encoding", f"{t2-t1:.6}", "s")
    print(label + "decoding", f"{t3-t2:.6}", "s")
    if decoded != text:
        print("WARNING:", label + "decoding", "did not reproduce the input")


def test(file_name):
    """Benchmark the static and the adaptive Huffman codec on a text file.

    For each codec this prints the compressed size as a percentage of the
    original file size, then the encoding and decoding times in seconds.

    Args:
        file_name: Path of the text file to compress.
    """
    with open(file_name, "r") as file:
        text = file.read()
    source_size = os.path.getsize(file_name)

    _benchmark_codec(text, source_size, encode, decode)
    _benchmark_codec(
        text, source_size, adaptive_encode, adaptive_decode, label="adaptive "
    )

In [11]:
# Benchmark both codecs on 1kb.txt (prints size ratio and timings).
test("1kb.txt")

96.68%
encoding 0.0006396 s
decoding 0.0015138 s
319.53%
adaptive encoding 0.0112388 s
adaptive decoding 0.01936 s


In [12]:
# Benchmark both codecs on 10kb.txt (prints size ratio and timings).
test("10kb.txt")

74.38%
encoding 0.0031182 s
decoding 0.0129767 s
330.24%
adaptive encoding 0.103127 s
adaptive decoding 0.163649 s


In [13]:
# Benchmark both codecs on 100kb.txt (prints size ratio and timings).
test("100kb.txt")

72.29%
encoding 0.0312813 s
decoding 0.121605 s
330.75%
adaptive encoding 1.00544 s
adaptive decoding 1.7347 s


In [14]:
# Benchmark both codecs on 1mb.txt (prints size ratio and timings).
test("1mb.txt")

72.12%
encoding 0.332013 s
decoding 1.35124 s
331.3%
adaptive encoding 10.8308 s
adaptive decoding 17.0933 s


The results for the adaptive algorithm are terrible. Why? The answer is simple: we are not using Vitter's algorithm, which keeps the tree balanced. Instead we use the regular adaptive algorithm, and the height of the tree becomes linear in the size of the alphabet. Since the letters are randomly generated, there is approximately an equal number of letters encoded with |A| bits and with 1 bit (where |A| is the size of the alphabet).

### Out of pure curiosity I decided to see if a text similar to real language shows different behaviour than random characters

In [15]:
# Benchmark both codecs on 100kb_lorem_ipsum.txt (prints size ratio and timings).
test("100kb_lorem_ipsum.txt")

53.45%
encoding 0.0279723 s
decoding 0.0980931 s
129.01%
adaptive encoding 0.408733 s
adaptive decoding 0.627759 s


Of course it does. The difference of about 20 percentage points (static code: 72.29% → 53.45%) is astonishing. Objects created by humans follow rules such as Zipf's law, the 80-20 rule, and many other constraints. Randomly generated characters don't have that "human" property.