In [1]:
from heapq import heappush, heappop
from bitarray import bitarray
import time
import os

In [2]:
def huffman(letter_counts):
    """Build a Huffman prefix code for a table of symbol frequencies.

    Parameters
    ----------
    letter_counts : iterable of (letter, weight) pairs
        Symbols and their frequencies. Nodes are compared by weight only, so
        when weights tie the resulting tree depends on the order of the pairs;
        callers that need to rebuild the same tree must pass the same order.

    Returns
    -------
    (dict, Node or None)
        A mapping ``letter -> bitarray`` with the code of every symbol, and the
        root of the tree. Leaves have ``letter`` set; internal nodes have
        ``letter is None`` and use ``left`` for bit 0 and ``right`` for bit 1.
        For an empty table the result is ``({}, None)``.
    """
    # Define a Node class to store each tree element
    class Node:
        def __init__(self, weight, left=None, right=None, letter=None):
            self.weight = weight    # Total frequency or weight
            self.left = left        # Left child (Node), followed on bit 0
            self.right = right      # Right child (Node), followed on bit 1
            self.letter = letter    # Character/letter if this is a leaf node

        # Define comparison operators so the heapq module can sort Nodes
        def __lt__(self, other):  # < operator for heapq to work properly
            return self.weight < other.weight

        def __gt__(self, other):  # Optional, for completeness
            return self.weight > other.weight

        def __ge__(self, other):  # Optional, not needed by heapq
            return self.weight >= other.weight

    # Initialize a heap with leaf nodes
    heap = []
    for letter, weight in letter_counts:
        heappush(heap, Node(weight, letter=letter))

    # An empty alphabet has no tree and no codes.
    if not heap:
        return {}, None

    # A single symbol would otherwise be the root itself and receive an empty
    # code, i.e. no bits at all in the encoded stream. Hang it under a parent
    # so that it gets the one-bit code "0" and can be decoded bit by bit.
    if len(heap) == 1:
        only_leaf = heap[0]
        heap[0] = Node(only_leaf.weight, left=only_leaf)

    # Build the Huffman tree by combining the two lowest-weight nodes
    while len(heap) > 1:
        lightest = heappop(heap)
        second_lightest = heappop(heap)
        # Create a new internal node with combined weight and push it back
        heappush(
            heap,
            Node(lightest.weight + second_lightest.weight, lightest, second_lightest),
        )
    root = heap[0]

    # Walk the tree depth-first (left before right) and record the bit path to
    # every leaf. The right child is pushed first so the left one is popped
    # first, which keeps the dictionary in left-to-right leaf order.
    codes = {}
    pending = [(root, bitarray())]
    while pending:
        node, code = pending.pop()
        if node.letter is not None:
            # Leaf node: assign the accumulated code
            codes[node.letter] = code
            continue
        # Only the single-symbol wrapper node lacks a right child.
        if node.right is not None:
            right_code = code.copy()
            right_code.append(1)
            pending.append((node.right, right_code))
        left_code = code.copy()
        left_code.append(0)
        pending.append((node.left, left_code))

    return codes, root

In [30]:
# Demo: six symbols with equal weights; the cell displays the code table and the tree root.
huffman(
    [
        ("a", 100),
        ("b", 100),
        ("c", 100),
        ("e", 100),
        ("d", 100),
        ("f", 100),
    ]
)

({'b': bitarray('00'),
  'e': bitarray('01'),
  'f': bitarray('100'),
  'd': bitarray('101'),
  'a': bitarray('110'),
  'c': bitarray('111')},
 <__main__.huffman.<locals>.Node at 0x25484e0b850>)

In [78]:
def encode(string):
    """Compress ``string`` into a bitarray using Huffman coding.

    Layout of the returned bitarray:

    * 1 byte  - alphabet size ``n`` (number of distinct characters),
    * ``n`` records of 5 bytes - the character (1 byte) followed by its
      occurrence count (4 bytes, big-endian, unsigned), sorted by character,
    * the Huffman codes of the characters of ``string``, in order.

    Parameters
    ----------
    string : str
        Text to compress. Every character must fit in the single byte the
        header reserves for it, i.e. encode to exactly one UTF-8 byte.

    Returns
    -------
    bitarray
        Header followed by the encoded payload. For an empty string only the
        alphabet-size byte (zero) is returned.

    Raises
    ------
    ValueError
        If a character does not encode to exactly one UTF-8 byte.
    OverflowError
        If a character occurs too often for its count to fit in 4 bytes.
    """
    # Count the frequency of each character in the string
    occurrences = {}
    for letter in string:
        occurrences[letter] = occurrences.get(letter, 0) + 1

    # Sorted list of (letter, count) pairs; the decoder reads the table back
    # in this same order when it rebuilds the tree.
    letter_counts = sorted(occurrences.items(), key=lambda item: item[0])

    # The header stores each letter in exactly one byte. Reject anything wider
    # up front instead of silently writing a table the decoder cannot parse.
    letter_bytes = {}
    for letter, _ in letter_counts:
        raw = letter.encode("utf-8")
        if len(raw) != 1:
            raise ValueError(
                f"cannot encode character {letter!r}: it takes {len(raw)} bytes "
                "in UTF-8 but the header stores one byte per character"
            )
        letter_bytes[letter] = raw

    # The first byte of result array is the alphabet size
    result = bitarray()
    result.frombytes(
        len(letter_counts).to_bytes(1, byteorder="big", signed=False)
    )

    # Then for every letter: 1 byte for the letter, 4 bytes for its count
    for letter, count in letter_counts:
        result.frombytes(
            letter_bytes[letter] + count.to_bytes(4, byteorder="big", signed=False)
        )

    # Nothing to encode: an empty alphabet has no Huffman tree to build.
    if not letter_counts:
        return result

    # Generate Huffman dictionary from letter frequencies
    huffman_dict, _ = huffman(letter_counts)

    # Encode the actual string using the Huffman codes
    for letter in string:
        result += huffman_dict[letter]  # Append Huffman bit sequence for each letter

    return result  # Final encoded bitarray


In [80]:
def decode(bit_arr):
    """Decompress a bitarray produced by ``encode`` back into a string.

    The header (1 byte alphabet size, then per letter 1 byte for the letter and
    4 bytes big-endian for its count) is read first, the Huffman tree is rebuilt
    from it, and then exactly as many symbols are decoded as the counts in the
    header add up to. Any bits left after the last symbol are ignored.

    Parameters
    ----------
    bit_arr : bitarray
        Encoded data: header followed by the Huffman-coded payload.

    Returns
    -------
    str
        The decoded text; ``""`` when the header declares an empty alphabet.

    Raises
    ------
    ValueError
        If the data is too short for its header, ends in the middle of a
        symbol, or contains a bit path that leads to no symbol.
    """
    size_bits = 8            # alphabet size field
    record_bits = 8 + 32     # one letter byte + four count bytes

    # Read the alphabet size stored as the first byte
    if len(bit_arr) < size_bits:
        raise ValueError("encoded data is too short to contain the alphabet size")
    n = int.from_bytes(bit_arr[:size_bits].tobytes(), byteorder="big", signed=False)
    if len(bit_arr) < size_bits + n * record_bits:
        raise ValueError("encoded data is too short to contain the frequency table")

    # Reconstruct the frequency table. A moving offset is used instead of
    # re-slicing the array, which would copy the whole payload per field.
    letter_counts = []
    pos = size_bits
    for _ in range(n):
        # First decode the one byte that represents the letter
        letter = bit_arr[pos:pos + 8].tobytes().decode("utf-8")
        # The next 4 bytes are the number of its occurrences in the original text
        count = int.from_bytes(
            bit_arr[pos + 8:pos + record_bits].tobytes(), byteorder="big", signed=False
        )
        letter_counts.append([letter, count])
        pos += record_bits

    if not letter_counts:
        return ""

    # Rebuild the Huffman tree using the frequencies
    _, root = huffman(letter_counts)

    # The header tells how many symbols the payload holds, so decode by symbol
    # count rather than "until the bits run out".
    total = sum(count for _, count in letter_counts)

    # A root that is itself a leaf consumes no bits per symbol.
    if root.letter is not None:
        return root.letter * total

    end = len(bit_arr)
    decoded = []
    for _ in range(total):
        node = root
        # Traverse the tree until a leaf node is found
        while node.letter is None:
            if pos >= end:
                raise ValueError("encoded data ends in the middle of a symbol")
            node = node.right if bit_arr[pos] else node.left
            pos += 1
            if node is None:
                raise ValueError("encoded data contains a bit path that matches no symbol")
        decoded.append(node.letter)

    return "".join(decoded)  # Final decoded string

In [81]:
# Round-trip check: encoding and then decoding should print the original text.
print(
    decode(
        encode("Trudne sysopy w tym tygodniu bardzo")
    )
)

Trudne sysopy w tym tygodniu bardzo
