In [None]:
from collections import Counter

#node class for Huffman tree
class Node:
    """Tree node carrying a symbol, its frequency, its bit code and two child links."""

    def __init__(self):
        # The NUL character is the placeholder symbol; BinaryTree.is_leaf treats
        # any node whose info differs from it as a leaf.
        self.info, self.freq, self.code = '\0', 0, ''
        # Left and right children; both absent on a fresh node.
        self.Llink = self.Rlink = None

# Basic Binary Tree structure for Huffman coding
class BinaryTree:
    """Huffman-style binary tree with helpers to assign codes, encode and decode.

    Nodes are expected to expose ``info`` (the symbol), ``freq``, ``code``,
    ``Llink`` and ``Rlink``.  A node counts as a leaf when its ``info`` is
    anything other than the NUL placeholder character.
    """

    def __init__(self):
        self.root = None

    def _leaves(self, node):
        """Yield the leaf nodes below ``node`` from left to right."""
        stack = [node]
        while stack:
            current = stack.pop()
            if current is None:
                continue
            if self.is_leaf(current):
                yield current
            else:
                # Push right first so the left subtree is visited first.
                stack.append(current.Rlink)
                stack.append(current.Llink)

    # Print tree nodes (for debugging)
    def print_tree(self):
        """Print ``info`` and ``freq`` of every node in pre-order."""
        stack = [self.root]
        while stack:
            node = stack.pop()
            if node is None:
                continue
            print(node.info, "\t", node.freq)
            if not self.is_leaf(node):
                stack.append(node.Rlink)
                stack.append(node.Llink)

    # Check if node is leaf
    def is_leaf(self, nd):
        """Return True when ``nd`` carries a real symbol (not the NUL placeholder)."""
        return nd.info != '\0'

    # Assign 0/1 codes to leaves
    def assign_code(self, current_code=''):
        """Store a bit string in every leaf's ``code`` and print each pair.

        Going left appends '0' and going right appends '1' to
        ``current_code``.  When the whole tree is a single leaf and no prefix
        was given, that leaf receives the code '0': an empty code could be
        neither encoded nor decoded.
        """
        if self.root is None:
            return
        if self.is_leaf(self.root) and current_code == '':
            current_code = '0'

        stack = [(self.root, current_code)]
        while stack:
            node, code = stack.pop()
            if node is None:
                continue
            if self.is_leaf(node):
                node.code = code
                print(node.info, "\t", node.code)
            else:
                stack.append((node.Rlink, code + '1'))
                stack.append((node.Llink, code + '0'))

    # Get encoded value for one character
    def get_code(self, c):
        """Return the stored code for symbol ``c``, or '' when there is none."""
        for leaf in self._leaves(self.root):
            if leaf.info == c and leaf.code:
                return leaf.code
        return ''

    # Encode whole string
    def encode(self, text):
        """Concatenate the codes of the characters in ``text``.

        Characters without a code contribute nothing, exactly as ``get_code``
        returning '' would.
        """
        # Collect the code table once instead of searching the tree for every
        # character; the first leaf found for a symbol wins, as in get_code.
        table = {}
        for leaf in self._leaves(self.root):
            if leaf.code:
                table.setdefault(leaf.info, leaf.code)
        return "".join(table.get(ch, '') for ch in text)

    # Check prefix match
    def match_prefix(self, a, b, length):
        """Return True when the first ``length`` items of ``a`` and ``b`` agree.

        Returns False when either sequence is shorter than ``length``.
        """
        if len(a) < length or len(b) < length:
            return False
        span = max(length, 0)
        return a[:span] == b[:span]

    # Decode Huffman bits
    def decode(self, code, size):
        """Decode the first ``size`` bits of ``code`` into symbols.

        Raises:
            ValueError: if no leaf code matches at some position, including
                when ``code`` runs out before ``size`` bits are consumed.
        """
        decoded = []
        idx = 0

        while idx < size:
            remaining = len(code) - idx
            # Try growing candidate lengths until one equals a full leaf code.
            for length in range(1, remaining + 1):
                flag, symbol = self.find_symbol(code[idx:idx + length], length)
                if flag == 1:
                    break
            else:
                raise ValueError(
                    "no Huffman code matches the bits at position {}".format(idx)
                )
            decoded.append(symbol)
            idx += length

        return "".join(decoded)

    # Helper for decoding
    def find_symbol(self, bits, length):
        """Look up the symbol whose code is exactly the first ``length`` bits.

        Returns ``(1, symbol)`` on a match and ``(0, '')`` otherwise.  The code
        must be exactly ``length`` long; accepting a longer code that merely
        starts with those bits would emit a symbol too early.
        """
        for leaf in self._leaves(self.root):
            if len(leaf.code) == length and self.match_prefix(leaf.code, bits, length):
                return 1, leaf.info
        return 0, ''

    # Simple tokenizer (splits into subwords)
    def tokenize(self, text):
        """Split ``text`` on whitespace and cut long words into sub-words.

        Words of up to 4 characters are kept whole.  Longer words are cut into
        windows of up to 4 characters taken every 3 characters; windows shorter
        than 2 characters are dropped.
        """
        tokens = []
        for word in text.split():
            if len(word) <= 4:
                tokens.append(word)
                continue
            for start in range(0, len(word), 3):
                piece = word[start:start + 4]
                if len(piece) >= 2:
                    tokens.append(piece)
        return tokens

    # Encode tokens
    def encode_tokens(self, tokens):
        """Return the encoded bit string of every token, in order."""
        return [self.encode(token) for token in tokens]

    # Split compressed bits into smaller chunks
    def re_tokenize_compressed(self, encoded_tokens, chunk_size=6):
        """Cut each encoded token into pieces of at most ``chunk_size`` bits."""
        return [
            enc[start:start + chunk_size]
            for enc in encoded_tokens
            for start in range(0, len(enc), chunk_size)
        ]


# Simple min-heap for Huffman tree creation
class MinHeap:
    """Array-backed heap skeleton.

    NOTE(review): this definition appears to be cut off part-way through
    ``heapify``, and a second ``class MinHeap`` further down this file rebinds
    the name, so this version is not the one the module ends up with.
    """

    def __init__(self):
        # Backing list for the heap entries; starts empty.
        self.T = []
        # Starts at zero; presumably the number of stored entries - TODO confirm.
        self.n = 0

    def heapify(self, i):
        # Computes the positions of the two children of index ``i`` in the
        # array layout (2*i + 1 and 2*i + 2).
        # NOTE(review): nothing in the loop compares, swaps, changes ``i``,
        # breaks or returns, so a call never terminates as written - the
        # sift-down step looks missing.
        while True:
            left = 2*i + 1
            right = 2*i + 2
# Simple Huffman Coding + Tokenizer + BPE example
# -----------------------------------------
# Tree node for Huffman
# -----------------------------------------
class Node:
    """Huffman tree node: a symbol, its weight, and two initially empty child slots."""

    def __init__(self, char, freq):
        self.char, self.freq = char, freq
        # Children are attached later by whoever builds the tree.
        self.left = self.right = None

# -----------------------------------------
# Min-heap implementation
# -----------------------------------------
class MinHeap:
    """Array-backed binary min-heap ordered by each item's ``freq`` attribute."""

    def __init__(self):
        self.heap = []

    def insert(self, node):
        """Add ``node`` at the end and sift it up to its place."""
        position = len(self.heap)
        self.heap.append(node)
        self._up(position)

    def remove_min(self):
        """Remove and return the lowest-``freq`` item, or ``None`` when empty."""
        if not self.heap:
            return None
        # Move the minimum to the tail so it can be popped cheaply, then
        # restore heap order from the top.
        self._swap(0, len(self.heap) - 1)
        smallest = self.heap.pop()
        self._down(0)
        return smallest

    # sift towards the root
    def _up(self, idx):
        entries = self.heap
        while idx > 0:
            parent = (idx - 1) // 2
            if not entries[idx].freq < entries[parent].freq:
                return
            self._swap(idx, parent)
            idx = parent

    # sift towards the leaves
    def _down(self, idx):
        entries = self.heap
        count = len(entries)
        while True:
            best = idx
            # Left child is examined first, then right against the current best.
            for child in (2 * idx + 1, 2 * idx + 2):
                if child < count and entries[child].freq < entries[best].freq:
                    best = child
            if best == idx:
                return
            self._swap(idx, best)
            idx = best

    def _swap(self, i, j):
        entries = self.heap
        entries[i], entries[j] = entries[j], entries[i]

# -----------------------------------------
# Build Huffman tree and generate codes
# -----------------------------------------
def build_huffman_tree(char_freq):
    """Build a Huffman tree from a symbol -> frequency mapping.

    Returns the root node, or ``None`` when ``char_freq`` is empty.
    """
    pending = MinHeap()
    for symbol, weight in char_freq.items():
        pending.insert(Node(symbol, weight))

    # Repeatedly join the two lightest nodes under a fresh "*" parent.
    while len(pending.heap) >= 2:
        first = pending.remove_min()
        second = pending.remove_min()

        parent = Node("*", first.freq + second.freq)
        parent.left, parent.right = first, second
        pending.insert(parent)

    # Whatever remains is the root; an empty heap yields None.
    return pending.remove_min()

# generate codes recursively
def generate_codes(root, code="", table=None):
    """Map every leaf's ``char`` to its path from ``root``.

    A left step contributes "0" and a right step "1", appended to ``code``.
    Entries are written into ``table`` (a new dict when omitted), which is
    also the return value.
    """
    table = {} if table is None else table

    pending = [(root, code)]
    while pending:
        node, prefix = pending.pop()
        if node is None:
            continue

        if node.left is None and node.right is None:
            # leaf: record the path that led here
            table[node.char] = prefix
            continue

        # Push right first so the left subtree is handled first.
        pending.append((node.right, prefix + "1"))
        pending.append((node.left, prefix + "0"))

    return table

# -----------------------------------------
# Simple tokenizer (just splits into words)
# -----------------------------------------
def tokenize(text):
    """Return the whitespace-separated words of ``text`` as a list."""
    words = text.split()
    return words

# -----------------------------------------
# Very small "fake" BPE (just inserts "_" after the first two characters of each token)
# -----------------------------------------
def simple_bpe(tokens):
    """Return a copy of ``tokens`` with "_" inserted after the first two characters.

    Tokens of two characters or fewer are passed through unchanged (demo only).
    """
    return [
        tok[:2] + "_" + tok[2:] if len(tok) > 2 else tok
        for tok in tokens
    ]

# -----------------------------------------
# Encode text using Huffman codes
# -----------------------------------------
def encode_text(text, codes):
    """Concatenate the code of each character of ``text``.

    Characters that have no entry in ``codes`` are skipped.
    """
    return "".join(codes[symbol] for symbol in text if symbol in codes)

# decode using Huffman tree
def decode_text(bits, root):
    """Decode a bit sequence by walking the Huffman tree from ``root``.

    "0" steps to the left child and any other item steps to the right child.
    Reaching a leaf emits its ``char`` and restarts from the root.  Trailing
    bits that stop part-way down the tree are ignored.

    When ``root`` is itself a leaf (single-symbol tree) every bit decodes to
    that one symbol.

    Raises:
        ValueError: if a bit leads to a missing child, or bits are supplied
            while ``root`` is ``None``.
    """
    # A lone leaf has no branches to follow, so each bit stands for one symbol.
    if root is not None and root.left is None and root.right is None:
        return "".join(root.char for _ in bits)

    symbols = []
    curr = root

    for b in bits:
        nxt = None
        if curr is not None:
            nxt = curr.left if b == "0" else curr.right
        if nxt is None:
            # Stepping onto a missing node would otherwise surface as an
            # AttributeError on None.
            raise ValueError("bit sequence does not match the Huffman tree")
        curr = nxt

        if curr.left is None and curr.right is None:
            symbols.append(curr.char)
            curr = root

    return "".join(symbols)

# -----------------------------------------
# Example data
# -----------------------------------------

# Symbol frequencies for the demo tree.
char_freq = dict(a=5, b=9, c=12, d=13)

# Tree first, then the symbol -> bit-string table derived from it.
root = build_huffman_tree(char_freq)
codes = generate_codes(root)

# Sample input, its word tokens, and the toy BPE view of those tokens.
text = "abcd abc"
tokens = tokenize(text)
bpe_tokens = simple_bpe(tokens)

# Spaces are removed before encoding; the result is decoded straight back.
encoded = encode_text(text.replace(" ", ""), codes)
decoded = decode_text(encoded, root)

# -----------------------------------------
# Report
# -----------------------------------------
# Each call prints the heading on one line and the value on the next.
print("Huffman Codes:", codes, sep="\n")
print("\nTokens:", tokens, sep="\n")
print("\nAfter BPE:", bpe_tokens, sep="\n")
print("\nEncoded text:", encoded, sep="\n")
print("\nDecoded text:", decoded, sep="\n")




Huffman Codes:
{'a': '00', 'b': '01', 'c': '10', 'd': '11'}

Tokens:
['abcd', 'abc']

After BPE:
['ab_cd', 'ab_c']

Encoded text:
00011011000110

Decoded text:
abcdabc
