# Huffman Coding

Given an alphabet $A$ of $n$ characters, where each character $c \in A$ has an associated frequency $\text{freq}(c)$.

The aim is to assign each character in $A$ a prefix-free binary code such that the total encoded length $\sum_{c \in A} \text{freq}(c) \cdot \text{len}(\text{binary\_code}(c))$ is minimal.

In [1]:
# Example alphabet: maps each character to its frequency.
# Insertion order matters: it is the order in which nodes enter the heap.
characters = {
    'a': 12, 'b': 2, 'c': 7, 'd': 13,
    'e': 14, 'f': 85, 'g': 125,
}

## Priority heap node to store string and frequency

In [2]:
class HuffmanNode:
    """A node carrying a character, its frequency and two child links."""

    def __init__(self, char, freq):
        """Create a childless node for ``char`` with weight ``freq``."""
        self.char = char
        self.freq = freq
        # Both children start out unset; callers attach them afterwards.
        self.left = None
        self.right = None

## Binary MinHeap Definition

In [3]:
class MinHeap:
    """Array-backed binary min-heap ordered by each item's ``freq`` attribute.

    The item at index ``i`` has its children at ``2*i + 1`` and ``2*i + 2``
    and its parent at ``(i - 1) // 2``.
    """

    def __init__(self):
        # Backing list holding the heap items.
        self.arr = []

    def left_child(self, i):
        """Return the index of the left child of ``i``, or None if absent."""
        idx = 2 * i + 1
        if idx < len(self.arr):
            return idx
        return None

    def right_child(self, i):
        """Return the index of the right child of ``i``, or None if absent."""
        idx = 2 * i + 2
        if idx < len(self.arr):
            return idx
        return None

    def insert(self, data):
        """Add ``data``, moving a hole up from the new last slot to its place."""
        self.arr.append(None)
        hole = len(self.arr) - 1
        while hole > 0:
            parent = (hole - 1) // 2
            if not (data.freq < self.arr[parent].freq):
                break
            # Parent is larger: pull it down and continue one level up.
            self.arr[hole] = self.arr[parent]
            hole = parent

        self.arr[hole] = data

    def percolate_down(self, i):
        """Sink the item at index ``i`` until neither child is smaller."""
        while True:
            left = self.left_child(i)
            right = self.right_child(i)

            smallest = i
            if left is not None and self.arr[left].freq < self.arr[i].freq:
                smallest = left
            if (right is not None
                    and self.arr[right].freq < self.arr[smallest].freq):
                smallest = right

            if smallest == i:
                return

            self.arr[i], self.arr[smallest] = self.arr[smallest], self.arr[i]
            i = smallest

    def delete_min(self):
        """Remove and return the smallest item, or None if the heap is empty."""
        if not self.arr:
            return None

        last = self.arr.pop()
        if not self.arr:
            # The heap held a single item; nothing is left to reorder.
            return last

        # Move the former last item to the root and sink it into place.
        smallest, self.arr[0] = self.arr[0], last
        self.percolate_down(0)

        return smallest

    def build_heap(self):
        """Heapify ``self.arr`` in place by sinking items from the middle up."""
        for i in range((len(self.arr) - 1) // 2, -1, -1):
            self.percolate_down(i)

    def create_heap(self, characters):
        """Append one HuffmanNode per ``characters`` item, then heapify."""
        self.arr.extend(
            HuffmanNode(char, freq) for char, freq in characters.items()
        )
        self.build_heap()

## Binary MinHeap Creation

In [4]:
# Build the heap from the character/frequency table: one node per entry.
min_heap = MinHeap()
min_heap.create_heap(characters)

### Display MinHeap

In [5]:
# Print the heap's backing array in storage order (heap order, not sorted order).
for node in min_heap.arr:
    print('{}: {}'.format(node.char, node.freq), end=', ')

b: 2, a: 12, c: 7, d: 13, e: 14, f: 85, g: 125, 

## Huffman Coding

Use '$' as the placeholder character for each new internal Huffman node, whose frequency is the sum of its two children's frequencies.

In [6]:
# Repeatedly merge the two lowest-frequency nodes until a single tree remains.
# The loop is driven by the heap's own size rather than len(characters), so
# re-running this cell (or using a heap filled some other way) never pops
# from an empty heap.
while len(min_heap.arr) > 1:
    left = min_heap.delete_min()
    right = min_heap.delete_min()
    # '$' marks an internal node; its frequency is the sum of its children.
    temp = HuffmanNode('$', left.freq + right.freq)
    temp.left = left
    temp.right = right
    min_heap.insert(temp)

## Level Order Traversal of Huffman Tree

### Queue Implementation

In [7]:
class Queue:
    """Minimal FIFO queue backed by a Python list."""

    def __init__(self):
        # Front of the queue is index 0; new items go on the end.
        self.arr = []

    def is_empty(self):
        """Return True when the queue holds no items."""
        return not self.arr

    def enqueue(self, x):
        """Add ``x`` at the back of the queue."""
        self.arr.append(x)

    def dequeue(self):
        """Remove and return the front item; IndexError if the queue is empty."""
        front = self.arr[0]
        self.arr.pop(0)

        return front

The MinHeap has only one node now, which is the root of the Huffman Tree.

In [8]:
# Take the first item of the heap's backing array as the tree root.
root = min_heap.arr[0]

In [9]:
def level_order(root):
    """Print ``char --> freq`` for every node, breadth-first from ``root``."""
    pending = Queue()
    pending.enqueue(root)

    while not pending.is_empty():
        node = pending.dequeue()
        print('{} --> {}'.format(node.char, node.freq))
        # Queue the existing children left-to-right so each level prints in order.
        for child in (node.left, node.right):
            if child:
                pending.enqueue(child)

In [10]:
# Print each node's character and frequency, level by level from the root.
level_order(root)

$ --> 258
g --> 125
$ --> 133
$ --> 48
f --> 85
$ --> 21
$ --> 27
$ --> 9
a --> 12
d --> 13
e --> 14
b --> 2
c --> 7


Label every left branch '0' and every right branch '1'. The Huffman code of a character is the sequence of labels on the path from the root to that character's leaf; all characters are stored in leaf nodes.

In [11]:
def print_dict(dict_, idx):
    """Print ``dict_[0] .. dict_[idx - 1]`` space-separated, then a newline."""
    # The generator is lazy, so values are looked up and printed one by one.
    for bit in (dict_[pos] for pos in range(idx)):
        print(bit, end=' ')
    print('')

In [12]:
def print_codes(root, dict_, idx):
    """Print the code of every leaf in the subtree rooted at ``root``.

    ``dict_`` is a scratch buffer mapping depth -> bit for the path currently
    being explored, and ``idx`` is the number of bits written so far.
    Descending left writes 0; descending right writes 1.
    """
    if not (root.left or root.right):
        # Leaf: the first ``idx`` buffer entries are this character's code.
        print('{} ({}) --> '.format(root.char, root.freq), end=' ')
        print_dict(dict_, idx)
        return

    # enumerate yields bit 0 for the left child and bit 1 for the right child.
    for bit, child in enumerate((root.left, root.right)):
        if child:
            dict_[idx] = bit
            print_codes(child, dict_, idx + 1)

In [13]:
# Start at the root with an empty code buffer and zero bits written.
print_codes(root, {}, 0)

g (125) -->  0 
b (2) -->  1 0 0 0 0 
c (7) -->  1 0 0 0 1 
a (12) -->  1 0 0 1 
d (13) -->  1 0 1 0 
e (14) -->  1 0 1 1 
f (85) -->  1 1 


Huffman encoding assigns shorter binary codes to characters with larger frequencies. In the above example, 'g' and 'f' have the largest frequencies, and accordingly their Huffman codes are the shortest.