## Exploring the reusability of the same Huffman coding tree for compressing similar data files
#### Naveen Narayanan Meyyappan and Praneeth Chandra Thota
#### Date:11/27/2019

In [2]:
# This file takes commands from the user to perform Huffman encoding using the new approach
# Importing Library Files
import heapq
import binascii
import struct
import pickle as pkl
import os

# Defining the tree node in the heap structure
class TreeNode:
    """A node of the Huffman tree that can be stored directly in a heapq heap.

    Attributes:
        val:   weight of the node (a character frequency, or the sum of the
               children's weights for an internal node).
        char:  the character held by a leaf; '' by default.
        left:  left child, or None.
        right: right child, or None.
    """

    def __init__(self, val=0, char=''):
        self.val, self.char = val, char
        # A freshly created node has no children.
        self.left = self.right = None

    def __lt__(self, other):
        # heapq only needs "<"; nodes are ordered by weight alone.
        return other.val > self.val

# Defining a function to compute the frequencies of each character
def get_frequency(file):
    """Count how often each character occurs in a text file.

    Args:
        file: path of the text file to scan (opened in text mode with the
              platform default encoding).

    Returns:
        A dict mapping each character to its number of occurrences, with
        keys in order of first appearance. An empty file yields {}.
    """
    counts = {}
    with open(file) as handle:
        # Pull one character at a time until read() returns '' at end of file.
        for symbol in iter(lambda: handle.read(1), ''):
            counts[symbol] = counts.get(symbol, 0) + 1
    return counts

# This function is used to build the Huffman encoding tree using the concept of binary heaps
def build_optimal_merge_tree(file):
    """Build the Huffman tree for the characters of a text file.

    The two lightest nodes are repeatedly removed from a min-heap and merged
    under a new parent whose weight is their sum, until one node remains.

    Args:
        file: path of the text file whose character frequencies drive the tree.

    Returns:
        The root TreeNode of the tree. Indexing the empty heap raises
        IndexError when the file contains no characters.
    """
    counts = get_frequency(file)
    forest = [TreeNode(count, symbol) for symbol, count in counts.items()]
    heapq.heapify(forest)
    while len(forest) > 1:
        # The first pop is the lighter node and becomes the left child.
        lighter = heapq.heappop(forest)
        heavier = heapq.heappop(forest)
        merged = TreeNode(lighter.val + heavier.val)
        merged.left, merged.right = lighter, heavier
        heapq.heappush(forest, merged)
    return forest[0]

# This function walks the developed tree and assigns 0s and 1s along each path to get the codewords
def build_encode_table(root):
    """Derive the codeword tables from a Huffman tree.

    The tree is walked depth-first with an explicit stack; going to a left
    child appends '0' to the code and going to a right child appends '1'.
    Every leaf contributes one entry to each table.

    Args:
        root: root node of the tree; nodes expose `left`, `right` and `char`.

    Returns:
        (encode_table, decode_table) where encode_table maps character -> code
        and decode_table maps code -> character.
    """
    encode_table = {}
    decode_table = {}
    stack = [("", root)]
    while stack:
        code, node = stack.pop()
        if node.left:
            stack.append((code + '0', node.left))
        if node.right:
            stack.append((code + '1', node.right))
        if not node.left and not node.right:
            # A tree made of a single leaf (only one distinct character) would
            # otherwise get the empty codeword, which encodes to zero bits and
            # can never be matched while decoding. Give it the 1-bit code '0'.
            leaf_code = code or '0'
            encode_table[node.char] = leaf_code
            decode_table[leaf_code] = node.char
    return encode_table, decode_table

# This is the function that encodes the file and generates the file. It basically maps each character to its code word
def encode_file(encode_table, input_file, output_file):
    """Encode a text file with the given codeword table and write it as bytes.

    Output layout: the first byte holds the number of padding bits (1..8),
    followed by the concatenated codewords, right-padded with that many '0'
    bits so the payload ends on a byte boundary. A payload that is already a
    multiple of 8 bits still receives a full byte of padding.

    Args:
        encode_table: mapping character -> codeword string of '0'/'1'.
        input_file:   path of the text file to encode.
        output_file:  path of the binary file to create.

    Returns:
        True on success, False if the file could not be read or written or if
        it contains a character that has no entry in encode_table.
    """
    try:
        with open(input_file, "r") as source:
            text = source.read()
        # join() keeps the concatenation linear in the size of the input.
        encoded_bits = "".join(encode_table[c] for c in text)
        extra_padding = 8 - len(encoded_bits) % 8
        encoded_bits = "{0:08b}".format(extra_padding) + encoded_bits + "0" * extra_padding
        byte_data = bytes(
            int(encoded_bits[i:i + 8], 2) for i in range(0, len(encoded_bits), 8)
        )
        with open(output_file, "wb") as output:
            output.write(byte_data)
        return True
    except KeyError as missing:
        # Happens when the table was built from a different (master) file that
        # never contained this character.
        print("Character not present in the encoding table:", missing)
        return False
    except Exception as e:
        # Deliberately not a bare except: Ctrl-C and SystemExit must propagate.
        print(e)
        return False

# This is the matching function that decodes a compressed file back to text using the decoding table
def decode_file(decode_table, input_file, output_file):
    """Decode a file produced by encode_file back into text.

    Expected layout: the first byte is the number of padding bits appended to
    the payload; the remaining bytes are the concatenated codewords followed
    by that many padding bits.

    Args:
        decode_table: mapping codeword string of '0'/'1' -> character.
        input_file:   path of the binary file to decode.
        output_file:  path of the text file to create.

    Returns:
        True on success, False if the input is empty or unreadable or the
        output cannot be written.
    """
    try:
        with open(input_file, 'rb') as source:
            data = source.read()
        padding = data[0]  # IndexError on an empty file -> reported as failure
        bit_data = "".join("{0:08b}".format(b) for b in data[1:])
        # Slice by explicit length: a "[:-padding]" slice would throw away the
        # whole payload when the padding count is 0.
        bit_data = bit_data[:max(len(bit_data) - padding, 0)]
        decoded = []
        code = ''
        for bit in bit_data:
            code += bit
            # Huffman codes are prefix-free, so the first match is the symbol.
            if code in decode_table:
                decoded.append(decode_table[code])
                code = ''
        with open(output_file, "w") as output:
            output.write("".join(decoded))
        return True
    except Exception as e:
        # Deliberately not a bare except: Ctrl-C and SystemExit must propagate.
        print(e)
        return False
    
# This is the function that compresses files based on the given encoding table
def compress_file(input_file, output_file=""):
    """Compress a text file with the previously saved encoding table.

    Args:
        input_file:  path of the text file to compress.
        output_file: destination path; defaults to the input path with its
                     extension replaced by "_compressed.bin".

    Returns:
        True if the compressed file was written, False otherwise.
    """
    if not output_file:
        output_file = os.path.splitext(input_file)[0]+"_compressed.bin"
    # The driver script saves the table as 'encodingtable.pkl'. 'encoded.pkl'
    # is the name this function used to read and is kept as a fallback so
    # tables saved under that name still work.
    table_file = 'encodingtable.pkl'
    if not os.path.exists(table_file):
        table_file = 'encoded.pkl'
    try:
        with open(table_file, 'rb') as encoded_data:
            # NOTE: unpickling can execute arbitrary code; only load table
            # files generated locally by this program.
            encode_table = pkl.load(encoded_data)
        return encode_file(encode_table, input_file, output_file)
    except IOError:
        print("File not found")
        return False
    except Exception as e:
        print(e)
        return False

# Decompressing function
def decompress_file(input_file, output_file=""):
    """Decompress a binary file with the previously saved decoding table.

    Args:
        input_file:  path of the compressed file.
        output_file: destination path; defaults to the input path with its
                     extension replaced by "_decompressed.txt".

    Returns:
        True if the decompressed file was written, False otherwise.
    """
    if not output_file:
        output_file = os.path.splitext(input_file)[0]+"_decompressed.txt"
    # The driver script saves the table as 'decodingtable.pkl'. 'decoded.pkl'
    # is the name this function used to read and is kept as a fallback so
    # tables saved under that name still work.
    table_file = 'decodingtable.pkl'
    if not os.path.exists(table_file):
        table_file = 'decoded.pkl'
    try:
        with open(table_file, 'rb') as decode_data:
            # NOTE: unpickling can execute arbitrary code; only load table
            # files generated locally by this program.
            decode_table = pkl.load(decode_data)
        return decode_file(decode_table, input_file, output_file)
    except IOError:
        print("File not Found")
        return False
    except Exception as e:
        print(e)
        return False

    
if __name__ == "__main__":

    # The user interface: build the tables from a master file once, then
    # compress/decompress any number of files with those same tables.

    def _prompt_for_file(prompt, extension, must_exist=False):
        """Ask repeatedly until the user enters a file name with `extension`.

        Empty input re-prompts silently. With must_exist=True a name that does
        not refer to an existing file is rejected as well.
        """
        while True:
            name = input(prompt).strip()
            if not name:
                continue
            if os.path.splitext(name)[-1] != extension:
                print("Please enter a valid file format!!")
                continue
            if must_exist and not os.path.isfile(name):
                print("File not found")
                continue
            return name

    print("Exploring the reusability of the same Huffman coding tree for compressing similar data files")
    print(" ")
    print("Project By Naveen Narayanan Meyyappan and Praneeth Chandra Thota")
    print(" ")
    print("Working Directory: ", os.getcwd())
    print(" ")

    # The master file must exist, otherwise building the tree would crash
    # with a traceback instead of letting the user try again.
    input_file = _prompt_for_file(
        "Enter the master text file for generating the encoding table: ",
        '.txt',
        must_exist=True,
    )
    root = build_optimal_merge_tree(input_file)
    encode_table, decode_table = build_encode_table(root)
    # Context managers guarantee the table files are closed even if
    # pickling fails part-way.
    with open('encodingtable.pkl', 'wb') as output:
        pkl.dump(encode_table, output)
    with open('decodingtable.pkl', 'wb') as output:
        pkl.dump(decode_table, output)
    print("Encoding and Decoding tables generated")
    print(" ")
    # Only the encoding table file is measured here.
    statinfoenctable = os.stat('encodingtable.pkl')
    print("Size of the encoding and decoding tables: ", statinfoenctable.st_size)

    while True:
        action = input("Enter the action to be performed (compress/decompress/exit): ").lower().strip()
        if action == "compress":
            input_file = _prompt_for_file("Enter the text file to be compressed: ", '.txt')
            if compress_file(input_file):
                print(input_file,"Compression Successful")
            else:
                print(input_file,"Compression Failed")
        elif action == "decompress":
            input_file = _prompt_for_file("Enter the text file to be decompressed: ", '.bin')
            if decompress_file(input_file):
                print(input_file,"Decompression Successful")
            else:
                print(input_file,"Decompression Failed")
        elif action == "exit":
            break
        # Any other input is ignored and the action prompt is shown again.

Exploring the reusability of the same Huffman coding tree for compressing similar data files
 
Project By Naveen Narayanan Meyyappan and Praneeth Chandra Thota
 
Working Directory:  C:\Users\USER\huffman
 
Enter the master text file for generating the encoding table: data.txt
Encoding and Decoding tables generated
 
Size of the encoding and decoding tables:  2635
Enter the action to be performed (compress/decompress/exit): exit
