# Rozwiązanie laboratorium 2

In [1]:
from queue import PriorityQueue

In [2]:
def read_file_to_string(filename):
    """Return the whole content of the text file *filename*.

    The file is opened for reading and decoded as UTF-8.
    """
    with open(filename, mode="r", encoding="UTF-8") as handle:
        return handle.read()

In [3]:

class StaticNode:
    """Binary-tree node holding an optional character.

    ``character`` is ``None`` unless one is passed in; ``left`` and
    ``right`` start as ``None`` and are assigned by whoever links the
    tree together.
    """

    def __init__(self, character=None):
        self.character = character
        # Both child links start out empty.
        self.left = self.right = None

def to_bin(x):
    """Return the 8 lowest binary digits of *x* as a string.

    The most significant of the eight digits comes first, e.g.
    ``to_bin(5) == "00000101"``.
    """
    digits = []
    for _ in range(8):
        # Peel off the lowest digit; digits are collected lowest-first.
        x, bit = divmod(x, 2)
        digits.append(str(bit))
    return "".join(reversed(digits))

class StaticHuffman:
    """Static Huffman coder driven by character frequencies.

    Intended call order: ``build_frequency_dict`` -> ``build_tree`` ->
    ``code_characters`` -> ``encode_text``.
    """

    def __init__(self):
        self.tree_root = None     # root of the code tree, set by build_tree
        self.frequency_dict = {}  # character -> number of occurrences
        self.code_dict = {}       # character -> code as a string of '0'/'1'

    def build_frequency_dict(self, data):
        """Add the character counts of *data* to ``frequency_dict``.

        Counts accumulate across calls; nothing is reset.
        """
        counts = self.frequency_dict
        for c in data:
            counts[c] = counts.get(c, 0) + 1

    def build_tree(self):
        """Build the code tree from ``frequency_dict`` into ``tree_root``.

        The two lowest-priority entries are merged repeatedly until one
        node is left.  With an empty ``frequency_dict`` the root is set to
        ``None``; without this guard ``PriorityQueue.get()`` would block
        forever on the empty queue.
        """
        if not self.frequency_dict:
            self.tree_root = None
            return

        pq = PriorityQueue()
        for c, freq in self.frequency_dict.items():
            # The middle element (the characters below the node) breaks
            # frequency ties, so the nodes themselves are never compared.
            pq.put((freq, c, StaticNode(c)))

        while pq.qsize() > 1:
            freq1, str1, node1 = pq.get()
            freq2, str2, node2 = pq.get()
            new_node = StaticNode()
            new_node.left = node1
            new_node.right = node2
            pq.put((freq1 + freq2, str1 + str2, new_node))

        self.tree_root = pq.get()[2]

    def code_characters(self):
        """Fill ``code_dict`` with the code of every leaf of the tree.

        Going left appends '0', going right appends '1'.  Does nothing
        when no tree has been built.  If the tree is a single leaf (only
        one distinct character) that character gets the one-bit code "0";
        an empty code would make the encoded text carry no information.
        """
        root = self.tree_root
        if root is None:
            return
        if root.character is not None:
            self.code_dict[root.character] = "0"
            return

        # Explicit stack instead of recursion, so a very deep tree cannot
        # hit the recursion limit.  Left is pushed last so it is visited
        # first, which keeps the left-to-right insertion order.
        stack = [(root, "")]
        while stack:
            node, code = stack.pop()
            if node.character is not None:
                self.code_dict[node.character] = code
            else:
                stack.append((node.right, code + '1'))
                stack.append((node.left, code + '0'))

    def encode_text(self, text):
        """Return *text* encoded as a string of '0'/'1' characters.

        Layout: the concatenated codes, then zero bits up to the next
        multiple of 8, then the number of padding bits as 8 binary digits.

        Raises:
            KeyError: if *text* contains a character missing from
                ``code_dict``.
        """
        encoded_text = "".join(self.code_dict[c] for c in text)
        padding_length = -len(encoded_text) % 8
        return encoded_text + "0" * padding_length + to_bin(padding_length)




In [4]:
# file = read_file_to_string("test2")
# static_tree = StaticHuffman()
# static_tree.build_frequency_dict(file)
# static_tree.build_tree()
# static_tree.code_characters()
#
# to_file = static_tree.encode_text(file)
# def to_int(string):
#     val = 0
#     for x in string:
#         if x == '0':
#             val *= 2
#         else:
#             val = val*2 + 1
#     return val
#
# b = bytearray()
# for i in range(0, len(to_file), 8):
#     b.append(to_int(to_file[i:i+8]))
#
#
# with open("compressed", 'wb') as f:
#     f.write(b)

In [5]:
# with open("compressed", 'rb') as f:
#     contents = f.read()
#
# output = []
#
# root = static_tree.tree_root
# cur = root
# for i, b in enumerate(contents[:-2]):
#     mask = 1 << 7
#     for _ in range(8):
#         if mask & b:
#             cur = cur.right
#         else:
#             cur = cur.left
#         if cur.character is not None:
#             output.append(cur.character)
#             cur = root
#         mask >>= 1
#
# b = contents[-2]
# ile = 0 if contents[-1] == 0 else 8 - contents[-1]
# mask = 1 << 7
# for _ in range(ile):
#     if mask & b:
#         cur = cur.right
#     else:
#         cur = cur.left
#     if cur.character is not None:
#         output.append(cur.character)
#         cur = root
#     mask >>= 1
#
# with open("testcopy.txt", 'w', encoding="UTF-8") as f:
#     f.write("".join(output))

In [6]:
# Load the input text from the file "test2"; it is the data the adaptive
# coder is run on in the cells below.
file_content = read_file_to_string("test2")

In [7]:
class Node:
    """Tree node carrying an index, a weight and a character.

    ``external`` is the flag the tree code checks to tell leaves from
    inner nodes.  ``left``, ``right`` and ``parent`` start as ``None``
    and are linked up by the tree.
    """

    def __init__(self, index, weight, character, external):
        self.index = index
        self.weight = weight
        self.character = character
        self.external = external
        # No neighbours yet; the owning tree wires these up.
        self.left = self.right = self.parent = None

    def __str__(self):
        """Return the node's index as text."""
        return f"{self.index!s}"


def get_16_bit(x):
    """Return the code point of character *x* as 16 binary digits.

    The most significant bit comes first, e.g.
    ``get_16_bit("a") == "0000000001100001"``.

    Raises:
        ValueError: if the code point of *x* is above U+FFFF.  Such a
            value does not fit in 16 bits; its upper bits used to be
            dropped silently, so ``decode_16_bit`` returned a different
            character.
    """
    code = ord(x)
    if code > 0xFFFF:
        raise ValueError(
            f"character {x!r} (U+{code:04X}) does not fit in 16 bits"
        )
    return format(code, "016b")


def decode_16_bit(code):
    """Return the character whose code point is written in *code*.

    *code* is read as binary digits, most significant first; every
    symbol other than '0' counts as a 1 bit.
    """
    value = 0
    for bit in code:
        value <<= 1
        if bit != '0':
            value |= 1
    return chr(value)


class Tree:
    """Adaptive Huffman-style code tree used for encoding and decoding.

    The tree starts as a single leaf labelled "NYT" (presumably "not yet
    transmitted").  A character seen for the first time is emitted as the
    NYT code followed by its 16-bit value and gets its own leaf; a known
    character is emitted as the code of its leaf.  After every symbol the
    weights are updated, so a decoder replaying the same symbols keeps an
    identical tree.
    """

    def __init__(self):
        start = Node(1000, 0, "NYT", True)
        self.root = start
        self.NYT = start              # current NYT leaf
        self.leaves = {None: None}    # character -> leaf (None is a placeholder)
        self.free_index = 999         # next index to hand out, counting down
        self.weight = {}
        self.nodes = [start]          # every node created so far

    def get_leaf_code(self, node):
        """Return the root-to-*node* path: '0' for left, '1' for right."""
        bits = []
        current = node
        while current != self.root:
            parent = current.parent
            bits.append("0" if current == parent.left else "1")
            current = parent
        # Bits were gathered from the node upwards.
        bits.reverse()
        return "".join(bits)

    def add_new_node(self, char):
        """Split the NYT leaf into a new NYT leaf and a leaf for *char*.

        The old NYT node becomes an inner node of weight 1 with the new
        NYT leaf on the left and the character leaf on the right; the
        update then continues from its parent.
        """
        char_leaf = Node(self.free_index, 1, char, True)
        next_nyt = Node(self.free_index - 1, 0, "NYT", True)
        self.free_index -= 2

        branch = self.NYT
        branch.weight = 1
        branch.character = ""
        branch.external = False
        branch.left, branch.right = next_nyt, char_leaf
        char_leaf.parent = next_nyt.parent = branch

        self.NYT = next_nyt
        self.leaves[char] = char_leaf
        self.nodes += [char_leaf, next_nyt]
        self.update(branch.parent)

    def _exchange(self, node, change):
        """Swap the indices and tree positions of *node* and *change*."""
        node.index, change.index = change.index, node.index
        change_parent, node_parent = change.parent, node.parent
        # Both sides are determined before any link is rewritten.
        change_on_left = change_parent.left == change
        node_on_left = node_parent.left == node

        if change_on_left:
            change_parent.left = node
        else:
            change_parent.right = node
        if node_on_left:
            node_parent.left = change
        else:
            node_parent.right = change

        node.parent, change.parent = change_parent, node_parent

    def update(self, node):
        """Increase weights from *node* up to the root.

        At each step the node is first exchanged with the highest-indexed
        candidate of equal weight, if one with a higher index exists.
        Candidates are the character leaves when the node's sibling is the
        NYT leaf, otherwise every node except the root.
        """
        while node is not None and node != self.root:
            if node.parent.left == self.NYT:
                candidates = (
                    leaf for key, leaf in self.leaves.items() if key is not None
                )
            else:
                candidates = (
                    other for other in self.nodes
                    if not (other is None or other == self.root)
                )

            leader = node
            for other in candidates:
                if other.weight == node.weight and other.index > leader.index:
                    leader = other

            if leader != node:
                self._exchange(node, leader)
            node.weight += 1
            # After an exchange this is the parent of the new position.
            node = node.parent

        if node == self.root:
            self.root.weight += 1

    def encode_text(self, text):
        """Return *text* encoded as a string of '0'/'1' characters.

        Prints the character position every 10000 characters as a
        progress indicator.
        """
        pieces = []
        for position, c in enumerate(text):
            if position % 10000 == 0:
                print(position)
            if c in self.leaves:
                known = self.leaves[c]
                pieces.append(self.get_leaf_code(known))
                self.update(known)
            else:
                # First occurrence: NYT code, then the raw 16-bit value.
                pieces.append(self.get_leaf_code(self.NYT))
                pieces.append(get_16_bit(c))
                self.add_new_node(c)
        return "".join(pieces)

    def decode_text(self, text):
        """Return the text encoded in the '0'/'1' string *text*.

        Must start from a tree in the same state the encoder started
        from.  Prints the bit position whenever a symbol starts at a
        multiple of 100000.
        """
        symbols = []
        pos = 0
        total = len(text)
        while pos < total:
            if pos % 100000 == 0:
                print(pos)
            # Walk down from the root until a leaf is reached.
            cursor = self.root
            while not cursor.external:
                cursor = cursor.left if text[pos] == '0' else cursor.right
                pos += 1

            if cursor == self.NYT:
                fresh = decode_16_bit(text[pos:pos + 16])
                symbols.append(fresh)
                self.add_new_node(fresh)
                pos += 16
            else:
                symbols.append(cursor.character)
                self.update(cursor)

        return "".join(symbols)

In [8]:
# Round-trip check: encode the input with one tree, decode the result with
# a second, freshly created tree, and print whether the text came back
# unchanged.
tree = Tree()
output = tree.encode_text(file_content)
# The decoder gets its own Tree, so it starts from the same initial state
# the encoder started from.
tree2 = Tree()
output2 = tree2.decode_text(output)
print(output2==file_content)

0
0
True


In [9]:

# The string returned by tree.encode_text above; it is packed into bytes
# and written to disk below.
to_file = output
def to_int(string):
    """Return the integer written in *string* as binary digits.

    The most significant digit comes first; every character other than
    '0' counts as a 1 bit.  An empty string gives 0.
    """
    val = 0
    for digit in string:
        val = (val << 1) | int(digit != '0')
    return val

# Pack the bit string into bytes and write it to the file "compressed".
#
# A final chunk shorter than 8 bits used to be converted as-is, which
# right-aligned its bits inside the last byte, and the true bit count was
# not stored anywhere, so the file could not be decoded unambiguously.
# The stream is now padded with zero bits on the right and the number of
# padding bits is appended as one extra byte -- the same layout that
# StaticHuffman.encode_text produces.
padding_length = -len(to_file) % 8
padded_bits = to_file + "0" * padding_length

b = bytearray()
for i in range(0, len(padded_bits), 8):
    b.append(to_int(padded_bits[i:i+8]))
b.append(padding_length)


with open("compressed", 'wb') as f:
    f.write(b)