# Rozwiązanie laboratorium 2

In [165]:
from queue import PriorityQueue
from time import perf_counter
import pandas as pd
import numpy as np

Funkcje konwertujące

In [154]:
def string_to_int(string):
    """Interpret a string of bit characters as an unsigned integer.

    The leftmost character is the most significant bit. Only the character
    '0' counts as a zero bit; every other character is treated as a one.
    An empty string yields 0.
    """
    result = 0
    for symbol in string:
        bit = 0 if symbol == '0' else 1
        result = (result << 1) | bit
    return result


def int_to_string(x, no_of_bits):
    """Render the lowest *no_of_bits* bits of *x* as a '0'/'1' string.

    The most significant of those bits comes first. Bits of *x* above
    position ``no_of_bits - 1`` are ignored.
    """
    top_bit = 1 << (no_of_bits - 1)
    return "".join(
        "1" if x & (top_bit >> shift) else "0"
        for shift in range(no_of_bits)
    )

##### Zapisywanie do plików binarnych i odczytywanie z plików binarnych/tekstowych

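Na końcu każdego zakodowanego ciągu dodaję zera, aby liczba bitów była podzielna przez 8, a następnie dopisuję jeszcze jeden bajt. Ten ostatni bajt zawiera liczbę znaczących bitów w ostatnim bajcie danych (0 oznacza, że cały bajt jest znaczący).
Przy wczytywaniu z pliku bity dopełnienia oraz ten ostatni bajt są pomijane.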


In [155]:
def read_file_to_string(filename):
    """Return the whole content of *filename* decoded as UTF-8 text."""
    with open(filename, mode="r", encoding="UTF-8") as handle:
        return handle.read()


def read_binary_file_to_string(filename):
    """Read a padded binary file and return its payload as a '0'/'1' string.

    The expected layout is the one produced by ``add_last_bits``: payload
    bytes, where the final payload byte may be zero-padded on the right,
    followed by one trailer byte holding the number of meaningful bits in
    that final payload byte (0 means all eight bits are meaningful).

    Args:
        filename: Path of the binary file to read.

    Returns:
        The payload bits, most significant bit of each byte first, with the
        padding bits and the trailer byte removed. A file shorter than two
        bytes carries no payload, so an empty string is returned.
    """
    with open(filename, 'rb') as f:
        raw = f.read()

    # Only a trailer byte (empty payload) or an empty file: nothing to
    # decode. Indexing raw[-2] here would raise IndexError.
    if len(raw) < 2:
        return ""

    # A trailer of 0 means the payload length was a multiple of 8.
    used_bits = raw[-1] or 8

    chunks = [format(byte, "08b") for byte in raw[:-2]]
    chunks.append(format(raw[-2], "08b")[:used_bits])
    return "".join(chunks)


def write_string_to_binary_file(filename, text):
    """Pack a '0'/'1' string into bytes and write them to *filename*.

    The text is cut into consecutive 8-character groups and each group is
    converted with ``string_to_int`` into one byte of the output file.
    """
    packed = bytearray(
        string_to_int(text[start:start + 8])
        for start in range(0, len(text), 8)
    )

    with open(filename, 'wb') as out:
        out.write(packed)

In [156]:
def add_last_bits(text):
    """Pad a bit string to whole bytes and append a one-byte trailer.

    Zeros are appended until the length is a multiple of 8. The trailer is
    the 8-bit value of ``len(text) % 8``, i.e. how many bits of the last
    payload byte belong to the original text (0 when no padding was needed).
    """
    remainder = len(text) % 8
    zero_fill = "0" * ((8 - remainder) % 8)
    trailer = int_to_string(remainder, 8)
    return text + zero_fill + trailer

### Statyczne drzewo Huffmana

Przy kompresowaniu pliku zakodowane litery początkowo zapisuję jako zera i jedynki (znaki ASCII). Do takiej samej postaci
zamieniam też wczytywany plik. Dopiero potem znaki odpowiednio konwertuję na liczby i zapisuję do plików. Używanie stringów było po prostu najwygodniejsze.

In [157]:
class StaticNode:
    """Node of the static Huffman tree.

    A leaf carries its character; an internal node has ``character`` set to
    None and both children assigned.
    """

    def __init__(self, character=None):
        self.character = character
        self.left = None
        self.right = None


class StaticHuffman:
    """Static (two-pass) Huffman coder working on '0'/'1' strings.

    ``encode_text`` counts the characters, builds the tree and the code
    table, and returns the encoded bits. ``decode_text`` walks the tree kept
    in ``tree_root``, so decoding must use the instance that encoded.
    """

    def __init__(self):
        self.tree_root = None
        self.frequency_dict = {}
        self.code_dict = {}

    def build_frequency_dict(self, data):
        """Add the character counts of *data* to ``frequency_dict``."""
        for c in data:
            self.frequency_dict[c] = self.frequency_dict.get(c, 0) + 1

    def build_tree(self):
        """Build the Huffman tree from ``frequency_dict`` into ``tree_root``.

        With no counted characters the root is set to None.
        """
        if not self.frequency_dict:
            # PriorityQueue.get() blocks on an empty queue, so an empty
            # alphabet must never reach the merge loop.
            self.tree_root = None
            return

        pq = PriorityQueue()
        for c, freq in self.frequency_dict.items():
            # The character string is the tie-breaker for equal weights, so
            # the node objects themselves never have to be compared.
            pq.put((freq, c, StaticNode(c)))

        # Repeatedly merge the two lightest subtrees until one remains.
        while pq.qsize() > 1:
            freq1, str1, node1 = pq.get()
            freq2, str2, node2 = pq.get()
            new_node = StaticNode()
            new_node.left = node1
            new_node.right = node2
            pq.put((freq1 + freq2, str1 + str2, new_node))

        self.tree_root = pq.get()[2]

    def code_characters(self):
        """Fill ``code_dict`` with the bit code of every leaf of the tree.

        Left edges contribute '0' and right edges '1'. A tree consisting of
        a single leaf gets the one-bit code '0', because an empty code could
        not be decoded.
        """
        root = self.tree_root
        if root is None:
            return
        if root.character is not None:
            self.code_dict[root.character] = "0"
            return

        # Iterative depth-first walk; right is pushed first so that the left
        # subtree is visited first.
        pending = [(root, "")]
        while pending:
            node, code = pending.pop()
            if node.character is not None:
                self.code_dict[node.character] = code
            else:
                pending.append((node.right, code + '1'))
                pending.append((node.left, code + '0'))

    def encode_text(self, text):
        """Encode *text* and return the padded bit string.

        Character counts are added to those already stored on the instance
        before the tree and code table are rebuilt. The result is passed
        through ``add_last_bits``.
        """
        self.build_frequency_dict(text)
        self.build_tree()
        self.code_characters()
        return add_last_bits("".join(self.code_dict[c] for c in text))

    def decode_text(self, text):
        """Decode a bit string (padding already removed) with the built tree.

        Raises:
            ValueError: if *text* is non-empty but no tree has been built.
        """
        root = self.tree_root
        if root is None:
            if text:
                raise ValueError("cannot decode: no Huffman tree has been built")
            return ""

        # Single-symbol alphabet: every bit stands for one occurrence.
        if root.character is not None:
            return root.character * len(text)

        ind = 0
        length = len(text)
        decoded = []
        while ind < length:
            ptr = root
            while ptr.character is None:
                ptr = ptr.left if text[ind] == '0' else ptr.right
                ind += 1
            decoded.append(ptr.character)
        return "".join(decoded)

### Dynamiczne drzewo Huffmana

Podobnie jak w statycznym drzewie Huffmana, w samym programie używam stringów.
Moja implementacja bazuje na algorytmie FGK (Faller-Gallager-Knuth). Otrzymana struktura nie jest najszybsza.
Można by to poprawić, gdyby struktury self.node_weights i self.leaf_weights (atrybuty klasy AdaptiveHuffman) były listami posortowanych
zbiorów (np. drzew AVL), a nie zwykłych zbiorów.
Ponadto, jako że przy dekodowaniu w dynamicznym drzewie Huffmana drzewo jest tworzone na nowo i nie wiadomo, jakie litery mogą się pojawić, w momencie, gdy
po raz pierwszy napotykamy daną literę, musimy ją zapisać w kodzie "normalnie". W Pythonie jest dostępna funkcja ord(), która dla znaku zwraca jego numer (dla znaków ASCII jest to kod ASCII).
Działa ona jednak dla każdego znaku z UTF-8 (większość plików, chociażby z gutenberg.org, jest w takim formacie). Niektóre znaki nie mieszczą się jednak na jednym bajcie, lecz dopiero na dwóch. Dlatego przy kodowaniu nowo napotkanego znaku używałem zawsze 16 bitów. Dany znak jest tak zapisywany tylko raz, więc nie ma to znaczenia dla plików, które są większe niż kilka kilobajtów, lecz dla małych plików, w których jest dużo różnych znaków, współczynnik kompresji może być bardzo niski.

In [158]:
class AdaptiveNode:
    """Node of the adaptive Huffman tree.

    Stores the node's index, its weight, the character it represents and
    whether it is an external node (leaf), plus links to both children and
    to the parent, which all start as None.
    """

    def __init__(self, index, weight, character, external):
        self.index, self.weight = index, weight
        self.character, self.external = character, external
        self.left = self.right = self.parent = None


def interchange(node, change):
    """Swap the tree positions and the indices of two nodes.

    Each node takes over the other's index, parent and child slot (left or
    right) in that parent. Nothing happens when both arguments are the same
    node. Both nodes must have a parent.
    """
    if change == node:
        return

    node.index, change.index = change.index, node.index

    change_parent = change.parent
    node_parent = node.parent
    # Determine both child slots before rewiring anything: the two nodes
    # may hang off the same parent.
    change_on_left = change_parent.left == change
    node_on_left = node_parent.left == node

    if change_on_left:
        change_parent.left = node
    else:
        change_parent.right = node

    if node_on_left:
        node_parent.left = change
    else:
        node_parent.right = change

    node.parent = change_parent
    change.parent = node_parent


def update_weight(node, node_dict):
    """Move *node* from its current weight bucket to the next one.

    *node_dict* is a list of sets indexed by weight. The list grows by one
    empty set when the next bucket does not exist yet. ``node.weight``
    itself is not modified here.
    """
    current = node.weight
    target = current + 1

    node_dict[current].remove(node)
    if len(node_dict) <= target:
        node_dict.append(set())
    node_dict[target].add(node)


class AdaptiveHuffman:
    """Adaptive (one-pass) Huffman coder working on '0'/'1' strings.

    The tree starts as a single "NYT" (not yet transmitted) leaf. The first
    occurrence of a character is written as the current NYT code followed by
    the character's code point on a fixed number of bits; later occurrences
    are written as the code of the character's leaf. After every symbol the
    tree is updated, so a decoder that starts from a fresh instance rebuilds
    the same tree while reading.
    """

    # Number of bits used to store the code point of a new character.
    _CHAR_BITS = 16

    def __init__(self):
        self.root = AdaptiveNode(1000, 0, "NYT", True)
        self.NYT = self.root
        # Next index to hand out; indices decrease as nodes are created.
        self.free_index = 999
        # character -> its leaf node
        self.leaves = {}
        # Position w holds the set of nodes (resp. leaves) of weight w.
        self.node_weights = [set(), set()]
        self.leaf_weights = [set(), set()]

    def get_leaf_code(self, node):
        """Return the root-to-*node* path as bits ('0' = left, '1' = right)."""
        code = []
        while node != self.root:
            code.append("0" if node == node.parent.left else "1")
            node = node.parent
        # Bits were collected from the leaf upwards.
        return "".join(code)[::-1]

    def add_new_node(self, char):
        """Split the NYT leaf into a new NYT leaf and a leaf for *char*."""
        right_child = AdaptiveNode(self.free_index, 1, char, True)
        self.free_index -= 1
        left_child = AdaptiveNode(self.free_index, 0, "NYT", True)
        self.free_index -= 1

        # The old NYT node becomes the internal parent of both new leaves.
        internal, self.NYT = self.NYT, left_child

        internal.weight = 1
        internal.character = ""
        internal.external = False
        internal.left = left_child
        internal.right = right_child

        right_child.parent = internal
        left_child.parent = internal

        self.leaves[char] = right_child
        self.node_weights[1].add(right_child)
        # The root is never tracked in the weight buckets.
        if internal != self.root:
            self.node_weights[1].add(internal)

        self.leaf_weights[1].add(right_child)
        self.update(internal)

    def update(self, node):
        """Increment weights on the path from *node* up to the root.

        At every step the node is first swapped with the highest-index
        member of its weight bucket (only leaves are considered when the
        node's parent has the NYT leaf as its left child), then moved to the
        next weight bucket and its weight is incremented.
        """
        while node != self.root:
            if node.parent.left == self.NYT:
                change = max(self.leaf_weights[node.weight], key=lambda item: item.index)
                interchange(node, change)
            else:
                change = max(self.node_weights[node.weight], key=lambda item: item.index)
                interchange(node, change)

            if node.external:
                update_weight(node, self.leaf_weights)
            update_weight(node, self.node_weights)
            node.weight += 1
            node = node.parent

        self.root.weight += 1

    def encode_text(self, text):
        """Encode *text* and return the padded bit string.

        Raises:
            ValueError: if a character's code point does not fit in the
                fixed-width field used for new characters. Writing it would
                silently drop its high bits and decode to another character.
        """
        encoded = []
        for c in text:
            if c not in self.leaves:
                code_point = ord(c)
                if code_point >> self._CHAR_BITS:
                    raise ValueError(
                        f"character {c!r} (code point {code_point}) does not "
                        f"fit in {self._CHAR_BITS} bits"
                    )
                encoded.append(self.get_leaf_code(self.NYT))
                encoded.append(int_to_string(code_point, self._CHAR_BITS))
                self.add_new_node(c)
            else:
                encoded.append(self.get_leaf_code(self.leaves[c]))
                self.update(self.leaves[c])

        return add_last_bits("".join(encoded))

    def decode_text(self, text):
        """Decode a bit string (padding already removed).

        Must be called on an instance whose tree is in the state the
        encoder's tree had when encoding started.
        """
        decoded = []
        ind = 0
        while ind < len(text):
            ptr = self.root
            while not ptr.external:
                ptr = ptr.left if text[ind] == '0' else ptr.right
                ind += 1

            if ptr == self.NYT:
                # A new character follows as a fixed-width code point.
                new_char = chr(string_to_int(text[ind:ind + self._CHAR_BITS]))
                decoded.append(new_char)
                self.add_new_node(new_char)
                ind += self._CHAR_BITS
            else:
                decoded.append(ptr.character)
                self.update(ptr)
        return "".join(decoded)

#### Współczynniki kompresji i testy czasowe

In [172]:
from os import listdir
from os.path import isfile, join
from os.path import getsize

# Benchmark inputs: every regular file directly inside the three data folders.
files1 = [
    "Gutenberg_files/" + entry
    for entry in listdir("Gutenberg_files")
    if isfile(join("Gutenberg_files", entry))
]
files2 = [
    "random_files/" + entry
    for entry in listdir("random_files")
    if isfile(join("random_files", entry))
]
files3 = [
    "Linux_kernel/" + entry
    for entry in listdir("Linux_kernel")
    if isfile(join("Linux_kernel", entry))
]
files = [*files1, *files2, *files3]

# Per-file results, appended in the same order as ``files``.
static_tree_compression_time, dynamic_tree_compression_time = [], []
static_tree_decompression_time, dynamic_tree_decompression_time = [], []
static_tree_compression_ratio, dynamic_tree_compression_ratio = [], []

static_tree_correctness, dynamic_tree_correctness = [], []


In [None]:
# Benchmark both Huffman variants on every input file: time encoding and
# decoding, check the round trip and record the compression ratio.
for file in files:
    static_tree = StaticHuffman()
    # Separate encoder and decoder instances: the decoder has to rebuild the
    # tree from scratch while reading the stream.
    adaptive_encoder = AdaptiveHuffman()
    adaptive_decoder = AdaptiveHuffman()
    f_content = read_file_to_string(file)
    # Only the in-memory encode call is timed, not the file I/O.
    t = perf_counter()
    compressed = static_tree.encode_text(f_content)
    t = perf_counter() - t
    static_tree_compression_time.append(t)

    write_string_to_binary_file("compressed.bin", compressed)
    to_decode = read_binary_file_to_string("compressed.bin")
    t = perf_counter()
    # The same StaticHuffman instance decodes, so the tree itself is not
    # stored in compressed.bin.
    decompressed = static_tree.decode_text(to_decode)
    t = perf_counter() - t
    static_tree_decompression_time.append(t)
    static_tree_correctness.append(decompressed == f_content)
    # Ratio = 1 - compressed size / original size, both in bytes on disk.
    static_tree_compression_ratio.append(1 - (getsize('compressed.bin') / getsize(file)))


    t = perf_counter()
    compressed = adaptive_encoder.encode_text(f_content)
    t = perf_counter() - t
    dynamic_tree_compression_time.append(t)

    # compressed.bin is reused; the static result was already measured above.
    write_string_to_binary_file("compressed.bin", compressed)
    to_decode = read_binary_file_to_string("compressed.bin")

    t = perf_counter()
    decompressed = adaptive_decoder.decode_text(to_decode)
    t = perf_counter() - t

    dynamic_tree_decompression_time.append(t)
    dynamic_tree_correctness.append(decompressed == f_content)
    dynamic_tree_compression_ratio.append(1 - (getsize('compressed.bin') / getsize(file)))


#### Statyczne kodowanie Huffmana

In [185]:
# Summary table for the static Huffman runs, one row per input file.
data = {
    'File Name': files,
    'Compression time': np.round(np.array(static_tree_compression_time), 2),
    'Decompression time': np.round(np.array(static_tree_decompression_time), 2),
    'Compression ratio': np.round(np.array(static_tree_compression_ratio), 2),
    'Check correctness': static_tree_correctness
}
df = pd.DataFrame(data)
# ``df.style`` builds a new Styler on each access, so a caption set on a
# discarded Styler never reaches ``display(df)``. Keep the Styler and display
# it; the explicit format keeps the numbers at two decimals.
styled = df.style.set_caption("Static Huffman Tree results").format(
    "{:.2f}",
    subset=['Compression time', 'Decompression time', 'Compression ratio'],
)
display(styled)

Unnamed: 0,File Name,Compression time,Decompression time,Compression ratio,Check correctness
0,Gutenberg_files/book.txt,0.0,0.0,0.45,True
1,Gutenberg_files/mickiewicz.txt,0.02,0.02,0.43,True
2,Gutenberg_files/moby_dick.txt,0.68,0.98,0.45,True
3,Gutenberg_files/tom_sawyer.txt,0.08,0.12,0.45,True
4,random_files/file_size_1000kb.txt,0.52,1.02,0.14,True
5,random_files/file_size_100kb.txt,0.03,0.09,0.14,True
6,random_files/file_size_10kb.txt,0.0,0.01,0.14,True
7,random_files/file_size_1kb.txt,0.0,0.0,0.16,True
8,Linux_kernel/makefile.txt,0.0,0.0,0.33,True
9,Linux_kernel/merged.txt,0.32,0.97,0.36,True


#### Dynamiczne kodowanie Huffmana

In [186]:
# Summary table for the adaptive (dynamic) Huffman runs, one row per file.
data = {
    'File Name': files,
    'Compression time': np.round(np.array(dynamic_tree_compression_time), 2),
    'Decompression time': np.round(np.array(dynamic_tree_decompression_time), 2),
    'Compression ratio': np.round(np.array(dynamic_tree_compression_ratio), 2),
    # Report the adaptive coder's own round-trip check, not the static one's.
    'Check correctness': dynamic_tree_correctness
}
df = pd.DataFrame(data)
display(df)

Unnamed: 0,File Name,Compression time,Decompression time,Compression ratio,Check correctness
0,Gutenberg_files/book.txt,0.01,0.01,0.3,True
1,Gutenberg_files/mickiewicz.txt,0.14,0.18,0.42,True
2,Gutenberg_files/moby_dick.txt,13.43,11.64,0.45,True
3,Gutenberg_files/tom_sawyer.txt,1.55,2.98,0.45,True
4,random_files/file_size_1000kb.txt,13.48,12.42,0.14,True
5,random_files/file_size_100kb.txt,1.26,1.17,0.14,True
6,random_files/file_size_10kb.txt,0.14,0.13,0.12,True
7,random_files/file_size_1kb.txt,0.05,0.05,0.07,True
8,Linux_kernel/makefile.txt,0.01,0.01,0.14,True
9,Linux_kernel/merged.txt,11.42,10.37,0.36,True


### Algorytm o zmiennym bloku kompresji LZW (Lempel–Ziv–Welch)

Algorytm zakłada, że przy kompresji i dekompresji znany jest alfabet pojedynczych znaków jakie można napotkać w pliku. Jednak dla liczb powyżej 130 rzucała błędy, więc
słownik stworzyłem iterując po wszystkich plikach jakie będę kompresował.

In [187]:
def gather_characters(all_files):
    """Return the set of distinct characters found in the given files.

    Every file is read with ``read_file_to_string``.
    """
    alphabet = set()
    for path in all_files:
        alphabet.update(read_file_to_string(path))
    return alphabet

In [188]:
# Alphabet of single characters that seeds the LZW dictionaries below.
characters = [*gather_characters(files)]

Algorytm ten, w przeciwieństwie do poprzednich, nie zwraca od razu ciągu zer i jedynek, tylko listę, w której są pojedyncze znaki albo kody słów, które powstały przez konkatenację
tych znaków. W algorytmie dodałem warunek, że liczba kodów nie może przekraczać 65535, żeby wszystkie kody mogły zmieścić się na dwóch bajtach. To założenie nie wpływa znacząco na wynik, a późniejsze zapisywanie do pliku binarnego i dekompresja są znacznie łatwiejsze.

In [200]:
def read_binary_file_to_lzw_decoding(filename):
    """Read an LZW file and return its tokens as a list.

    The bit stream is a sequence of 17-bit records: one flag bit followed by
    a 16-bit value. Flag '1' marks a literal character (the value is passed
    to ``chr``); any other flag marks an integer dictionary code.
    """
    bits = read_binary_file_to_string(filename)
    tokens = []
    for start in range(0, len(bits), 17):
        value = string_to_int(bits[start + 1:start + 17])
        tokens.append(chr(value) if bits[start] == "1" else value)

    return tokens


In [201]:
def write_lzw_code_to_binary(filename, coded):
    """Serialize an LZW token list to *filename*.

    Every token becomes a 17-bit record: flag '1' plus the 16-bit code point
    for a literal character (``str``), or flag '0' plus the 16-bit value for
    an integer dictionary code. The bit stream is padded with
    ``add_last_bits`` before it is written.

    Args:
        filename: Path of the output file.
        coded: List of single characters and integer codes.

    Note:
        Values are written with ``int_to_string(..., 16)``, which keeps only
        the low 16 bits; larger code points or codes are not representable.
    """
    binary = []
    for x in coded:
        if isinstance(x, str):
            binary.append("1")
            binary.append(int_to_string(ord(x), 16))
        else:
            binary.append("0")
            binary.append(int_to_string(x, 16))
    # Write to the requested file rather than a hard-coded name.
    write_string_to_binary_file(filename, add_last_bits("".join(binary)))


In [202]:
def encoding(text):
    """LZW-encode *text* into a list of characters and integer codes.

    The dictionary starts with every character of the module-level
    ``characters`` mapped to itself. New words receive consecutive integer
    codes starting at 1, and no new words are added once the next code
    would reach 65535.
    """
    dictionary = dict(zip(characters, characters))
    next_code = 1
    output = []
    current = ""
    for symbol in text:
        candidate = current + symbol
        if candidate in dictionary:
            # Keep extending the current word while it is known.
            current = candidate
            continue
        output.append(dictionary[current])
        if next_code < 65535:
            dictionary[candidate] = next_code
            next_code += 1
        current = symbol
    if current:
        output.append(dictionary[current])
    return output


def decoding(text):
    """Decode a list of LZW tokens back into a string.

    Args:
        text: Tokens as produced by ``encoding``: single characters and
            integer dictionary codes.

    Returns:
        The decoded string; an empty token list decodes to "".
    """
    # An empty input encodes to an empty token list; there is no first token
    # to seed the decoder with.
    if not text:
        return ""

    decoded = []
    dictionary = {x: x for x in characters}
    last = 1

    word = text[0]
    decoded.append(word)
    for x in text[1:]:
        if x in dictionary:
            entry = dictionary[x]
        else:
            # The code is not in the dictionary yet: it denotes the previous
            # word extended by its own first character.
            entry = word + word[0]
        decoded.append(entry)
        # Mirror the encoder: stop adding words at the same code limit.
        if last < 65535:
            dictionary[last] = word + entry[0]
            last += 1
        word = entry
    return "".join(decoded)

In [None]:
# Per-file LZW results, appended in the same order as ``files``.
lzw_compression_time = []
lzw_decompression_time = []
lzw_compression_ratio = []
lzw_correctness = []

# Benchmark LZW on every input file: time encoding and decoding, check the
# round trip and record the compression ratio.
for file in files:
    f_content = read_file_to_string(file)
    # Only the in-memory encode call is timed, not the file I/O.
    t = perf_counter()
    compressed = encoding(f_content)
    t = perf_counter() - t
    lzw_compression_time.append(t)

    write_lzw_code_to_binary("compressed.bin", compressed)
    to_decode = read_binary_file_to_lzw_decoding("compressed.bin")
    t = perf_counter()
    decompressed = decoding(to_decode)
    t = perf_counter() - t
    lzw_decompression_time.append(t)
    lzw_correctness.append(decompressed == f_content)
    # Ratio = 1 - compressed size / original size, both in bytes on disk.
    lzw_compression_ratio.append(1 - (getsize('compressed.bin') / getsize(file)))

In [205]:
# Summary table for the LZW runs, one row per input file.
data = {'File Name': files}
for column, values in (
    ('Compression time', lzw_compression_time),
    ('Decompression time', lzw_decompression_time),
    ('Compression ratio', lzw_compression_ratio),
):
    # Measurements are shown rounded to two decimals.
    data[column] = np.round(np.array(values), 2)
data['Check correctness'] = lzw_correctness

df = pd.DataFrame(data)
display(df)

Unnamed: 0,File Name,Compression time,Decompression time,Compression ratio,Check correctness
0,Gutenberg_files/book.txt,0.0,0.0,-0.31,True
1,Gutenberg_files/mickiewicz.txt,0.01,0.01,0.19,True
2,Gutenberg_files/moby_dick.txt,1.89,0.15,0.56,True
3,Gutenberg_files/tom_sawyer.txt,0.07,0.02,0.51,True
4,random_files/file_size_1000kb.txt,0.58,0.27,-0.06,True
5,random_files/file_size_100kb.txt,0.08,0.03,-0.19,True
6,random_files/file_size_10kb.txt,0.0,0.0,-0.71,True
7,random_files/file_size_1kb.txt,0.0,0.0,-0.38,True
8,Linux_kernel/makefile.txt,0.0,0.0,-0.31,True
9,Linux_kernel/merged.txt,0.51,0.08,0.71,True


Algorytm LZW zyskuje, gdy w pliku pojawia się dużo powtarzających się wzorców. Możliwe jest, że uzyska ujemny współczynnik kompresji, gdyż zapisuje otrzymane kody na dwóch bajtach.
Widać, że jest nieskuteczny dla małych plików i plików losowych, za to bardzo wysoki współczynnik kompresji uzyskały pliki z jądra Linuxa.

#### Porównanie z huffmanem

In [207]:
# Print, and count, the files for which LZW reached the best compression
# ratio of the three algorithms (ties count as a win for LZW).
count = 0
ratios = zip(dynamic_tree_compression_ratio, static_tree_compression_ratio, lzw_compression_ratio, files)
for dynamic_ratio, static_ratio, lzw_ratio, file_name in ratios:
    if lzw_ratio != max(dynamic_ratio, static_ratio, lzw_ratio):
        continue
    count += 1
    print(file_name)
print(count)

Gutenberg_files/moby_dick.txt
Gutenberg_files/tom_sawyer.txt
Linux_kernel/merged.txt
Linux_kernel/workqueue.txt
4


Najlepszy okazał się łącznie dla czterech plików(duże nielosowe pliki), a był gorszy dla plików poniżej 10kb i wszystkich losowych