In [11]:
import math
import os

In [12]:
def compress(data, max_dict_size):
    """Compress a byte sequence with LZW and pack the codes at a fixed bit width.

    The dictionary starts with the 256 single-byte sequences and grows by one
    entry per emitted code until it holds ``max_dict_size`` entries; after
    that it is frozen and encoding continues with the existing entries.

    Args:
        data: Bytes-like object (or any iterable of ints in ``range(256)``).
        max_dict_size: Upper bound on the number of dictionary entries.
            Values of 256 or less disable dictionary growth entirely.

    Returns:
        A ``(compressed_bytes, bit_length)`` tuple. Every code is written
        with exactly ``bit_length`` bits, most significant bit first, and the
        stream is zero-padded at the end up to the next byte boundary.
        ``bit_length`` must be passed to ``decompress`` together with the
        same ``max_dict_size``. Empty input yields ``(b'', 8)``.
    """
    dictionary = {(i,): i for i in range(256)}
    next_code = 256

    symbols = list(data)
    if not symbols:
        # Nothing to encode; 8 bits is the width of the bare 256-symbol alphabet.
        return b'', 8

    compressed_data = []
    current_sequence = (symbols[0],)
    for symbol in symbols[1:]:
        combined_sequence = (*current_sequence, symbol)
        if combined_sequence in dictionary:
            # Keep extending the match while the dictionary knows it.
            current_sequence = combined_sequence
            continue

        compressed_data.append(dictionary[current_sequence])
        # Learn the new sequence only while the dictionary has room.
        if next_code < max_dict_size:
            dictionary[combined_sequence] = next_code
            next_code += 1
        current_sequence = (symbol,)

    # Flush the match that was still pending when the input ended.
    compressed_data.append(dictionary[current_sequence])

    # Bits needed for the largest assigned code (next_code - 1), computed with
    # exact integer arithmetic instead of a floating-point logarithm.
    bit_length = (next_code - 1).bit_length()

    compressed_bitstring = ''.join(f'{code:0{bit_length}b}' for code in compressed_data)
    # (-n) % 8 is 0 when the stream is already byte-aligned, so no spurious
    # all-zero byte is appended in that case.
    compressed_bitstring += '0' * (-len(compressed_bitstring) % 8)

    compressed_bytes = bytes(
        int(compressed_bitstring[i:i + 8], 2)
        for i in range(0, len(compressed_bitstring), 8)
    )
    return compressed_bytes, bit_length

In [13]:
def decompress(compressed_bytes, bit_length, max_dict_size):
    """Decode a fixed-width LZW code stream back into the original bytes.

    Args:
        compressed_bytes: Packed code stream; each code occupies exactly
            ``bit_length`` bits, most significant bit first.
        bit_length: Width of every code in bits (as returned by the encoder).
        max_dict_size: Upper bound on the number of dictionary entries; must
            match the value used while compressing so both sides stop
            growing the dictionary at the same point.

    Returns:
        A ``bytearray`` with the decoded data (empty when the stream holds
        no complete code).

    Raises:
        ValueError: If ``bit_length`` is not a positive integer.
    """
    if bit_length < 1:
        raise ValueError('bit_length must be a positive integer')

    decompressed_data = bytearray()
    decompressed_bitstring = ''.join(f'{byte:08b}' for byte in compressed_bytes)

    # Only whole codes count: a trailing fragment shorter than bit_length is
    # byte-alignment padding and must not be decoded as an extra code.
    usable_bits = len(decompressed_bitstring) - len(decompressed_bitstring) % bit_length
    codes = [
        int(decompressed_bitstring[i:i + bit_length], 2)
        for i in range(0, usable_bits, bit_length)
    ]
    if not codes:
        return decompressed_data

    dictionary = {i: bytes([i]) for i in range(256)}
    next_code = 256

    previous_entry = dictionary[codes[0]]
    decompressed_data += previous_entry

    for code in codes[1:]:
        if code in dictionary:
            entry = dictionary[code]
        else:
            # The code refers to the entry that is about to be created: it is
            # the previous entry followed by that entry's own first byte.
            entry = previous_entry + previous_entry[:1]

        decompressed_data += entry

        # Mirror the encoder: add one entry per code until the cap is reached.
        if next_code < max_dict_size:
            dictionary[next_code] = previous_entry + entry[:1]
            next_code += 1
        previous_entry = entry

    return decompressed_data

In [None]:
# Round-trip every sample file through compress/decompress for each dictionary
# size cap, writing the packed stream to output/ and the decoded copy to proof/.
files = ['wiki_sample.txt', 'norm_wiki_sample.txt', 'lena.bmp']
# Variant label -> cap on the number of dictionary entries (loop-invariant).
max_dict_sizes = {'12': 2**12, '18': 2**18, 'max': 2**128}
size_info = {}

# Create the target directories up front so the writes below do not fail
# when the notebook is run in a fresh working directory.
for directory in ('output', 'proof'):
    os.makedirs(directory, exist_ok=True)

for file in files:
    print('Processing file:', file)
    # Split on the last dot only, so stems that contain dots still unpack.
    filename, ext = file.rsplit('.', 1)

    # Read the input once per file instead of once per variant.
    with open(file, 'rb') as f:
        original_data = f.read()

    for variant, dict_size in max_dict_sizes.items():
        print(f'Variant with dictionary size 2^{variant}')
        compressed_path = f'output/{filename}_{variant}.bin'

        compressed_data, bit_length = compress(original_data, dict_size)
        print(f'Dictionary size 2^{bit_length}')

        with open(compressed_path, 'wb') as f:
            f.write(compressed_data)

        # Decode from what actually landed on disk, not the in-memory buffer.
        with open(compressed_path, 'rb') as f:
            data = f.read()

        decompressed_data = decompress(data, bit_length, dict_size)
        with open(f'proof/{filename}_{variant}.{ext}', 'wb') as f:
            f.write(decompressed_data)

        file_size = os.stat(compressed_path).st_size
        size_info[compressed_path] = file_size

for key, val in size_info.items():
    print(f"Size of {key} is {val}")

Processing file: wiki_sample.txt
Variant with dictionary size 2^12
Dictionary size 2^12
Variant with dictionary size 2^18
Dictionary size 2^18
Variant with dictionary size 2^max
Dictionary size 2^21
Processing file: norm_wiki_sample.txt
Variant with dictionary size 2^12
Dictionary size 2^12
Variant with dictionary size 2^18
Dictionary size 2^18
Variant with dictionary size 2^max
Dictionary size 2^21
Processing file: lena.bmp
Variant with dictionary size 2^12
Dictionary size 2^12
Variant with dictionary size 2^18
Dictionary size 2^18
