In [1]:
# Release filename
filename = "bible2.txt"

# Test File
filename = "test.txt"

In [2]:
def LZW_modified_compress(filename):
    """Compress data from a file byte by byte using the LZW compression algorithm.

    Args:
        filename (str): The path to the input file to be compressed.

    Returns:
        None: The compressed data is written to a new file with a ".lzw2" extension.

    The LZW compression algorithm reads data from the specified file byte by byte, building a
    dictionary of byte sequences encountered. When compressing, it replaces repeated sequences
    of bytes with references to the dictionary entries, thereby reducing the size of the data.

    The compression process stops adding new entries to the dictionary if the size of the
    dictionary reaches the specified limit of 2 ** 16 entries.

    The compressed data is written to a new file with the same name as the input file but with
    a ".lzw2" extension.

    Example:
        LZW_modified_compress("input_file.txt")

    """
    # Build the dictionary.
    dict_size_limit = 2 ** 16
    dict_size = 256
    dictionary = {bytes([i]): i for i in range(dict_size)}

    # Initialize bit length and maximum code size
    bit_length = 9
    max_code_size = 2 ** bit_length

    w = b""
    result = []

    with open(filename, 'rb') as file:
        while byte := file.read(1):
            wc = w + byte
            if wc in dictionary:
                w = wc
            else:
                result.append(dictionary[w])
                # Add wc to the dictionary if size limit is not reached.
                if dict_size < dict_size_limit:
                    dictionary[wc] = dict_size
                    dict_size += 1
                    # Increase bit length if necessary
                    if dict_size >= max_code_size:
                        bit_length += 1
                        max_code_size *= 2
                w = byte

    # Output the code for w.
    if w:
        result.append(dictionary[w])

    # Convert the result list to bytes
    compressed_data = bytearray()
    bit_buffer = 0
    bit_buffer_size = 0

    for code in result:
        bit_buffer |= code << bit_buffer_size
        bit_buffer_size += bit_length

        while bit_buffer_size >= 8:
            compressed_data.append(bit_buffer & 0xFF)
            bit_buffer >>= 8
            bit_buffer_size -= 8

    if bit_buffer_size > 0:
        compressed_data.append(bit_buffer & ((1 << bit_buffer_size) - 1))

    # Write compressed data to a new file
    with open(filename + ".lzw2", "wb") as compressed_file:
        compressed_file.write(compressed_data)

LZW_modified_compress(filename)

In [3]:
def LZW_modified_expand(filename):
    """Decompress data from a file compressed with LZW_modified_compress function.

    Args:
        filename (str): The path to the compressed file with a ".lzw2" extension.

    Returns:
        None: The decompressed data is written to a new file with the original filename.

    Example:
        LZW_modified_expand("input_file.txt.lzw2")

    """
    # Build the dictionary.
    dict_size_limit = 2 ** 16
    dict_size = 256
    dictionary = {i: bytes([i]) for i in range(dict_size)}

    # Initialize bit length and maximum code size
    bit_length = 9
    max_code_size = 2 ** bit_length

    # Read compressed data from the file
    with open(filename, 'rb') as file:
        compressed_data = file.read()

    # Initialize variables
    result = bytearray()
    bit_buffer = 0
    bit_buffer_size = 0
    current_code = None
    next_code = 256

    for byte in compressed_data:
        bit_buffer |= byte << bit_buffer_size
        bit_buffer_size += 8

        while bit_buffer_size >= bit_length:
            code = bit_buffer & ((1 << bit_length) - 1)
            bit_buffer >>= bit_length
            bit_buffer_size -= bit_length

            if current_code is None:
                current_code = code
                result.extend(dictionary[code])
            elif code == dict_size:
                entry = dictionary[current_code] + bytes([dictionary[current_code][0]])
                result.extend(entry)
                dictionary[next_code] = entry
                next_code += 1
                current_code = None
            else:
                if code in dictionary:
                    entry = dictionary[code]
                elif code == next_code:
                    entry = dictionary[current_code] + bytes([dictionary[current_code][0]])
                else:
                    raise ValueError("Invalid compressed data")

                result.extend(entry)
                dictionary[next_code] = dictionary[current_code] + bytes([entry[0]])
                next_code += 1
                current_code = code

            # Check if dictionary size exceeds limit, increase bit length if necessary
            if next_code >= max_code_size:
                bit_length += 1
                max_code_size *= 2

    # Write decompressed data to a new file
    output_filename = filename[:-5]+".2M"  # Remove the ".lzw2" extension
    with open(output_filename, "wb") as decompressed_file:
        decompressed_file.write(result)

LZW_modified_expand(filename+".lzw2")


In [4]:
# Test Cell

# Define color escape codes
class styles:
    RESET = '\033[0m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    BOLD = '\033[1m'

def compare_files(file1, file2):
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while True:
            byte1 = f1.read(1)
            byte2 = f2.read(1)
            if byte1 != byte2:
                print(styles.RED + f"* Files {file1} and {file2} are not identical." + styles.RESET)
                return False
            if not byte1:  # Reached end of both files
                break
    print(styles.GREEN + f"* Files {file1} and {file2} are identical." + styles.RESET)
    return True

import glob

# Define filenames to be tested
test_filenames = glob.glob("test/*.txt")

# Perform tests for each filename
for i, filename in enumerate(test_filenames, start=1):
    print(styles.BOLD + f"Test {i}: " + styles.RESET)
    LZW_modified_compress(filename)
    LZW_modified_expand(filename + ".lzw2")
    result = compare_files(filename, filename + ".2M")
    if result:
        print(styles.GREEN + f"* Passed - Compression and decompression successful." + styles.RESET)
    else:
        print(styles.RED + f"* FAILED - Compression and decompression unsuccessful." + styles.RESET)


[1mTest 1: [0m


ValueError: Invalid compressed data