In [2]:
import sys

In [92]:
class LZW:

    def readfile(self, filename):
        try:
            f = open(filename, 'r')
            data = f.readlines()
            f.close()
        except UnicodeDecodeError:
            # 입력 스트림과 출력 스트림을 연다
            input = open(filename, "rt", encoding="utf-16")
            data = ''

            # 유니코드 데이터 조각들을 스트리밍한다
            with input:
                while True:
                    # 데이터 조각을 읽고
                    chunk = input.read(4096)
                    if not chunk:
                        break
                    # 수직 탭을 삭제한다
                    chunk = chunk.replace("\u000B", "")
                    # 데이터 조각을 쓴다
                    data += chunk

        return data

    def compress(self, origin_file, compressed_file):
        import struct 

        data = ''.join(self.readfile(origin_file))
        encoded_data = self.encoding(data)

        binfile = open(compressed_file, 'wb')
        for ch in encoded_data:
            binfile.write(struct.pack('I', ch))
        binfile.close()

    def decompress(self, compressed_file, decompressed_file):
        import struct 
        binfile = open(compressed_file, 'rb')
        encoded_data = []

        while True:
            binary = binfile.read(4)
            if len(binary) == 0: break
            encoded_data.append(struct.unpack('I', binary)[0])
        
        decoded_data = self.decoding(encoded_data)

        f = open(decompressed_file, 'w')
        f.write(decoded_data)
        f.close()

    def encoding(self, data):
        dictionary = {chr(i): i for i in range(256)}
        result = []
        wc = ''

        for c in data:
            wc += c
            if wc not in dictionary:
                result.append(dictionary[wc[:-1]])
                dictionary[wc] = len(dictionary)
                wc = c

        if wc != '': result.append(dictionary[wc])

        return result


    def decoding(self, data):
        dictionary = {i: chr(i) for i in range(256)}
        result = w = dictionary[data.pop(0)]

        for k in data:
            entry = dictionary[k] if k in dictionary else w + w[0]
            result += entry    
            dictionary[len(dictionary)] = w + entry[0]
            w = entry

        return result

In [93]:
lzw = LZW()
lzw.compress('infile.txt', 'compress_lzw.lz')
lzw.decompress('compress_lzw.lz', 'restore_lzw.txt')

In [94]:
import os
print('File name: infile.txt, \nOrigin file size: %sByte, \nCompressed size: %sByte, \nCompression ratio: %f%%' % 
('{:,}'.format(os.path.getsize('./infile.txt')), '{:,}'.format(os.path.getsize('./compress_lzw.lz')), 
 os.path.getsize('./compress_lzw.lz')/os.path.getsize('./infile.txt')*100))

File name: infile.txt, 
Origin file size: 1,555,051Byte, 
Compressed size: 1,201,920Byte, 
Compression ratio: 77.291356%


In [41]:
data = ['ABBCBCABABCAABCAAB', 'BABAABRRRA', 'AAAAAAAAA']

for origin_text in data:
    lzw = LZW()
    encoded_text = lzw.encoding(origin_text)
    decoded_text = lzw.decoding(encoded_text)
    print(origin_text, encoded_text, decoded_text, origin_text == decoded_text)
    print('--------')

ABBCBCABABCAABCAAB [66, 66, 67, 258, 256, 256, 67, 65, 262, 264, 66] ABBCBCABABCAABCAAB True
--------
BABAABRRRA [65, 256, 257, 82, 260, 65] BABAABRRRA True
--------
AAAAAAAAA [256, 257, 257] AAAAAAAAA True
--------


In [87]:
lzw = LZW()
f = open('./infile.txt', 'r')
origin_text = f.read()
f.close()

encoded_text = lzw.encoding(origin_text)
decoded_text = lzw.decoding(encoded_text)
print(origin_text == decoded_text)

True


In [89]:
len(encoded_text)

300479

In [51]:
for i in range(2, 4):
    lz78 = LZW()
    lz78.compress('infile%d.txt' % i, 'compress%d.lz' % i)
    lz78.decompress('compress%d.lz' % i, 'restore%d.txt' % i)
    print('Test %dbyte' % i)
    print('File name: infile%d.txt, \nOrigin file size: %sByte, \nCompressed size: %sByte, \nCompression ratio: %f%%\n' % 
          (i, '{:,}'.format(os.path.getsize('./infile%d.txt' % i)), '{:,}'.format(os.path.getsize('./compress%d.lz' % i)), 
           os.path.getsize('./compress%d.lz' % i)/os.path.getsize('./infile%d.txt' % i)*100))

Test 2byte
File name: infile2.txt, 
Origin file size: 518Byte, 
Compressed size: 811Byte, 
Compression ratio: 156.563707%

Test 3byte
File name: infile3.txt, 
Origin file size: 2,452Byte, 
Compressed size: 3,065Byte, 
Compression ratio: 125.000000%



In [86]:
struct.unpack('H', b'\x00\x04')

(1024,)