In [657]:
import os
import struct
import numpy as np

In [813]:
class LZ78_switch:
    """LZ78 text compressor with a variable-width on-disk index.

    The encoder turns text into ``(dictionary_index, next_char)`` pairs.  On
    disk each pair is stored as a little-endian index followed by one
    character byte.  The index width depends on the record's position in the
    stream (record ``n`` can only reference indices ``<= n``):

    ========================  ===========  ===========
    record number             index bytes  record size
    ========================  ===========  ===========
    0 .. 255                  1            2
    256 .. 65535              2            3
    65536 .. 16777215         3            4
    above                     4            5
    ========================  ===========  ===========

    If the input ends in the middle of an already-known phrase, the encoder
    emits a final pair with an empty character.  That pair is written as an
    index-only record (one byte shorter than a full record), so the tail of
    the input survives a round-trip.  Files that contain only full records
    decode exactly as before.

    Only characters whose UTF-8 encoding is a single byte can be stored,
    because the character slot is exactly one byte wide.
    """

    def readfile(self, filename):
        """Read a text file, retrying as UTF-16 when the default codec fails.

        Args:
            filename: Path of the text file to read.

        Returns:
            A list of lines when the file decodes with the default encoding,
            or a single string (vertical tabs removed) when the UTF-16
            fallback is used.  Both forms are accepted by ``''.join(...)``.
        """
        try:
            with open(filename, 'r') as f:
                return f.readlines()
        except UnicodeDecodeError:
            pieces = []
            # Stream the UTF-16 text in chunks.
            with open(filename, "rt", encoding="utf-16") as stream:
                while True:
                    chunk = stream.read(4096)
                    if not chunk:
                        break
                    # Drop vertical tabs before keeping the chunk.
                    pieces.append(chunk.replace("\u000B", ""))
            return ''.join(pieces)

    @staticmethod
    def _index_width(seq):
        """Return the number of bytes used for the index of record ``seq``."""
        if seq <= 255:
            return 1
        if seq <= 65535:
            return 2
        if seq <= 16777215:
            return 3
        return 4

    def compress(self, origin_file, compressed_file):
        """Compress ``origin_file`` into ``compressed_file``.

        Args:
            origin_file: Path of the text file to compress.
            compressed_file: Path of the binary file to create/overwrite.

        Raises:
            struct.error: If a character does not encode to exactly one byte.
            OverflowError: If an index does not fit the width of its record.
        """
        import struct

        data = ''.join(self.readfile(origin_file))
        encoded_data = self.encoding(data)

        with open(compressed_file, 'wb') as binfile:
            for seq, (idx, ch) in enumerate(encoded_data):
                # Explicit little-endian keeps the file identical across hosts.
                index_bytes = idx.to_bytes(self._index_width(seq), 'little')
                if len(ch) == 0:
                    # Trailing phrase without a following character: store the
                    # index alone so the end of the input is not lost.
                    binfile.write(index_bytes)
                    break
                binfile.write(index_bytes + struct.pack('c', ch.encode()))

    def decompress(self, compressed_file, decompressed_file):
        """Restore the text stored in ``compressed_file``.

        Args:
            compressed_file: Path of a file produced by :meth:`compress`.
            decompressed_file: Path of the text file to create/overwrite.

        Raises:
            ValueError: If the file ends with a record that is neither a full
                record nor a valid index-only trailing record.
        """
        encoded_data = []
        with open(compressed_file, 'rb') as binfile:
            seq = 0
            while True:
                width = self._index_width(seq)
                binary = binfile.read(width + 1)
                if binary == b'':
                    break

                idx = int.from_bytes(binary[:width], 'little')
                if len(binary) == width + 1:
                    encoded_data.append((idx, binary[width:].decode()))
                elif len(binary) == width:
                    # Index-only record: the final phrase of the input.
                    encoded_data.append((idx, ''))
                    break
                else:
                    raise ValueError(
                        'truncated record %d in %r' % (seq, compressed_file))
                seq += 1

        decoded_data = self.decoding(encoded_data)

        with open(decompressed_file, 'w') as f:
            f.write(decoded_data)

    def encoding(self, data):
        """Encode ``data`` into LZ78 ``(index, char)`` pairs.

        Args:
            data: The text to encode.

        Returns:
            A list of ``(index, char)`` tuples.  ``index`` is the 1-based
            number of the longest known phrase preceding ``char`` (0 for
            none).  When the text ends inside a known phrase, the last tuple
            is ``(index_of_that_phrase, '')``.
        """
        phrases = {}
        out = []
        key = ''
        prefix = 0  # dictionary index of ``key`` (0 while ``key`` is empty)

        for c in data:
            candidate = key + c
            if candidate in phrases:
                key = candidate
                prefix = phrases[candidate]
            else:
                out.append((prefix, c))
                phrases[candidate] = len(phrases) + 1
                key = ''
                prefix = 0

        if key != '':
            out.append((prefix, ''))

        return out

    def decoding(self, data):
        """Rebuild the text from LZ78 ``(index, char)`` pairs.

        Args:
            data: Iterable of ``(index, char)`` tuples as produced by
                :meth:`encoding`.

        Returns:
            The decoded string.
        """
        phrases = []
        for (w, c) in data:
            # Index 0 means "no prefix"; otherwise indices are 1-based.
            phrases.append(c if w == 0 else phrases[w - 1] + c)

        return ''.join(phrases)

In [817]:
class LZW:
    """LZW text compressor using zero-terminated, variable-width codes.

    The dictionary is seeded with the 256 single characters ``chr(0)`` ..
    ``chr(255)``.  On disk every code is written little-endian in the fewest
    bytes that hold it (1 to 4), followed by a ``0x00`` terminator byte.

    NOTE(review): the terminator scheme is ambiguous for codes >= 65536 whose
    middle byte(s) are zero, because the reader treats any zero byte that
    follows at least one pending byte as a terminator.  Inputs that generate
    such codes will not round-trip; fixing this requires a new file format.
    """

    def readfile(self, filename):
        """Read a text file, retrying as UTF-16 when the default codec fails.

        Args:
            filename: Path of the text file to read.

        Returns:
            A list of lines when the file decodes with the default encoding,
            or a single string (vertical tabs removed) when the UTF-16
            fallback is used.  Both forms are accepted by ``''.join(...)``.
        """
        try:
            with open(filename, 'r') as f:
                return f.readlines()
        except UnicodeDecodeError:
            pieces = []
            # Stream the UTF-16 text in chunks.
            with open(filename, "rt", encoding="utf-16") as stream:
                while True:
                    chunk = stream.read(4096)
                    if not chunk:
                        break
                    # Drop vertical tabs before keeping the chunk.
                    pieces.append(chunk.replace("\u000B", ""))
            return ''.join(pieces)

    def compress(self, origin_file, compressed_file):
        """Compress ``origin_file`` into ``compressed_file``.

        Args:
            origin_file: Path of the text file to compress.
            compressed_file: Path of the binary file to create/overwrite.

        Raises:
            KeyError: If the text contains a character outside
                ``chr(0)`` .. ``chr(255)`` (raised by :meth:`encoding`).
        """
        data = ''.join(self.readfile(origin_file))
        encoded_data = self.encoding(data)

        with open(compressed_file, 'wb') as binfile:
            for code in encoded_data:
                if code <= 255:
                    width = 1
                elif code <= 65535:
                    width = 2
                elif code <= 16777215:
                    width = 3
                else:
                    width = 4
                # Little-endian value bytes followed by the 0x00 terminator.
                binfile.write(code.to_bytes(width, 'little') + b'\x00')

    def decompress(self, compressed_file, decompressed_file):
        """Restore the text stored in ``compressed_file``.

        Args:
            compressed_file: Path of a file produced by :meth:`compress`.
            decompressed_file: Path of the text file to create/overwrite.

        Raises:
            ValueError: If more than four value bytes precede a terminator.
        """
        encoded_data = []
        pending = b''

        with open(compressed_file, 'rb') as binfile:
            while True:
                binary = binfile.read(1)
                if len(binary) == 0:
                    break
                # A zero byte ends the current code unless nothing is pending,
                # in which case it is the low byte of a code such as 0 or 256.
                if binary == b'\x00' and len(pending) > 0:
                    if len(pending) > 4:
                        raise ValueError(
                            'code wider than 4 bytes in %r' % (compressed_file,))
                    encoded_data.append(int.from_bytes(pending, 'little'))
                    pending = b''
                else:
                    pending += binary

        decoded_data = self.decoding(encoded_data)

        with open(decompressed_file, 'w') as f:
            f.write(decoded_data)

    def encoding(self, data):
        """Encode ``data`` into a list of LZW integer codes.

        Args:
            data: The text to encode; every character must be within
                ``chr(0)`` .. ``chr(255)``.

        Returns:
            A list of integer codes (empty for empty input).

        Raises:
            KeyError: If a character outside the seeded range is encountered.
        """
        dictionary = {chr(i): i for i in range(256)}
        result = []
        wc = ''

        for c in data:
            wc += c
            if wc not in dictionary:
                # Emit the longest known prefix, learn the new phrase, and
                # restart matching from the current character.
                result.append(dictionary[wc[:-1]])
                dictionary[wc] = len(dictionary)
                wc = c

        if wc != '':
            result.append(dictionary[wc])

        return result

    def decoding(self, data):
        """Decode a sequence of LZW integer codes back into text.

        The input sequence is not modified.

        Args:
            data: Sequence of integer codes as produced by :meth:`encoding`.

        Returns:
            The decoded string (``''`` for an empty sequence).

        Raises:
            KeyError: If the first code is not one of the 256 seed codes.
        """
        if not data:
            return ''

        dictionary = {i: chr(i) for i in range(256)}
        codes = iter(data)
        w = dictionary[next(codes)]
        pieces = [w]

        for k in codes:
            # A code not yet in the dictionary is the phrase being defined
            # right now: the previous phrase plus its own first character.
            entry = dictionary[k] if k in dictionary else w + w[0]
            pieces.append(entry)
            dictionary[len(dictionary)] = w + entry[0]
            w = entry

        return ''.join(pieces)

In [None]:
# Round-trip demo: compress infile.txt with LZW, then restore it to restore.txt.
lzw = LZW()
lzw.compress('infile.txt', 'compress.lz')
lzw.decompress('compress.lz', 'restore.txt')

In [820]:
# Sample text used to inspect the in-memory LZW code list.
text = 'By default, the result of packing a given C struct includes pad bytes in order to maintain proper alignment for the C types involved; similarly, alignment is taken into account when unpacking. This behavior is chosen so that the bytes of a packed struct correspond exactly to the layout in memory of the corresponding C struct. To handle platform-independent data formats or omit implicit pad bytes, use standard size and alignment instead of native size and alignment: see Byte Order, Size, and Alignment for details.'
lzw = LZW()
# b: list of integer LZW codes for `text`, produced without touching the disk.
b = lzw.encoding(text)

In [823]:
# Compress infile2.txt, then load the raw compressed bytes into `a` for inspection.
lzw.compress('infile2.txt', 'compress_lzw.lz')
# The context manager closes the file even if the read fails.
with open('./compress_lzw.lz', 'rb') as f:
    a = f.read()

In [842]:
# Parse the compressed bytes in `a` back into integer codes.
# Each code is stored little-endian and followed by a 0x00 terminator, but a
# code's own low byte may also be zero (e.g. 256 is b'\x00\x01'), so a plain
# split on b'\x00' would corrupt such codes.  A zero byte is therefore only a
# terminator when at least one value byte is already pending.
bintype = {1: 'B', 2: 'H', 3: 'I', 4: 'I'}
c = []
_pending = b''
for _byte in a:
    if _byte == 0 and _pending:
        _fmt = '<' + bintype[len(_pending)]
        # Pad 3-byte values up to the 4 bytes that format 'I' requires.
        c.append(struct.unpack(_fmt, _pending.ljust(struct.calcsize(_fmt), b'\x00'))[0])
        _pending = b''
    else:
        _pending += bytes([_byte])

In [843]:
import pandas as pd

In [845]:
# Number of codes produced in memory (b) versus parsed back from the compressed file (c).
len(b), len(c)

(325, 325)