# Compression performance

In [1]:
# Render each character of the sample as a zero-padded binary number and
# print the pieces back to back as a single bit string.
print(''.join('{0:08b}'.format(ord(ch)) for ch in "Hello World"))

0100100001100101011011000110110001101111001000000101011101101111011100100110110001100100


In [2]:
# Same binary rendering as a zero-padded number per character, but with a
# space between the groups so each letter's bits can be read separately.
print(' '.join('{0:08b}'.format(ord(ch)) for ch in "ACTG"))

01000001 01000011 01010100 01000111


In [3]:
from  urllib import request
import zlib
from random import randint

# Download the text file (requires network access).
url = 'https://github.com/lmassaron/datasets/releases/'
url += 'download/1.0/1661-0.txt'
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with request.urlopen(url) as response:
    # Drop the first 932 characters -- presumably the file's preamble;
    # TODO confirm against the downloaded file.
    sh = response.read().decode('utf-8')[932:]
sh_length = len(sh)
# Random text of exactly the same length, one character per position,
# drawn from code points 0..126 (randint is inclusive on both ends).
rnd = ''.join([chr(randint(0,126)) for k in
               range(sh_length)])

def zipped(text):
    """Return the size, in bytes, of ``text`` after zlib compression.

    The string is encoded as UTF-8 and compressed with ``zlib.compress``
    using its default settings; only the length of the compressed
    payload is returned, not the payload itself.

    Args:
        text: the string to measure.

    Returns:
        int: number of bytes in the zlib-compressed UTF-8 encoding.
    """
    # The codec name is spelled exactly "utf-8"; the previous literal
    # carried a stray apostrophe and only worked because codec-name
    # normalisation happens to discard trailing punctuation.
    return len(zlib.compress(text.encode("utf-8")))

# Report the shared uncompressed length, then the compressed size of the
# downloaded text and of the random text.
print("Original size for both texts: %s characters" % (sh_length,))
print("The Adventures of Sherlock Holmes to %s" % (zipped(sh),))
print("Random file to %s " % (zipped(rnd),))

Original size for both texts: 592905 characters
The Adventures of Sherlock Holmes to 227478
Random file to 519653 


# LZW

In [4]:
def lzw_compress(text, verbose=True):
    """Compress a string into a list of integer codes with LZW.

    The dictionary starts with the 256 one-character strings
    ``chr(0)`` .. ``chr(255)`` mapped to the codes 0..255. While scanning
    the text, the longest sequence already in the dictionary is emitted
    as its code and the sequence extended by the next character is
    registered under the next free code.

    Args:
        text: the string to compress. Characters with a code point of
            256 or above are not in the initial dictionary and cause a
            ``KeyError``.
        verbose: when True (the default) print a step-by-step trace of
            the emitted codes and the new dictionary entries.

    Returns:
        list of int: the emitted codes; an empty list for empty input.

    Raises:
        KeyError: if ``text`` contains a character outside ``chr(0)`` ..
            ``chr(255)``.
    """
    # Nothing to encode; also avoids indexing text[0] on an empty string.
    if not text:
        return []
    dictionary = {chr(k): k for k in range(256)}
    encoded = list()
    s = text[0]
    for c in text[1:]:
        if s+c in dictionary:
            # Keep growing the current sequence while it is still known.
            s = s+c
        else:
            if verbose:
                print ('> %s' %s)
            encoded.append(dictionary[s])
            if verbose:
                print ('found: %s compressed as %s' %
                       (s,dictionary[s]))
            # Codes are handed out contiguously from 0, so the next free
            # code equals the dictionary size -- no need to rescan every
            # value with max() on each insertion.
            dictionary[s+c] = len(dictionary)
            if verbose:
                print ('New sequence %s indexed as %s' %
                       (s+c, dictionary[s+c]))
            # Restart matching from the character that broke the sequence.
            s = c
    # Flush the sequence still pending when the input ends.
    encoded.append(dictionary[s])
    if verbose:
        print ('found: %s compressed as %s'
               %(s,dictionary[s]))
    return encoded

In [5]:
# Short sample with repeated runs so that the dictionary entries created
# early on are reused later in the trace.
text = "ABABCABCABC"
compressed = lzw_compress(text)
print('\nCompressed: %s \n' % (compressed,))

> A
found: A compressed as 65
New sequence AB indexed as 256
> B
found: B compressed as 66
New sequence BA indexed as 257
> AB
found: AB compressed as 256
New sequence ABC indexed as 258
> C
found: C compressed as 67
New sequence CA indexed as 259
> ABC
found: ABC compressed as 258
New sequence ABCA indexed as 260
found: ABC compressed as 258

Compressed: [65, 66, 256, 67, 258, 258] 



In [6]:
def lzw_decompress(encoded, verbose=True):
    """Rebuild the original string from a list of LZW codes.

    The dictionary starts with the codes 0..255 mapped to ``chr(0)`` ..
    ``chr(255)`` and is extended by one entry per processed code, each
    new entry taking the next free index.

    Args:
        encoded: sequence of integer codes.
        verbose: when True (the default) print a step-by-step trace of
            the decoded pieces and the new dictionary entries.

    Returns:
        str: the decompressed text; an empty string for empty input.

    Raises:
        KeyError: if the first code is not in the initial dictionary, or
            if a previously processed code cannot be looked up.
    """
    # Nothing to decode; also avoids indexing encoded[0] on empty input.
    if len(encoded) == 0:
        return ''
    reverse_dictionary = {k:chr(k) for k in range(256)}
    current = encoded[0]
    output = reverse_dictionary[current]
    if verbose:
        print ('Decompressed %s ' % output)
        print ('>%s' % output)
    for element in encoded[1:]:
        previous = current
        current = element
        # Keys are contiguous from 0, so the next free index equals the
        # dictionary size -- no need to scan the keys with max().
        new_index = len(reverse_dictionary)
        previous_text = reverse_dictionary[previous]
        if current in reverse_dictionary:
            s = reverse_dictionary[current]
            if verbose:
                print ('Decompressed %s ' % s)
            output += s
            if verbose:
                print ('>%s' % output)
            # New entry: previous sequence plus the first character of
            # the sequence just decoded.
            new_entry = previous_text + s[0]
            reverse_dictionary[new_index] = new_entry
            if verbose:
                print ('New dictionary entry %s at index %s' %
                       (new_entry, new_index))
        else:
            # The code is not in the dictionary yet: the decoded text is
            # the previous sequence plus its own first character.
            # NOTE(review): any unknown code takes this path and is
            # stored at the next free index, not at `current`; codes are
            # not validated.
            s = previous_text + previous_text[0]
            if verbose:
                print ('Not found:',current,'Output:', s)
                print ('New dictionary entry %s at index %s' %
                       (s, new_index))
            reverse_dictionary[new_index] = s
            if verbose:
                print ('Decompressed %s' % s)
            output += s
            if verbose:
                print ('>%s' % output)
    return output

In [7]:
# Decode the compressed codes and show the result next to the original
# text so the two lines can be compared by eye.
print('\ndecompressed string : %s' % (lzw_decompress(compressed),))
print('original string was : %s' % (text,))

Decompressed A 
>A
Decompressed B 
>AB
New dictionary entry AB at index 256
Decompressed AB 
>ABAB
New dictionary entry BA at index 257
Decompressed C 
>ABABC
New dictionary entry ABC at index 258
Decompressed ABC 
>ABABCABC
New dictionary entry CA at index 259
Decompressed ABC 
>ABABCABCABC
New dictionary entry ABCA at index 260

decompressed string : ABABCABCABC
original string was : ABABCABCABC
