# LZW

This notebook gives a visual demonstration of the compression ratio achieved on text files compressed with the [Lempel-Ziv-Welch](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch) (LZW) coding technique. The [lzw](https://github.com/pytholic97/LZW-Text-File-Compression) Python package is used to compress the files.

The text files used for compression have been generated by sampling characters in the ASCII range 0 through 127 from three different probability distributions:

* Poisson
* Gaussian
* Uniform

The statistical parameters of the distributions - such as the mean and standard deviation for the Gaussian pdf - have been varied to obtain a general idea of how the performance of the LZW technique depends not only on the distributions themselves but also on their individual statistics.

In [2]:
# Record which `python3` executable the shell resolves first on PATH, as a note of the environment used.
# NOTE(review): this is the shell's python3, which may differ from the kernel's interpreter (sys.executable) - confirm.
! which python3

/home/coesip/7069/lzw/bin/python3


In [3]:
import numpy as np
import os
from sklearn.externals import joblib
import lzw
import time
from datetime import timedelta
from IPython.display import clear_output

In [40]:
# Accumulator for the compression / decompression measurements.
# Every key maps to its own (initially empty) list; the lists are filled in
# parallel, one entry per processed file.
res_dict = {
    column: []
    for column in ('pdf', 'size', 'comp_size', 'comp_time', 'decomp_time')
}

In [41]:
pdfff = 'Poisson'
# Work from the directory that holds the original (uncompressed) sample files.
os.chdir('/home/coesip/7069/files/' + pdfff + '1/original')

# Select the files to process: names look like '<number>.txt', and only those
# whose numeric prefix lies in (0, 10000000] are kept.
# presumably the prefix is the nominal file size in characters - TODO confirm.
# Directory order is kept as returned by os.listdir().
to_comp = []
for fname in os.listdir():
    try:
        name_number = int(fname.split('.')[0])
    except ValueError:
        # Entry without a numeric prefix (e.g. a hidden or stray file): skip it
        # instead of aborting the whole cell.
        continue
    if 0 < name_number <= 10000000:
        to_comp.append(fname)
# Alternative: hand-pick the files instead of filtering by the numeric prefix.
#to_comp = ['10000000.txt','36000000.txt','40000000.txt']
to_comp

['2000000.txt',
 '1000000.txt',
 '4000000.txt',
 '10000000.txt',
 '8000000.txt',
 '500000.txt',
 '6000000.txt']

In [42]:
# Compress and then decompress every file in `to_comp`, timing both steps and
# collecting one row of measurements per file in `res_dict`.
pdff = 'Poisson'
os.chdir('/home/coesip/7069/files/' + pdff + '1/original')

comp_path = '/home/coesip/7069/files/' + pdff + '1/compressed/'
decomp_path = '/home/coesip/7069/files/' + pdff + '1/decompressed/'
file_path = '/home/coesip/7069/files/' + pdff + '1/original/'

from lzw.Compress import compress as cmp
from lzw.Decompress import decompress as dec

for file in to_comp:
    # Derive every path once per file. The output names below are the ones this
    # cell expects the lzw package to produce inside comp_path / decomp_path.
    stem = file.split('.')[0]
    original_file = file_path + file
    compressed_file = comp_path + stem + '_compressed.txt'
    decompressed_file = decomp_path + stem + '_compressed_decompressed.txt'
    original_size = os.path.getsize(original_file)

    clear_output()
    print('Processing '+file)
    # NOTE(review): meaning of encoding='ascii_127' and limit=50000000 is defined
    # by the lzw package - confirm against its documentation.
    c = cmp(original_file, comp_path, encoding='ascii_127', verbose=1, limit=50000000)
    start_time = time.monotonic()
    c.encode()
    comp_seconds = time.monotonic() - start_time
    clear_output()
    print(file + ' compressed. Time taken: '+ str(timedelta(seconds=comp_seconds)))
    compressed_size = os.path.getsize(compressed_file)

    d = dec(compressed_file, decomp_path, encoding='ascii_127', verbose=1, limit=50000000)
    start_time = time.monotonic()
    d.decode()
    decomp_seconds = time.monotonic() - start_time

    # Append the complete row in one go, only after both steps succeeded, so an
    # exception part-way through a file can never leave the lists in res_dict
    # with different lengths.
    res_dict['pdf'].append(pdff)
    res_dict['size'].append(original_size)
    res_dict['comp_size'].append(compressed_size)
    res_dict['comp_time'].append(comp_seconds)
    res_dict['decomp_time'].append(decomp_seconds)

    clear_output()
    print(file + ' decompressed. Time taken: '+str(timedelta(seconds=decomp_seconds)))

    # Sanity check on the round trip: stop if the decompressed file differs from
    # the original by more than 10 bytes in size. Only sizes are compared here,
    # not the contents.
    if abs(os.path.getsize(decompressed_file) - original_size) > 10:
        print("error in file " + file)
        break

6000000.txt decompressed. Time taken: 0:00:10.968440


In [44]:
# Saving results object
# Persist the collected measurements with joblib. `pdff` and `res_dict` come
# from the earlier cells. The target directory is created first if it is
# missing, so the dump does not fail on a fresh checkout.
save_dir = '/home/coesip/7069/LZW-Compression/test_objects/test1/' + pdff + '1/'
os.makedirs(save_dir, exist_ok=True)
joblib.dump(res_dict, save_dir + 'pois_all_le10MB.sav')

['/home/coesip/7069/LZW-Compression/test_objects/test1/Poisson1/pois_all_le10MB.sav']