<a href="https://colab.research.google.com/github/michaelmherrera/cs224-final-proj-compressor/blob/main/NeuralCompressorEvals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Switch to /content and clone the project repository into it.
%cd /content
!git clone https://github.com/michaelmherrera/cs224-final-proj-compressor.git

/content
Cloning into 'cs224-final-proj-compressor'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 36 (delta 14), reused 7 (delta 2), pack-reused 0[K
Unpacking objects: 100% (36/36), done.


In [None]:
!pip install transformers[sentencepiece] datasets

Collecting transformers[sentencepiece]
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 5.6 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 32.3 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 43.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.

In [None]:
%cd cs224-final-proj-compressor/

/content/cs224-final-proj-compressor


In [None]:
# Load datasets
from datasets import Dataset
import json
import gdown

# Download the article JSON from Google Drive into the working directory.
url = 'https://drive.google.com/uc?id=1sAgDtEj-UjJECfTF6xfiWFk7lrTX7yoV'
filename = "articles_1000.json"
gdown.download(url, filename, quiet=False)

# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The JSON is expected to be a column-oriented dict (Dataset.from_dict).
dataset1000 = Dataset.from_dict(data)
# Prefix subsets (first 300 / first 500 rows) for smaller experiments.
dataset300 = dataset1000.select(range(300))
dataset500 = dataset1000.select(range(500))

Downloading...
From: https://drive.google.com/uc?id=1sAgDtEj-UjJECfTF6xfiWFk7lrTX7yoV
To: /content/cs224-final-proj-compressor/articles_1000.json
100%|██████████| 3.77M/3.77M [00:00<00:00, 179MB/s]


In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformer_compressor import TransformerCompressor
from transformers import PreTrainedTokenizerFast
import numpy as np
import time
import gzip
from utils import batched_encode, batched_binarize



In [None]:
# Transformer-based compression

def transformer_compress(compressor: TransformerCompressor, tokenizer: PreTrainedTokenizerFast, dataset, batch_size, slice_len):
    """Run the transformer pipeline (encode, then binarize) and time it.

    All arguments are forwarded unchanged to `batched_encode`; its two
    results are then passed to `batched_binarize`.

    Returns:
        A tuple `(elapsed_seconds, data_by_buckets, binarized_by_bucket)`,
        where `elapsed_seconds` is the wall-clock time spent in both steps.
    """
    started = time.time()
    data_by_buckets, encodings_by_buckets = batched_encode(
        compressor, tokenizer, dataset, batch_size, slice_len
    )
    binarized_by_bucket = batched_binarize(data_by_buckets, encodings_by_buckets)
    elapsed = time.time() - started
    return elapsed, data_by_buckets, binarized_by_bucket

# Gzip-based compression
def gzip_compress(data_by_buckets):
    """Gzip every article in every bucket and time the whole pass.

    Args:
        data_by_buckets: mapping of bucket key -> mapping that has an
            'article' entry holding an iterable of strings.

    Returns:
        A tuple `(elapsed_seconds, gzipped_by_bucket)`. `gzipped_by_bucket`
        maps each bucket key (inserted in sorted order) to a list with the
        gzip-compressed UTF-8 bytes of each article, in the original order.
    """
    started = time.time()
    gzipped_by_bucket = {
        bucket: [
            # Maximum compression level, applied to the UTF-8 encoded text.
            gzip.compress(bytes(text, encoding='utf-8'), compresslevel=9)
            for text in data_by_buckets[bucket]['article']
        ]
        for bucket in sorted(data_by_buckets.keys())
    }
    elapsed = time.time() - started
    return elapsed, gzipped_by_bucket

# Evaluation

def evaluate(compressor: TransformerCompressor, tokenizer: PreTrainedTokenizerFast, dataset, batch_size, slice_len):
    """Compress `dataset` with both the transformer pipeline and gzip.

    The transformer pipeline runs first; gzip is then applied to the same
    bucketed data it returned, so both methods see identical messages.

    Returns:
        A JSON-serializable dict with the run parameters ('dataset_size',
        'batch_size', 'slice_len'), the two wall-clock timings
        ('trans_time', 'gz_time') and 'sizes', which holds three parallel
        lists ordered by sorted bucket key:
          - 'og': UTF-8 byte length of each original article,
          - 'gz': length of each gzip-compressed article,
          - 'trans': `len()` of each binarized transformer output
            (NOTE(review): unit depends on batched_binarize — confirm).
    """
    transformer_time, data_by_buckets, binarized_by_bucket = transformer_compress(compressor, tokenizer, dataset, batch_size, slice_len)
    gzip_time, gzipped_by_bucket = gzip_compress(data_by_buckets)

    og = []
    gz = []
    trans = []

    # extend() appends in place; rebuilding the lists with `+` on every
    # bucket would copy all previously collected sizes each iteration.
    for bucket in sorted(data_by_buckets.keys()):
        og.extend(len(bytes(msg, encoding='utf-8')) for msg in data_by_buckets[bucket]['article'])
        gz.extend(map(len, gzipped_by_bucket[bucket]))
        trans.extend(map(len, binarized_by_bucket[bucket]))

    return {'dataset_size': len(dataset), 'batch_size': batch_size, 'slice_len': slice_len,
            'trans_time': transformer_time,
            'gz_time': gzip_time, 'sizes': {'og': og, 'gz': gz, 'trans': trans}}



In [None]:
# NOTE(review): the name says 100 but only the first 30 rows are selected —
# confirm which of the two is intended.
dataset100 = dataset1000.select(range(30))
dataset20 = dataset1000.select(range(20))

# Build a tiny variant of dataset20 in which 'article' holds only the first
# 100 elements of each article (characters, assuming articles are strings).
# The truncated text is computed into a temporary 'tiny_article' column, the
# original 'article' column is dropped, and the truncated values are then
# re-added under the name 'article'.
tiny_subset = dataset20.map(lambda elem: {'tiny_article': elem['article'][0:100]})
tiny_subset = tiny_subset.remove_columns('article')
tiny_subset = tiny_subset.add_column('article', tiny_subset['tiny_article']) 
tiny_subset = tiny_subset.remove_columns('tiny_article')
# 20-row and 10-row versions of the truncated dataset.
tiny_20 = tiny_subset
tiny_10 = tiny_subset.select(range(10))

0ex [00:00, ?ex/s]

In [None]:
# Sanity check: number of rows in the 10-example subset.
len(tiny_10)

10

In [None]:
from os.path import exists


# Template for per-experiment result files; do_experiment fills the
# placeholders with dataset size, batch size and slice length, in that order.
path = '../output/n{}_batch{}_slice{}.json'


def do_experiment(compressor, tokenizer, dataset, batch_size, slice_len, errors):
  """Run one evaluation and write its results as JSON, unless it can be skipped.

  The experiment is skipped (nothing is computed or written) when either
    * its key `n{len(dataset)}_batch{batch_size}_slice{slice_len}` is listed
      in `errors`, i.e. it is known to have failed before, or
    * its result file (the module-level `path` template filled with the same
      three values) already exists.

  Args:
    compressor, tokenizer, dataset, batch_size, slice_len: forwarded
      unchanged to `evaluate`.
    errors: collection of experiment keys that previously raised.

  Returns:
    None. Exceptions raised by `evaluate` propagate to the caller.
  """
  print(f'DatasetSize {len(dataset)} | BatchSize: {batch_size}| SliceLen: {slice_len}')

  # Skip experiments that are known to fail. The early return is essential:
  # without it the experiment would be re-run despite the message.
  if f'n{len(dataset)}_batch{batch_size}_slice{slice_len}' in errors:
    print('Previously errored. Skipping...')
    return

  # Skip experiments whose results are already on disk.
  file_path = path.format(len(dataset), batch_size, slice_len)
  if exists(file_path):
    print('Skipping...')
    return

  results = evaluate(compressor, tokenizer, dataset, batch_size, slice_len)
  with open(file_path, 'w') as f:
    json.dump(results, f)



In [None]:
from os import mkdir

# Make sure the results directory exists; it is fine if it already does.
try:
    mkdir('../output')
except FileExistsError:
    pass

# Distilgpt2 is twice as fast and has comparable compression performance
# Bart, BigBird, Reformer and XLNet all have far worse performance
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to('cuda')
tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
VOCAB_SIZE = tokenizer.vocab_size
PAD_TOKEN = tokenizer.pad_token_id
EOS_TOKEN = tokenizer.eos_token_id
compressor = TransformerCompressor(model, VOCAB_SIZE, PAD_TOKEN, EOS_TOKEN)

batch_size = 32
slices_lengths = [32, 64, 128, 256, 512]

# Experiment keys that raised in earlier runs; do_experiment receives this
# list so it can recognise them.
errors = ['n10_batch32_slice256', 'n10_batch32_slice512', 'n20_batch32_slice256', 'n20_batch32_slice512']

# Sweep every slice length over both tiny datasets.
for dataset in [tiny_10, tiny_20]:
    for slice_len in slices_lengths:
        try:
            do_experiment(compressor, tokenizer, dataset, batch_size, slice_len, errors)
        except RuntimeError as err:
            # Out-of-memory is the expected cause, but every RuntimeError
            # lands here, so report the message instead of discarding it.
            experiment_key = f'n{len(dataset)}_batch{batch_size}_slice{slice_len}'
            print(f'{experiment_key} failed: {err}')
            errors.append(experiment_key)

print(errors)

DatasetSize 10 | BatchSize: 32| SliceLen: 32


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetSize 10 | BatchSize: 32| SliceLen: 64


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetSize 10 | BatchSize: 32| SliceLen: 128


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetSize 10 | BatchSize: 32| SliceLen: 256
Previously errored. Skipping...


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

n10_batch32_slice256 failed
DatasetSize 10 | BatchSize: 32| SliceLen: 512
Previously errored. Skipping...


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

n10_batch32_slice512 failed
DatasetSize 20 | BatchSize: 32| SliceLen: 32
Skipping...
DatasetSize 20 | BatchSize: 32| SliceLen: 64
Skipping...
DatasetSize 20 | BatchSize: 32| SliceLen: 128
Skipping...
DatasetSize 20 | BatchSize: 32| SliceLen: 256
Previously errored. Skipping...


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

n20_batch32_slice256 failed
DatasetSize 20 | BatchSize: 32| SliceLen: 512
Previously errored. Skipping...


0ex [00:00, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

KeyboardInterrupt: ignored

In [None]:
import shutil

# Zip the results directory. make_archive appends the format's extension
# itself, so the base name is given without '.zip' (including it would
# produce a doubled '.zip.zip' suffix).
shutil.make_archive('outputs', 'zip', '../output')

'/content/cs224-final-proj-compressor/utputs.zip.zip'

In [None]:
# Display the dataset summary (features and row count).
tiny_10

Dataset({
    features: ['article'],
    num_rows: 10
})