# Pretrained models quantization

## Load and quantize

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-sk")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-sk")

In [2]:
tokenizer

PreTrainedTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-sk', vocab_size=60025, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [3]:
type(tokenizer)

transformers.models.marian.tokenization_marian.MarianTokenizer

In [4]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(60025, 512, padding_idx=60024)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(60025, 512, padding_idx=60024)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [5]:
type(model)

transformers.models.marian.modeling_marian.MarianMTModel

In [6]:
sample_text = "Hello, my name is Milanko, and eating potatoes is my hobby."

batch = tokenizer([sample_text], return_tensors="pt")
gen = model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['Ahoj, volám sa Milanko, a jesť zemiaky je môj koníček.']

In [7]:
%timeit -n 10 model.generate(**batch)

508 ms ± 60.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
import torch

torch.backends.quantized.engine = 'qnnpack'

quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
print(quantized_model)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(60025, 512, padding_idx=60024)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(60025, 512, padding_idx=60024)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (q_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_proj): DynamicQuantizedLinear(in_features=512, out_features=512, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (



In [9]:
batch = tokenizer([sample_text], return_tensors="pt")
gen = quantized_model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['Zdravím, volám sa Milanko, a jesť zemiaky je môj koníček.']

In [11]:
%timeit -n 10 quantized_model.generate(**batch)

268 ms ± 3.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
sample_text = """I was listenin' to the ocean.
I saw a face in the sand."""

batch = tokenizer([sample_text], return_tensors="pt")
gen = quantized_model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['Počúval som oceán, videl som tvár v piesku.']

In [18]:
sample_text = """I was listenin' to the ocean.
I saw a face in the sand."""

batch = tokenizer([sample_text], return_tensors="pt")
gen = model.generate(**batch)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['Počúval som oceán, videl som tvár v piesku.']

## Size comparison

In [20]:
import os

def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 301.913857
Size (MB): 200.839481


## Eval full-precision/quantized model

In [None]:
# load dataset of pretrained model


In [None]:
from transformers import Trainer

full_trainer = Trainer(model= model, eval_dataset = )

opus nlp moses link - OpenSubtitles v2018

https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-sk.txt.zip

opus nlp tmx link - OpenSubtitles v2018

https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/en-sk.tmx.gz

opus tmx link - wikimedia v20210402	

https://object.pouta.csc.fi/OPUS-wikimedia/v20210402/tmx/en-sk.tmx.gz

In [17]:
%%bash
cd ./datasets/
gzip -d en-sk.tmx.gz
unzip en-sk.txt.zip
cd ..

Archive:  en-sk.txt.zip
  inflating: OpenSubtitles.en-sk.en  
  inflating: OpenSubtitles.en-sk.sk  
  inflating: OpenSubtitles.en-sk.ids  
  inflating: README                  


gzip: can't stat: en-sk.tmx.gz (en-sk.tmx.gz.gz): No such file or directory


and so on...
preprocessing, loading ...

### Maybe using datasets library from huggingface is enough...

In [5]:
from datasets import load_dataset

dataset = load_dataset("open_subtitles", lang1="en", lang2="sk")

Using custom data configuration en-sk-lang1=en,lang2=sk
Reusing dataset open_subtitles (/Users/marek/.cache/huggingface/datasets/open_subtitles/en-sk-lang1=en,lang2=sk/0.0.0/c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'meta', 'translation'],
        num_rows: 8850871
    })
})

In [15]:
dataset['train'][0]

{'id': '0',
 'meta': {'year': 0,
  'imdbId': 3245996,
  'subtitleId': {'en': 5698460, 'sk': 5232687},
  'sentenceIds': {'en': [1], 'sk': [1]}},
 'translation': {'en': 'Subtitles by DramaFever',
  'sk': 'Subtitles by DramaFever'}}

In [17]:
dataset['train'][1]

{'id': '1',
 'meta': {'year': 0,
  'imdbId': 3245996,
  'subtitleId': {'en': 5698460, 'sk': 5232687},
  'sentenceIds': {'en': [2], 'sk': [2]}},
 'translation': {'en': 'Episode 2.', 'sk': 'Epizóda 2'}}