# Statically Quantize Roberta

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import glue_compute_metrics
import sklearn
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os
from argparse import Namespace

In [3]:
from dynamic_quant_roberta import QuantRobertaForSequenceClassification
from transformers import RobertaForSequenceClassification, AutoTokenizer
from transformers.data.metrics import simple_accuracy

In [19]:
# Single source of truth for the checkpoint, so both models and the tokenizer
# are guaranteed to come from the same weights/vocabulary.
MODEL_NAME = 'textattack/roberta-base-MRPC'

# QuantRobertaForSequenceClassification comes from the local
# dynamic_quant_roberta module; RobertaForSequenceClassification is the stock
# transformers baseline.
qmodel = QuantRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing QuantRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoin

In [5]:
# Dump the module tree of the baseline model (a RoBERTa classifier, not BERT).
print('\n\nBaseline RoBERTa modules:\n', model)



, Baseline Bert modules: 
 QuantRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [None]:
# Dump the module tree of the custom quantized model; the label now names the
# model that is actually printed instead of the copy-pasted "Baseline" text.
print('\n\nQuantized RoBERTa modules:\n', qmodel)

In [6]:
from datasets import load_dataset
# Load the validation split of the GLUE MRPC task; everything below is evaluated on it.
dataset = load_dataset('glue', 'mrpc', split='validation')

# Alternative split, kept for reference:
# dataset = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (/home/jjc/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [7]:
# Sanity check: encode the first sentence pair and decode the ids back to text
# to see which special tokens the tokenizer inserts around and between the sentences.
tokenizer.decode(tokenizer(dataset[0]['sentence1'], dataset[0]['sentence2'])['input_ids'])

'<s>He said the foodservice pie business doesn \'t fit the company\'s long-term growth strategy.</s></s>" The foodservice pie business does not fit our long-term growth strategy.</s>'

In [8]:
def encode(examples):
    """Tokenize a batch of MRPC sentence pairs.

    ``examples['sentence1']`` and ``examples['sentence2']`` are passed to the
    module-level ``tokenizer`` as a pair, with truncation enabled and
    ``padding='max_length'`` so every example gets the same length.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
    )


# Tokenize batch-wise; the tokenizer outputs are added as new dataset columns.
dataset = dataset.map(function=encode, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Copy the 'label' column into a new 'labels' column; 'labels' is the keyword
# the batches are later passed to the model under (model(**batch)).
dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
print(dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})


In [10]:
# Alias, not a copy: `data` and `dataset` are two names for the same object.
data = dataset
data

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [11]:
# Expose only the three model-input columns, as torch tensors. The return value
# is not used, so this relies on set_format changing `dataset` itself
# (and therefore `data`, which is the same object).
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [12]:
# Batches of 32 examples; no shuffle argument is passed, so the DataLoader default applies.
dataloader = torch.utils.data.DataLoader(data, batch_size=32)


# 

In [21]:
def eval_model(model, dataloader, device=None):
    """Run ``model`` over ``dataloader`` and report classification accuracy.

    Args:
        model: Module called as ``model(**batch)``. Its output is indexed
            positionally: index 0 is the loss and index 1 the logits.
        dataloader: Iterable of dict batches of tensors; every batch must
            contain a ``'labels'`` entry.
        device: Device to evaluate on. ``None`` (the default) selects
            ``'cuda'`` when it is available and ``'cpu'`` otherwise.

    Returns:
        The accuracy computed by ``simple_accuracy``; it is also printed.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    model.eval()
    model = model.to(device=device)

    # Collect per-batch arrays and concatenate once at the end instead of
    # re-allocating the running array with np.append on every batch.
    batch_logits = []
    batch_labels = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        batch_logits.append(outputs[1].detach().cpu().numpy())
        batch_labels.append(batch['labels'].detach().cpu().numpy())

    if not batch_logits:
        raise ValueError('dataloader yielded no batches; nothing to evaluate')

    preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)
    out_label_ids = np.concatenate(batch_labels, axis=0)

    accuracy = simple_accuracy(preds, out_label_ids)
    print(f'accuracy: {accuracy}')
    return accuracy

In [22]:
# Check that a CUDA device is visible before running the GPU evaluation below.
torch.cuda.is_available()


True

In [23]:
# Accuracy of the unmodified baseline model on the validation set.
eval_model(model, dataloader)
# Debugging aid (disabled): forces synchronous CUDA kernel launches.
# os.environ['CUDA_LAUNCH_BLOCKING']='1'

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.59it/s]

accuracy: 0.9117647058823529





In [25]:
# Move the custom quantized model to the GPU before evaluating it.
qmodel = qmodel.to(device=torch.device('cuda'))

In [26]:
# NOTE(review): the recorded run of this cell fails with
# "Expected all tensors to be on the same device ... cuda:0 and cpu", i.e. some
# tensor used inside qmodel's forward pass is still on the CPU. The cause is not
# visible in this notebook — presumably inside dynamic_quant_roberta; confirm there.
eval_model(qmodel, dataloader)

  0%|                                                                                                                                                         | 0/13 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [27]:
# Select the backend that provides the quantized kernels.
# NOTE(review): PyTorch documents 'qnnpack' as the ARM/mobile backend and
# 'fbgemm' as the x86 one — confirm this is the intended engine for this host.
torch.backends.quantized.engine = 'qnnpack'
# model = model.to(device=torch.device('cpu'))

In [28]:
# Bring the baseline back to the CPU before quantizing it.
model = model.to(device=torch.device('cpu'))

# Dynamically quantize the nn.Linear layers to int8 and show the resulting module tree.
dynamic_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)
print(dynamic_model)
def eval_model(model, dataloader, device='cpu'):
    """Run ``model`` over ``dataloader`` and report classification accuracy.

    This redefines ``eval_model`` from earlier in the notebook. It defaults to
    the CPU, which is where the dynamically quantized model is evaluated, but
    the device is now a parameter instead of a hard-coded constant.

    Args:
        model: Module called as ``model(**batch)``. Its output is indexed
            positionally: index 0 is the loss and index 1 the logits.
        dataloader: Iterable of dict batches of tensors; every batch must
            contain a ``'labels'`` entry.
        device: Device to evaluate on (default ``'cpu'``).

    Returns:
        The accuracy computed by ``simple_accuracy``; it is also printed.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    device = torch.device(device)

    model.eval()
    model = model.to(device=device)

    # Collect per-batch arrays and concatenate once at the end instead of
    # re-allocating the running array with np.append on every batch.
    batch_logits = []
    batch_labels = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        batch_logits.append(outputs[1].detach().cpu().numpy())
        batch_labels.append(batch['labels'].detach().cpu().numpy())

    if not batch_logits:
        raise ValueError('dataloader yielded no batches; nothing to evaluate')

    preds = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)
    out_label_ids = np.concatenate(batch_labels, axis=0)

    accuracy = simple_accuracy(preds, out_label_ids)
    print(f'accuracy: {accuracy}')
    return accuracy

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0



In [None]:
# Accuracy of the dynamically quantized model, for comparison with the baseline.
eval_model(dynamic_model, dataloader)

In [None]:
def print_size_of_model(model):
    """Print the on-disk size of ``model``'s serialized state dict, in MB.

    The state dict is saved to a uniquely named temporary file, so an existing
    ``temp.p`` in the working directory is never overwritten or deleted, and
    the temporary file is removed even when saving fails.

    Args:
        model: Object exposing ``state_dict()``.

    Returns:
        The size in megabytes (bytes / 1e6); the same value is printed.
    """
    import tempfile

    fd, tmp_path = tempfile.mkstemp(suffix='.p')
    os.close(fd)  # torch.save reopens the file by path
    try:
        torch.save(model.state_dict(), tmp_path)
        size_mb = os.path.getsize(tmp_path) / 1e6
    finally:
        os.remove(tmp_path)
    print('Size (MB):', size_mb)
    return size_mb

# Serialized size of: the baseline, the custom quantized model, and the
# torch dynamically quantized model (in that order).
print_size_of_model(model)
print_size_of_model(qmodel)
print_size_of_model(dynamic_model)

## PyTorch Inference Time and Throughput
1. Dynamic Model

In [29]:
import time

# Wall-clock latency measured inside a notebook is noisy; use a standalone
# Python script for numbers that need to be reported.
latency = []
dynamic_model = dynamic_model.to(device=torch.device('cpu'))
device = 'cpu'
batch_size = 32
dataloader = torch.utils.data.DataLoader(data, batch_size)
num_samples = 0
with torch.no_grad():
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # perf_counter is monotonic and high-resolution; only the forward pass is timed.
        start = time.perf_counter()
        outputs = dynamic_model(**batch)
        latency.append(time.perf_counter() - start)
        # Count what was actually processed instead of hard-coding the dataset size.
        num_samples += batch['labels'].shape[0]

total_seconds = sum(latency)
print("PyTorch {} Total Inference time = {} ms".format(device, format(total_seconds * 1000, '.2f')))
# Throughput = processed samples per second of measured forward time.
print("PyTorch {} Throughput = {} QPS".format(device, format(num_samples / total_seconds, '.2f')))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [02:23<00:00, 11.05s/it]

PyTorch cpu Total Inference time = 143495.01 ms
PyTorch cpu Throughput = 2.84 PS





2. Baseline Model

In [None]:
import time

# Wall-clock latency measured inside a notebook is noisy; use a standalone
# Python script for numbers that need to be reported.
latency = []
model = model.to(device=torch.device('cuda'))
device = 'cuda'
batch_size = 32
dataloader = torch.utils.data.DataLoader(data, batch_size)
num_samples = 0
with torch.no_grad():
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # CUDA work is queued asynchronously: drain pending work (e.g. the
        # host-to-device copies above) before starting the clock, and wait for
        # the forward pass to finish before stopping it. Without this the timer
        # mostly measures kernel launch overhead.
        torch.cuda.synchronize()
        start = time.perf_counter()
        outputs = model(**batch)
        torch.cuda.synchronize()
        latency.append(time.perf_counter() - start)
        # Count what was actually processed instead of hard-coding the dataset size.
        num_samples += batch['labels'].shape[0]

total_seconds = sum(latency)
print("PyTorch {} Total Inference time = {} ms".format(device, format(total_seconds * 1000, '.2f')))
# Samples per second; the previous extra division by 1000 under-reported QPS
# by three orders of magnitude and was inconsistent with the CPU cell.
print("PyTorch {} Throughput = {} QPS".format(device, format(num_samples / total_seconds, '.2f')))

In [None]:
from functools import partial

import torch
from transformers import BertForSequenceClassification, BertTokenizer

from ptflops import get_model_complexity_info

from transformers import RobertaForSequenceClassification, AutoTokenizer
from transformers.data.metrics import simple_accuracy
# Move the baseline back to the CPU for the complexity measurement below.
model = model.to(device=torch.device('cpu'))
def bert_input_constructor(input_shape, tokenizer):
    """Build a fake tokenized batch (with labels) for complexity counting.

    Args:
        input_shape: Sequence whose first entry is the batch size and whose
            second entry is the target sequence length.
        tokenizer: Callable tokenizer exposing a ``pad_token`` attribute.

    Returns:
        A dict holding the tokenizer's outputs plus a ``"labels"`` tensor
        containing a ``1`` for every element of the batch.
    """
    batch_size, seq_len = input_shape[0], input_shape[1]

    # Two positions are reserved for the special tokens the tokenizer adds;
    # the remainder of the fake sequence is made of pad tokens.
    filler = "".join(tokenizer.pad_token for _ in range(seq_len - 2))

    encoded = tokenizer([filler] * batch_size, padding=True, truncation=True,
                        return_tensors="pt")

    model_inputs = dict(encoded)
    model_inputs["labels"] = torch.tensor([1] * batch_size)
    return model_inputs

In [None]:
# Measure the computational complexity and parameter count of `model` on a
# fake batch of shape (1, 128) built by bert_input_constructor.
flops_count, params_count = get_model_complexity_info(
            model, (1, 128), as_strings=True,
            input_constructor=partial(bert_input_constructor, tokenizer=tokenizer),
            print_per_layer_stat=False)

# The copy-pasted "Total Inference time" print was removed: it reported the
# `latency`/`device` left over from whichever timing cell ran last, which is
# unrelated to this measurement and fails on a fresh kernel.
print('{:<30}  {:<8}'.format('Computational complexity: ', flops_count))
print('{:<30}  {:<8}'.format('Number of parameters: ', params_count))

## PyTorch dynamic quantization under the hood
How does pytorch determine the min and max of the range?

In [None]:
# Weight of the first layer's query projection in the dynamically quantized
# model; on this module `weight` is called as a method rather than read as an attribute.
dynamic_model.roberta.encoder.layer[0].attention.self.query.weight()

In [None]:
# The same query-projection weight in the baseline model, for comparison.
model.roberta.encoder.layer[0].attention.self.query.weight.data

In [None]:
# Largest value in the baseline query weight (upper end of the range to quantize).
model.roberta.encoder.layer[0].attention.self.query.weight.data.max()

In [None]:
# Smallest value in the baseline query weight (lower end of the range to quantize).
model.roberta.encoder.layer[0].attention.self.query.weight.data.min()

In [None]:
# Real-valued range of the tensor being quantized.
# NOTE(review): presumably copied by hand from the .min()/.max() outputs of the
# query weight above — confirm, and re-derive if the checkpoint changes.
tmin = -0.5972
tmax = 0.7132

In [None]:
# Signed 8-bit integer range.
qmax = 127
qmin = -128
# Symmetric integer bounds around zero. Integer (floor) division is required:
# true division yields -128.5 / 127.5, which are not representable int8 levels
# and fall outside [qmin, qmax], skewing every scale derived from them.
symmetric_qmin = -((qmax - qmin) // 2 + 1)
symmetric_qmax = (qmax - qmin) // 2
symmetric_qmin, symmetric_qmax

In [None]:
# Scale needed so that both ends of the real range fit inside the symmetric
# integer range: take the larger of the two per-side ratios.
max_scale = max(abs(tmin / symmetric_qmin), abs(tmax / symmetric_qmax))
max_scale

In [None]:
# Replace the measured minimum with the value the symmetric lower bound maps to
# (overwrites the tmin defined earlier).
tmin = max_scale*symmetric_qmin

In [None]:
# Replace the measured maximum with the value the symmetric upper bound maps to
# (overwrites the tmax defined earlier).
tmax = max_scale*symmetric_qmax

In [None]:
# Show the adjusted real-valued range.
tmin, tmax

In [None]:
# Quantization step: width of the real range divided by the number of integer steps.
(tmax - tmin) / (qmax - qmin)

In [None]:
# Manual quantize/dequantize round trip of the query weight using max_scale.
# Only rounding is applied here; values are not clamped to [qmin, qmax].
aq = torch.round(model.roberta.encoder.layer[0].attention.self.query.weight.data / max_scale)
aq * max_scale

In [None]:
# Inspect the private `_packed_params` attribute of the quantized query layer,
# to compare against the manual computation above.
dynamic_model.roberta.encoder.layer[0].attention.self.query._packed_params

In [None]:
from dynamic_quant_ops import tensor_quant_scale

In [None]:
# Quantize the same weight with the project's helper from dynamic_quant_ops.
# NOTE(review): presumably returns (quantized tensor, scale), judging by the
# unpacking and the `aq * scale` that follows — confirm against the helper.
aq, scale = tensor_quant_scale(model.roberta.encoder.layer[0].attention.self.query.weight.data)

In [None]:
# Multiply back by the scale to compare with the original weight values.
aq * scale

In [None]:
# 99.9th percentile of the query weight values (compare with the absolute max above).
np.percentile(model.roberta.encoder.layer[0].attention.self.query.weight.data.detach().numpy(), 99.9)

In [None]:
# Largest absolute value divided by 2**(8-1)-1 = 127. The two literals repeat
# the hand-entered tmin/tmax values from the earlier cell.
max(abs(0.7132), abs(-0.5972)) / (2**(8-1)-1)

In [None]:

# Load arrays from a local .npz archive; the path is relative to the working directory.
# NOTE(review): the notebook does not show how 'state_dict.npz' is produced — document its origin.
state_dict = np.load('state_dict.npz')

In [None]:
# List the name of every array stored in the archive, one per line.
for entry_name in state_dict:
    print(entry_name)

In [None]:
# Stored `_amax` of the query layer's input quantizer in encoder layer 0.
# NOTE(review): the key starts with 'bert.' while the models in this notebook
# are RoBERTa — presumably the archive was exported from a different model; confirm.
state_dict['bert.encoder.layer.0.attention.self.query._input_quantizer._amax']

In [None]:
# Stored `_amax` of the `qv_a_input_quantizer` entry in encoder layer 0's self-attention.
state_dict['bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax']

In [None]:
import inspect
# Attempt to read the source of the quantized layer_norm operator.
# NOTE(review): inspect.getsource only works on objects backed by Python source;
# torch.ops entries are presumably bindings to compiled operators, so this call
# may raise instead of returning code — confirm.
inspect.getsource(torch.ops.quantized.layer_norm)