# Statically Quantize Roberta

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import glue_compute_metrics
import sklearn
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os
from argparse import Namespace

In [3]:
from dynamic_quant_roberta import QuantRobertaForSequenceClassification
from transformers import RobertaForSequenceClassification, AutoTokenizer
from transformers.data.metrics import simple_accuracy

In [122]:
qmodel = QuantRobertaForSequenceClassification.from_pretrained('textattack/roberta-base-MRPC')
model = RobertaForSequenceClassification.from_pretrained('textattack/roberta-base-MRPC')

tokenizer = AutoTokenizer.from_pretrained('textattack/roberta-base-MRPC')

Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing QuantRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing QuantRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at textattack/roberta-base-MRPC were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoin

In [79]:
print('\n\n, Baseline Bert modules: \n',model)



, Baseline Bert modules: 
 RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bia

In [25]:
from datasets import load_dataset
dataset = load_dataset('glue', 'mrpc', split='validation')
# dataset = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (/home/jjc/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [26]:
tokenizer.decode(tokenizer(dataset[0]['sentence1'], dataset[0]['sentence2'])['input_ids'])

'<s>He said the foodservice pie business doesn \'t fit the company\'s long-term growth strategy.</s></s>" The foodservice pie business does not fit our long-term growth strategy.</s>'

In [27]:
def encode(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')

dataset = dataset.map(encode, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [28]:
dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [29]:
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [73]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)


# 

In [121]:
def eval_model(model, dataloader):
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     device = 'cuda'
    model.eval()
    preds = None
    model=model.to(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    for i, batch in enumerate(tqdm(dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]
            loss = outputs[0]
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = batch['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, batch['labels'].detach().cpu().numpy(), axis=0)
        if i % 10 == 0:
    #         print(f"loss: {loss}")
            pass

    preds = np.argmax(preds, axis=1)

    print(f'accuracy: {simple_accuracy(preds, out_label_ids)}')

In [111]:
torch.cuda.is_available()

True

In [112]:
eval_model(model, dataloader)
# os.environ['CUDA_LAUNCH_BLOCKING']='1'

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 408/408 [00:03<00:00, 107.07it/s]

accuracy: 0.9117647058823529





In [124]:
eval_model(qmodel, dataloader)

  0%|                                                                                                                                                         | 0/408 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [86]:
torch.backends.quantized.engine = 'qnnpack'

In [87]:
dynamic_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [88]:
eval_model(dynamic_model, dataloader)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 408/408 [12:15<00:00,  1.80s/it]

accuracy: 0.9117647058823529





In [90]:
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)
print_size_of_model(qmodel)
print_size_of_model(dynamic_model)

Size (MB): 498.672237
Size (MB): 498.672237
Size (MB): 242.150917


## PyTorch dynamic quantization under the hood
How does pytorch determine the min and max of the range?

In [42]:
dynamic_model.roberta.encoder.layer[0].attention.self.query.weight()

tensor([[ 0.0720,  0.0055, -0.0831,  ...,  0.1108,  0.0775, -0.0831],
        [-0.0332,  0.1994,  0.0665,  ...,  0.0609,  0.0665,  0.1274],
        [ 0.0997,  0.0665, -0.0388,  ..., -0.0222, -0.0222,  0.1163],
        ...,
        [-0.1772,  0.0111, -0.0222,  ..., -0.0388,  0.0941, -0.1384],
        [-0.2714,  0.0332,  0.0775,  ...,  0.0720, -0.0886,  0.0166],
        [-0.0609, -0.0997,  0.1108,  ..., -0.1883,  0.0111, -0.0388]],
       size=(768, 768), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.00553779024630785,
       zero_point=0)

In [43]:
model.roberta.encoder.layer[0].attention.self.query.weight.data

tensor([[ 0.0742,  0.0071, -0.0829,  ...,  0.1099,  0.0795, -0.0827],
        [-0.0360,  0.2001,  0.0684,  ...,  0.0616,  0.0671,  0.1298],
        [ 0.0995,  0.0671, -0.0405,  ..., -0.0237, -0.0196,  0.1182],
        ...,
        [-0.1754,  0.0127, -0.0226,  ..., -0.0386,  0.0963, -0.1387],
        [-0.2694,  0.0316,  0.0793,  ...,  0.0741, -0.0882,  0.0150],
        [-0.0586, -0.0998,  0.1101,  ..., -0.1883,  0.0109, -0.0391]])

In [44]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.max()

tensor(0.7061)

In [45]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.min()

tensor(-0.5904)

In [46]:
tmin = -0.5972
tmax = 0.7132

In [47]:
qmax = 127
qmin = -128
symmetric_qmin = -((qmax - qmin) / 2 + 1)
symmetric_qmax = (qmax - qmin) / 2
symmetric_qmin, symmetric_qmax

(-128.5, 127.5)

In [48]:
max_scale = max(abs(tmin / symmetric_qmin), abs(tmax / symmetric_qmax))
max_scale

0.005593725490196078

In [49]:
tmin = max_scale*symmetric_qmin

In [50]:
tmax = max_scale*symmetric_qmax

In [51]:
tmin, tmax

(-0.718793725490196, 0.7132)

In [52]:
(tmax - tmin) / (qmax - qmin)

0.005615661668589004

In [53]:
aq = torch.round(model.roberta.encoder.layer[0].attention.self.query.weight.data / max_scale)
aq * max_scale

tensor([[ 0.0727,  0.0056, -0.0839,  ...,  0.1119,  0.0783, -0.0839],
        [-0.0336,  0.2014,  0.0671,  ...,  0.0615,  0.0671,  0.1287],
        [ 0.1007,  0.0671, -0.0392,  ..., -0.0224, -0.0168,  0.1175],
        ...,
        [-0.1734,  0.0112, -0.0224,  ..., -0.0392,  0.0951, -0.1398],
        [-0.2685,  0.0336,  0.0783,  ...,  0.0727, -0.0895,  0.0168],
        [-0.0559, -0.1007,  0.1119,  ..., -0.1902,  0.0112, -0.0392]])

In [83]:

dynamic_model.roberta.encoder.layer[0].attention.self.query._packed_params[0]

TypeError: 'LinearPackedParams' object is not subscriptable

In [55]:
from dynamic_quant_ops import tensor_quant_scale

In [56]:
aq, scale = tensor_quant_scale(model.roberta.encoder.layer[0].attention.self.query.weight.data)

In [57]:
aq * scale

tensor([[ 0.0720,  0.0055, -0.0831,  ...,  0.1108,  0.0775, -0.0831],
        [-0.0332,  0.1994,  0.0665,  ...,  0.0609,  0.0665,  0.1274],
        [ 0.0997,  0.0665, -0.0388,  ..., -0.0222, -0.0222,  0.1163],
        ...,
        [-0.1772,  0.0111, -0.0222,  ..., -0.0388,  0.0941, -0.1384],
        [-0.2714,  0.0332,  0.0775,  ...,  0.0720, -0.0886,  0.0166],
        [-0.0609, -0.0997,  0.1108,  ..., -0.1883,  0.0111, -0.0388]])

In [58]:
np.percentile(model.roberta.encoder.layer[0].attention.self.query.weight.data.detach().numpy(), 99.9)

0.3832016658186957

In [59]:
max(abs(0.7132), abs(-0.5972)) / (2**(8-1)-1)

0.005615748031496063

In [84]:
state_dict = np.load('state_dict.npz')

FileNotFoundError: [Errno 2] No such file or directory: 'state_dict.npz'

In [61]:
for key in state_dict.keys():
    print(key)

NameError: name 'state_dict' is not defined

In [None]:
state_dict['bert.encoder.layer.0.attention.self.query._input_quantizer._amax']

array(5.545955, dtype=float32)

In [None]:
state_dict['bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax']

array(6.7064476, dtype=float32)

In [None]:
import inspect
inspect.getsource(torch.ops.quantized.layer_norm)

TypeError: module, class, method, function, traceback, frame, or code object was expected, got builtin_function_or_method