# Statically Quantize Roberta

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import glue_compute_metrics
import sklearn
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import os
from argparse import Namespace

In [3]:
from dynamic_quant_roberta import QuantRobertaForSequenceClassification
from transformers import RobertaForSequenceClassification, AutoTokenizer
from transformers.data.metrics import simple_accuracy

In [24]:
qmodel = QuantRobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [25]:
from datasets import load_dataset
dataset = load_dataset('glue', 'mrpc', split='validation')
# dataset = load_dataset('glue', 'mrpc', split='test')

Reusing dataset glue (/home/jjc/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [26]:
tokenizer.decode(tokenizer(dataset[0]['sentence1'], dataset[0]['sentence2'])['input_ids'])

'<s>He said the foodservice pie business doesn \'t fit the company\'s long-term growth strategy.</s></s>" The foodservice pie business does not fit our long-term growth strategy.</s>'

In [27]:
def encode(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')

dataset = dataset.map(encode, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [28]:
dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 408
})

In [29]:
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [34]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)


# 

In [35]:
def eval_model(model, dataloader):
    
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cuda'
    model.eval()
    preds = None

    for i, batch in enumerate(tqdm(dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            tmp_eval_loss, logits = outputs[:2]
            loss = outputs[0]
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = batch['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, batch['labels'].detach().cpu().numpy(), axis=0)
        if i % 10 == 0:
    #         print(f"loss: {loss}")
            pass

    preds = np.argmax(preds, axis=1)

    print(f'accuracy: {simple_accuracy(preds, out_label_ids)}')

In [36]:
torch.cuda.is_available()

True

In [None]:
eval_model(model, dataloader)
# os.environ['CUDA_LAUNCH_BLOCKING']='1'

: 

In [20]:
eval_model(qmodel, dataloader)

  0%|                                                                                                                                                                         | 0/13 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [21]:
torch.backends.quantized.engine = 'qnnpack'

In [22]:
dynamic_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)



In [23]:
eval_model(dynamic_model, dataloader)

  0%|                                                                                                                                                                         | 0/13 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [51]:
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)
print_size_of_model(qmodel)
print_size_of_model(dynamic_model)

Size (MB): 498.672237
Size (MB): 498.672237
Size (MB): 242.152709


## PyTorch dynamic quantization under the hood
How does pytorch determine the min and max of the range?

In [19]:
dynamic_model.roberta.encoder.layer[0].attention.self.query.weight()

NameError: name 'dynamic_model' is not defined

In [None]:
model.roberta.encoder.layer[0].attention.self.query.weight.data

tensor([[ 0.0733, -0.0037, -0.0904,  ...,  0.1033,  0.0894, -0.1026],
        [-0.0520,  0.2053,  0.0730,  ...,  0.0648,  0.0631,  0.1287],
        [ 0.0869,  0.0704, -0.0509,  ..., -0.0434, -0.0070,  0.1100],
        ...,
        [-0.1867,  0.0172, -0.0314,  ..., -0.0504,  0.1023, -0.1159],
        [-0.2524,  0.0435,  0.0640,  ...,  0.0703, -0.1036,  0.0117],
        [-0.0512, -0.0864,  0.1022,  ..., -0.1887,  0.0045, -0.0540]])

In [None]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.max()

tensor(0.7132)

In [None]:
model.roberta.encoder.layer[0].attention.self.query.weight.data.min()

tensor(-0.5972)

In [None]:
tmin = -0.5972
tmax = 0.7132

In [None]:
qmax = 127
qmin = -128
symmetric_qmin = -((qmax - qmin) / 2 + 1)
symmetric_qmax = (qmax - qmin) / 2
symmetric_qmin, symmetric_qmax

(-128.5, 127.5)

In [None]:
max_scale = max(abs(tmin / symmetric_qmin), abs(tmax / symmetric_qmax))
max_scale

0.005593725490196078

In [None]:
tmin = max_scale*symmetric_qmin

In [None]:
tmax = max_scale*symmetric_qmax

In [None]:
tmin, tmax

(-0.718793725490196, 0.7132)

In [None]:
(tmax - tmin) / (qmax - qmin)

0.005615661668589004

In [None]:
aq = torch.round(model.roberta.encoder.layer[0].attention.self.query.weight.data / max_scale)
aq * max_scale

tensor([[ 0.0727, -0.0056, -0.0895,  ...,  0.1007,  0.0895, -0.1007],
        [-0.0503,  0.2070,  0.0727,  ...,  0.0671,  0.0615,  0.1287],
        [ 0.0895,  0.0727, -0.0503,  ..., -0.0447, -0.0056,  0.1119],
        ...,
        [-0.1846,  0.0168, -0.0336,  ..., -0.0503,  0.1007, -0.1175],
        [-0.2517,  0.0447,  0.0615,  ...,  0.0727, -0.1063,  0.0112],
        [-0.0503, -0.0839,  0.1007,  ..., -0.1902,  0.0056, -0.0559]])

In [None]:
dynamic_model.roberta.encoder.layer[0].attention.self.query._packed_params[0]

TypeError: 'LinearPackedParams' object is not subscriptable

In [None]:
from dynamic_quant_ops import tensor_quant_scale

In [None]:
aq, scale = tensor_quant_scale(model.roberta.encoder.layer[0].attention.self.query.weight.data)

In [None]:
aq * scale

tensor([[ 0.0727, -0.0056, -0.0895,  ...,  0.1007,  0.0895, -0.1007],
        [-0.0503,  0.2070,  0.0727,  ...,  0.0671,  0.0615,  0.1286],
        [ 0.0895,  0.0727, -0.0503,  ..., -0.0447, -0.0056,  0.1119],
        ...,
        [-0.1846,  0.0168, -0.0336,  ..., -0.0503,  0.1007, -0.1175],
        [-0.2517,  0.0447,  0.0615,  ...,  0.0727, -0.1063,  0.0112],
        [-0.0503, -0.0839,  0.1007,  ..., -0.1902,  0.0056, -0.0559]])

In [None]:
np.percentile(model.roberta.encoder.layer[0].attention.self.query.weight.data.detach().numpy(), 99.9)

0.382255209922794

In [None]:
max(abs(0.7132), abs(-0.5972)) / (2**(8-1)-1)

0.005615748031496063

In [None]:
state_dict = np.load('state_dict.npz')

In [None]:
for key in state_dict.keys():
    print(key)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.query._input_quantizer._amax
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.key._input_quantizer._amax
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.self.value._input_quantizer._amax
bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax
bert.encoder.layer.0.attention.self.qv_b_input_quantizer._amax
bert.encoder.layer.0.attention.self.av_a_input_quantizer._amax
bert.encoder.layer.0.attention.self.av_b_input_quantizer._amax
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attentio

In [None]:
state_dict['bert.encoder.layer.0.attention.self.query._input_quantizer._amax']

array(5.545955, dtype=float32)

In [None]:
state_dict['bert.encoder.layer.0.attention.self.qv_a_input_quantizer._amax']

array(6.7064476, dtype=float32)

In [None]:
import inspect
inspect.getsource(torch.ops.quantized.layer_norm)

TypeError: module, class, method, function, traceback, frame, or code object was expected, got builtin_function_or_method