# Tracing and Quantization with PyTorch and Torchscript

## [(PROTOTYPE) FX GRAPH MODE QUANTIZATION USER GUIDE](https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html)

In [3]:
!pip install --no-cache-dir torch==1.8.1 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.1
  Downloading https://download.pytorch.org/whl/cpu/torch-1.8.1-cp38-none-macosx_10_9_x86_64.whl (119.6 MB)
[K     |████████████████████████████████| 119.6 MB 3.5 MB/s eta 0:00:01    |█████████████████████▍          | 79.7 MB 893 kB/s eta 0:00:45
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.8.0
    Uninstalling torch-1.8.0:
      Successfully uninstalled torch-1.8.0
Successfully installed torch-1.8.1


## load a bert model

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

max_length=256
model_id="bert-base-cased-finetuned-mrpc"
# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, return_dict=False,torchscript=True)

# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"


paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']
model(**paraphrase)[0]
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

traced_model = torch.jit.trace(quantized_model, example_inputs_paraphrase)
torch.jit.save(traced_model, "bert_traced_eager_quant.pt")

  assert all(


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

In [2]:
# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length=128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']


In [3]:
 model(**paraphrase)[0]

tensor([[-0.3495,  1.9004]], grad_fn=<AddmmBackward>)

## quantize

In [4]:
import torch.quantization.quantize_fx as quantize_fx
import copy

In [72]:
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig

# Full docs for supported qconfig for floating point modules/ops can be found in docs for quantization (TODO: link)
# Full docs for qconfig_dict can be found in the documents of prepare_fx (TODO: link)
qconfig_dict = {
    "object_type": [
        (nn.Embedding, float_qparams_weight_only_qconfig),
        (nn.Linear, default_dynamic_qconfig)
    ]
}
# Deepcopying the original model because quantization api changes the model inplace and we want
# to keep the original model for future comparison
model_to_quantize = copy.deepcopy(model)
prepared_model = prepare_fx(model_to_quantize, qconfig_dict)
print("prepared model:", prepared_model)
quantized_model = convert_fx(prepared_model)
print("quantized model", quantized_model)

ValueError: You cannot specify both input_ids and inputs_embeds at the same time

## eager quantization

In [73]:
import torch
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
print(quantized_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
      

In [75]:
import os
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 433.334163
Size (MB): 176.815643


In [123]:
from datasets import load_dataset,load_metric
import time

task = "mrpc"
split="validation"
all_datasets = load_dataset("glue", task)
metric = load_metric("glue", task)
dataset= all_datasets[split]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
max_length=128
padding='max_length'

def preprocess_function(examples):
    # Tokenize the texts
    texts = (examples['sentence1'], examples['sentence2'])
    result = tokenizer(*texts, padding=padding, max_length=max_length, truncation=True,return_tensors="pt")
    result["labels"] = examples["label"]
    return result


def do_test(raw_dataset,model,model_type):
    processed_dataset = raw_dataset.map(preprocess_function)
    processed_dataset = processed_dataset.select(range(2500))
    model_start = time.perf_counter()
#     model_type = 'neuron' if isinstance(model, torch.jit.ScriptModule) else 'torch'
    with torch.no_grad():
        for step, batch in enumerate(processed_dataset):
            input_ids = torch.tensor(batch['input_ids'])
            attention_mask = torch.tensor(batch['attention_mask'])
            token_type_ids = torch.tensor(batch['token_type_ids'])
            outputs = model(*[input_ids,attention_mask,token_type_ids])
            predictions = outputs[0][0].argmax().item()
            metric.add_batch(predictions=[predictions],references=[batch["labels"]])
        
    eval_metric = metric.compute()
    model_stop = time.perf_counter()
    total_time = round(model_stop - model_start,4)*1000
    average_time =  round(total_time/len(processed_dataset),4)
    return {'model_type':model_type,
            **eval_metric,
            'total_time':f"{total_time}ms",
            'average_time':f"{average_time}ms",
            'max_length':max_length,
            'samples': len(processed_dataset),
            'task': task
           }   


Reusing dataset glue (/Users/philipp/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


## `with torch.jit.optimized_execution(True)`



In [15]:
from datasets import load_dataset,load_metric
import time

task = "mrpc"
split="validation"
all_datasets = load_dataset("glue", task)
metric = load_metric("glue", task)
dataset= all_datasets[split]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
max_length=256
padding='max_length'

def preprocess_function(examples):
    # Tokenize the texts
    texts = (examples['sentence1'], examples['sentence2'])
    result = tokenizer(*texts, padding=padding, max_length=max_length, truncation=True,return_tensors="pt")
    result["labels"] = examples["label"]
    return result


def do_test(raw_dataset,model,model_type):
    processed_dataset = raw_dataset.map(preprocess_function)
    processed_dataset = processed_dataset.select(range(2500))
    if model_type == 'pytorch':
        with torch.no_grad():
            model_start = time.perf_counter()
            for step, batch in enumerate(processed_dataset):
                input_ids = torch.tensor(batch['input_ids'])
                attention_mask = torch.tensor(batch['attention_mask'])
                token_type_ids = torch.tensor(batch['token_type_ids'])
                outputs = model(*[input_ids,attention_mask,token_type_ids])
                predictions = outputs[0][0].argmax().item()
                metric.add_batch(predictions=[predictions],references=[batch["labels"]])
    else:
        with torch.jit.optimized_execution(True):
            for _ in range(5):
                input_ids = torch.tensor(processed_dataset[0]['input_ids'])
                attention_mask = torch.tensor(processed_dataset[0]['attention_mask'])
                token_type_ids = torch.tensor(processed_dataset[0]['token_type_ids'])
                output = model(*[input_ids,attention_mask,token_type_ids])
    #     model_type = 'neuron' if isinstance(model, torch.jit.ScriptModule) else 'torch'
            model_start = time.perf_counter()
            for step, batch in enumerate(processed_dataset):
                input_ids = torch.tensor(batch['input_ids'])
                attention_mask = torch.tensor(batch['attention_mask'])
                token_type_ids = torch.tensor(batch['token_type_ids'])
                outputs = model(*[input_ids,attention_mask,token_type_ids])
                predictions = outputs[0][0].argmax().item()
                metric.add_batch(predictions=[predictions],references=[batch["labels"]])

    eval_metric = metric.compute()
    model_stop = time.perf_counter()
    total_time = round(model_stop - model_start,4)*1000
    average_time =  round(total_time/len(processed_dataset),4)
    return {'model_type':model_type,
            **eval_metric,
            'total_time':f"{total_time}ms",
            'average_time':f"{average_time}ms",
            'max_length':max_length,
            'samples': len(processed_dataset),
            'task': task
           }   




Reusing dataset glue (/Users/philipp/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [16]:
model_res=do_test(dataset, model,'pytorch')
model_quantized_res = do_test(dataset, quantized_model,'quantized')

import pandas as pd
df = pd.DataFrame([model_res,model_quantized_res])
df.head()

Loading cached processed dataset at /Users/philipp/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-439be8df85489197.arrow
Loading cached processed dataset at /Users/philipp/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-439be8df85489197.arrow


Unnamed: 0,model_type,accuracy,f1,total_time,average_time,max_length,samples,task
0,pytorch,0.8472,0.894997,1513572.3ms,605.4289ms,256,2500,mrpc
1,quantized,0.834,0.887686,497903.0ms,199.1612ms,256,2500,mrpc


## Trace quantized model and save it

In [125]:
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']

traced_model = torch.jit.trace(quantized_model, example_inputs_paraphrase)
torch.jit.save(traced_model, "bert_traced_eager_quant.pt")

  assert all(


In [126]:
model_traced_quantized = torch.jit.load('bert_traced_eager_quant.pt')

model_traced_quantized_res = do_test(dataset, model_traced_quantized,'traced_quantized')


Loading cached processed dataset at /Users/philipp/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ee1969f5d89c739f.arrow


In [127]:
df = pd.DataFrame([model_res,model_quantized_res,model_traced_quantized_res])
df.head()

Unnamed: 0,model_type,accuracy,f1,total_time,average_time,max_length,samples,task
0,pytorch,0.8472,0.894997,712516.2000000001ms,285.0065ms,128,2500,mrpc
1,quantized,0.8348,0.887802,205222.3ms,82.0889ms,128,2500,mrpc
2,traced_quantized,0.8348,0.887802,175211.6ms,70.0846ms,128,2500,mrpc


In [128]:
df['performance'] = df.apply(lambda x: str(round(float(df.query('model_type == "pytorch"')['average_time'][0].replace('ms',''))/float(x['average_time'].replace('ms','')),4))+"x",axis=1)
df.head()


Unnamed: 0,model_type,accuracy,f1,total_time,average_time,max_length,samples,task,performance
0,pytorch,0.8472,0.894997,712516.2000000001ms,285.0065ms,128,2500,mrpc,1.0x
1,quantized,0.8348,0.887802,205222.3ms,82.0889ms,128,2500,mrpc,3.4719x
2,traced_quantized,0.8348,0.887802,175211.6ms,70.0846ms,128,2500,mrpc,4.0666x


In [129]:
df.to_csv('results_2500_samples_128_length_mrpc.csv')