# Tracing and Quantization with PyTorch and Torchscript

## [(PROTOTYPE) FX GRAPH MODE QUANTIZATION USER GUIDE](https://pytorch.org/tutorials/prototype/fx_graph_mode_quant_guide.html)

In [1]:
!pip install --no-cache-dir torch==1.9.0 -f https://download.pytorch.org/whl/torch_stable.html 

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.9.0
  Downloading https://download.pytorch.org/whl/rocm4.2/torch-1.9.0%2Brocm4.2-cp36-cp36m-linux_x86_64.whl (995.4 MB)
[K     |████████████████████████████████| 995.4 MB 69.6 MB/s eta 0:00:01     |█████████████████████▊          | 676.7 MB 54.0 MB/s eta 0:00:06     |██████████████████████████████▎ | 942.8 MB 21.5 MB/s eta 0:00:03
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.4.0
    Uninstalling torch-1.4.0:
      Successfully uninstalled torch-1.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 1.0.61 requires nvidia-ml-py3, which is not installed.[0m
Successfully installed torch-1.9.0+rocm4.2


In [11]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-1.8.0-py3-none-any.whl (237 kB)
[K     |████████████████████████████████| 237 kB 21.1 MB/s eta 0:00:01
Collecting tqdm<4.50.0,>=4.27
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 11.1 MB/s eta 0:00:01
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp36-cp36m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 60.6 MB/s eta 0:00:01
Collecting pyarrow<4.0.0,>=1.0.0
  Downloading pyarrow-3.0.0-cp36-cp36m-manylinux2014_x86_64.whl (20.7 MB)
[K     |████████████████████████████████| 20.7 MB 59.0 MB/s eta 0:00:01
Installing collected packages: tqdm, xxhash, pyarrow, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.57.0
    Uninstalling tqdm-4.57.0:
      Successfully uninstalled tqdm-4.57.0
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 4.0.0
    Uninstalling pyarrow-4.0.0:
      Successfully uninstall

## load a bert model

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Build tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=False)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [5]:
# Setup some example inputs
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

max_length=128
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
example_inputs_paraphrase = paraphrase['input_ids'], paraphrase['attention_mask'], paraphrase['token_type_ids']


In [6]:
 model(**paraphrase)[0][0].argmax().item()

1

## eager quantization

In [7]:
import torch
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
print(quantized_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout): Dropout(p=0.1, inplace=False)
            )
      

In [8]:
import os
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 433.328557
Size (MB): 176.806533


In [15]:
from datasets import load_dataset,load_metric
import time

task = "mrpc"
split="validation"
all_datasets = load_dataset("glue", task)
metric = load_metric("glue", task)
dataset= all_datasets[split]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
max_length=128
padding='max_length'

def preprocess_function(examples):
    # Tokenize the texts
    texts = (examples['sentence1'], examples['sentence2'])
    result = tokenizer(*texts, padding=padding, max_length=max_length, truncation=True,return_tensors="pt")
    result["labels"] = examples["label"]
    return result


def do_test(raw_dataset,model,model_type,mode):
    processed_dataset = raw_dataset.map(preprocess_function)
    processed_dataset = processed_dataset.select(range(500))
    model_start = time.perf_counter()
#     model_type = 'neuron' if isinstance(model, torch.jit.ScriptModule) else 'torch'
    if mode == 'no_grad':
        with torch.no_grad():
            for step, batch in enumerate(processed_dataset):
                input_ids = torch.tensor(batch['input_ids'])
                attention_mask = torch.tensor(batch['attention_mask'])
                token_type_ids = torch.tensor(batch['token_type_ids'])
                outputs = model(*[input_ids,attention_mask,token_type_ids])
                predictions = outputs[0][0].argmax().item()
                metric.add_batch(predictions=[predictions],references=[batch["labels"]])
    else:
        with torch.inference_mode():
            for step, batch in enumerate(processed_dataset):
                input_ids = torch.tensor(batch['input_ids'])
                attention_mask = torch.tensor(batch['attention_mask'])
                token_type_ids = torch.tensor(batch['token_type_ids'])
                outputs = model(*[input_ids,attention_mask,token_type_ids])
                predictions = outputs[0][0].argmax().item()
                metric.add_batch(predictions=[predictions],references=[batch["labels"]])
        
    eval_metric = metric.compute()
    model_stop = time.perf_counter()
    total_time = round(model_stop - model_start,4)*1000
    average_time =  round(total_time/len(processed_dataset),4)
    return {'model_type':model_type,
            **eval_metric,
            'total_time':f"{total_time}ms",
            'average_time':f"{average_time}ms",
            'max_length':max_length,
            'samples': len(processed_dataset),
            'task': task,
            'mode':mode
           }   


Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


## Trace quantized model and save it

In [13]:
traced_model = torch.jit.trace(quantized_model, example_inputs_paraphrase)
torch.jit.save(traced_model, "bert_traced_eager_quant.pt")

  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors


In [14]:
model_traced_quantized = torch.jit.load('bert_traced_eager_quant.pt')

In [None]:
# normal
model_res_no_grad=do_test(dataset, model,'pytorch','no_grad')
model_res_inference_mode=do_test(dataset, model,'pytorch','inference_mode')
# quantized
model_quantized_res_no_grad = do_test(dataset, quantized_model,'quantized','no_grad')
model_quantized_res_inference_mode = do_test(dataset, quantized_model,'quantized','inference_mode')
# quantized_traced 
model_traced_quantized_res_no_grad = do_test(dataset, model_traced_quantized,'traced_quantized','no_grad')
model_traced_quantized_res_inference_mode = do_test(dataset, model_traced_quantized,'traced_quantized','inference_mode')

import pandas as pd
df = pd.DataFrame([model_res_no_grad,model_res_inference_mode,model_quantized_res_no_grad,model_quantized_res_inference_mode,model_traced_quantized_res_no_grad,model_traced_quantized_res_inference_mode])
df.head()


  0%|          | 0/408 [00:00<?, ?ex/s]

  0%|          | 0/408 [00:00<?, ?ex/s]

In [128]:
df['performance'] = df.apply(lambda x: str(round(float(df.query('model_type == "pytorch" && mode == "no_grad"')['average_time'][0].replace('ms',''))/float(x['average_time'].replace('ms','')),4))+"x",axis=1)
df.head()


Unnamed: 0,model_type,accuracy,f1,total_time,average_time,max_length,samples,task,performance
0,pytorch,0.8472,0.894997,712516.2000000001ms,285.0065ms,128,2500,mrpc,1.0x
1,quantized,0.8348,0.887802,205222.3ms,82.0889ms,128,2500,mrpc,3.4719x
2,traced_quantized,0.8348,0.887802,175211.6ms,70.0846ms,128,2500,mrpc,4.0666x


In [129]:
df.to_csv('pytorch_1_9_0_inference_mode_vs_no_grad.csv')