# Optimizing Model

## About

- Sample code to load artifacts / run inference
- Quantizing model
- Converting the BERT model with TorchScript

## Import

In [1]:
import pathlib
import sklearn
import datasets
import pandas as pd
import torch

import numpy as np
import transformers
import os
import json
from ts.utils.util  import map_class_to_label
from tqdm import tqdm, trange
import time
import utils

## Load model/tokenizer artifact

In [56]:
base_model= "distilbert-base-uncased"
model_dir =f'../artifacts/model/{base_model}'

load model

In [3]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, return_dict=False)

A utility method provided by TorchServe requires the label ids (the dictionary keys) to be strings as well

In [4]:
id2label_str = {str(key): value for key, value in model.config.id2label.items()}
id2label_str

{'0': 'ACCESSORY',
 '1': 'BOOT',
 '2': 'CELLULAR_PHONE_CASE',
 '3': 'CHAIR',
 '4': 'EARRING',
 '5': 'FINEEARRING',
 '6': 'FINENECKLACEBRACELETANKLET',
 '7': 'FINERING',
 '8': 'GROCERY',
 '9': 'HANDBAG',
 '10': 'HARDWARE_HANDLE',
 '11': 'HAT',
 '12': 'HEALTH_PERSONAL_CARE',
 '13': 'HOME',
 '14': 'HOME_BED_AND_BATH',
 '15': 'HOME_FURNITURE_AND_DECOR',
 '16': 'JANITORIAL_SUPPLY',
 '17': 'KITCHEN',
 '18': 'LAMP',
 '19': 'LIGHT_BULB',
 '20': 'LIGHT_FIXTURE',
 '21': 'OFFICE_PRODUCTS',
 '22': 'OUTDOOR_LIVING',
 '23': 'PET_SUPPLIES',
 '24': 'RUG',
 '25': 'SANDAL',
 '26': 'SHOES',
 '27': 'SOFA',
 '28': 'SPORTING_GOODS',
 '29': 'TABLE',
 '30': 'WALL_ART'}

load tokenizer

In [5]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
                model_dir
            )

use gpu if available

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## Load dataset

In [7]:
dataset_path = '../artifacts/dataset_processed'

In [8]:
raw_datasets = datasets.load_from_disk(dataset_path)

In [9]:
columns = set(raw_datasets['test'].column_names ) - set(['text','label'])
columns

{'brand', 'item_id', 'item_name', 'main_image_id', 'node'}

In [10]:
set(raw_datasets.column_names )

{'test', 'train', 'valid'}

tokenize the entire dataset

In [11]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Uses the notebook-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True``, and returns the tokenizer's output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=columns)

  0%|          | 0/73 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

In [12]:
tokenized_datasets['test'][0].keys()

dict_keys(['attention_mask', 'input_ids', 'label', 'text'])

In [13]:
# size of input length of model

In [14]:
len ( tokenized_datasets['test'][0]['input_ids'] )

512

create a subset of the test dataset.
feel free to use the full dataset if running on GPU

In [15]:
# Number of test rows to evaluate. 100 keeps CPU runs short; to evaluate the
# full split instead, set `subset = tokenized_datasets["test"].num_rows`.
# Clamp so `select` never indexes past the end of a smaller test split.
subset = min(100, tokenized_datasets["test"].num_rows)

# Shuffle with a fixed seed so the same rows are selected on every run.
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(subset))
test_dataset.set_format(type='torch')

In [16]:
test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 100
})

## Predicting on Test dataset

In [17]:
utils.prediction_batch(model,test_dataset,device='cpu' )

100%|██████████| 4/4 [00:10<00:00,  2.51s/it]


{'accuracy': 0.97}

## Optimization: Quantizing model

Pytorch supports three types of Quantization:
1. Dynamic Quantization
2. Static Quantization
3. Quantization Aware Training

In this notebook, we look at Dynamic Quantization.

**Dynamic Quantization** quantizes the weights ahead of time; the activations are quantized on the fly.

Currently Pytorch doesn't support dynamic quantization on GPU 

In [18]:
model.to('cpu')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In the below code, we quantize all the `Linear` layers to int8 from float32.

In [19]:
quantized_model_int8 = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model_int8


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_lin): DynamicQuantizedLinear(in_features=768, out_features=768,

How much file storage did we save?

In [20]:
def print_size_of_model(model):
    """Print the on-disk size, in MB, of ``model``'s serialized state dict.

    The state dict is saved into a private temporary directory, so an existing
    ``temp.p`` in the working directory is never overwritten or deleted, and
    the scratch file is removed even if saving or measuring fails.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

print_size_of_model(model)
print_size_of_model(quantized_model_int8)

Size (MB): 267.947825
Size (MB): 138.734297


The quantized model cut the file size roughly in half, which also translates to less memory and faster execution.

### Benchmark Quantization

Let's measure the speedup, and any loss in accuracy, that we get from quantization.

In [21]:
test_dataset.select([0])

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 1
})

In [22]:
def time_model_evaluation(model, dataset=test_dataset,device:str='cpu'):
    """Run batched prediction over ``dataset`` and print the result and timing.

    Args:
        model: model passed through to ``utils.prediction_batch``.
        dataset: dataset to evaluate; defaults to the ``test_dataset`` object
            that existed when this cell was executed.
        device: device name forwarded to ``utils.prediction_batch``.

    Prints the value returned by ``utils.prediction_batch`` followed by the
    total elapsed time in milliseconds. Nothing is returned.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval is not affected by system clock adjustments.
    eval_start_time = time.perf_counter()
    result = utils.prediction_batch(model, dataset, device=device)
    eval_duration_time_ms = (time.perf_counter() - eval_start_time) * 1_000

    print(result)
    print("Evaluate total time (ms): {0:.1f}".format(eval_duration_time_ms))
    
def time_model_evaluation_single(model,dataset=test_dataset, device:str='cpu'):
    """Time a single forward pass of ``model`` on the first row of ``dataset``.

    The model and the inputs are moved to ``device`` before the timer starts,
    so only the forward pass itself is measured.

    Args:
        model: callable model accepting ``(input_ids, attention_mask)``.
        dataset: dataset exposing ``input_ids`` and ``attention_mask`` columns;
            defaults to the ``test_dataset`` object that existed when this cell
            was executed.
        device: device to run on, e.g. ``'cpu'`` or ``'cuda'``.

    Returns:
        float: duration of the forward pass in milliseconds.
    """
    model = model.to(device)

    # Slice with 0:1 (rather than index with 0) so a single-row batch is passed.
    tokens_tensor = dataset['input_ids'][0:1].to(device)
    masks_tensors = dataset['attention_mask'][0:1].to(device)

    on_cuda = torch.device(device).type == 'cuda'

    # Inference only: skip autograd bookkeeping so it does not skew the timing.
    with torch.no_grad():
        # CUDA kernels are launched asynchronously; synchronize before and
        # after so the timer brackets the actual computation.
        if on_cuda:
            torch.cuda.synchronize()
        eval_start_time = time.perf_counter()

        model(tokens_tensor, masks_tensors)

        if on_cuda:
            torch.cuda.synchronize()
        eval_duration_time_ms = (time.perf_counter() - eval_start_time) * 1_000

    return eval_duration_time_ms

Evaluate the original FP32 BERT model on gpu

In [23]:
time_model_evaluation(model, device='cuda')

100%|██████████| 4/4 [00:01<00:00,  2.09it/s]

{'accuracy': 0.97}
Evaluate total time (ms): 6096.2





In [24]:
%%timeit -r 3 -n 5

time_model_evaluation_single(model, device='cuda')

17.4 ms ± 1.39 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)


Evaluate the original FP32 BERT model on cpu

In [25]:
time_model_evaluation(model, device='cpu')

100%|██████████| 4/4 [00:10<00:00,  2.61s/it]

{'accuracy': 0.97}
Evaluate total time (ms): 12338.8





In [26]:
%%timeit -r 3 -n 5
time_model_evaluation_single(model, device='cpu')

116 ms ± 1.32 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)


Evaluate the quantized int8 BERT model

In [27]:
time_model_evaluation(quantized_model_int8, device='cpu')

100%|██████████| 4/4 [00:08<00:00,  2.06s/it]

{'accuracy': 0.97}
Evaluate total time (ms): 9957.0





In [28]:
%%timeit -r 5 -n 10
time_model_evaluation_single(quantized_model_int8, device='cpu')

93.3 ms ± 6.21 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


Dynamic int8 quantization is not supported on GPU

In [29]:
# time_model_evaluation(quantized_model_int8, device='cuda')
"""
Could not run 'quantized::linear_dynamic' with arguments from the 'CUDA' backend. 
This could be because the operator doesn't exist for this backend, or was omitted during 
the selective/custom build process (if using custom build). 
If you are a Facebook employee using PyTorch on mobile, 
please visit https://fburl.com/ptmfixes for possible resolutions. 
'quantized::linear_dynamic' is only available for these backends: [CPU, BackendSelect, Named, 
ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, 
AutogradMLC, Tracer, Autocast, Batched, VmapMode].
"""

"\nCould not run 'quantized::linear_dynamic' with arguments from the 'CUDA' backend. \nThis could be because the operator doesn't exist for this backend, or was omitted during \nthe selective/custom build process (if using custom build). \nIf you are a Facebook employee using PyTorch on mobile, \nplease visit https://fburl.com/ptmfixes for possible resolutions. \n'quantized::linear_dynamic' is only available for these backends: [CPU, BackendSelect, Named, \nADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, \nAutogradMLC, Tracer, Autocast, Batched, VmapMode].\n"

| Model | device | Accuracy | Time (ms) |
| --- | --- | --- |--- |
| original fp32 | cpu | .918 | 116 |
| original fp32 | gpu | .918 | 17 |
| quantized int8 | cpu | .919 | 83 |













The above results were measured with the configuration below.  
You might need to reduce the number of OpenMP threads if you introduce parallelism in your application.

In [30]:
print ( torch.__config__.parallel_info())

ATen/Parallel:
	at::get_num_threads() : 8
	at::get_num_interop_threads() : 8
OpenMP 201511 (a.k.a. OpenMP 4.5)
	omp_get_max_threads() : 8
Intel(R) oneAPI Math Kernel Library Version 2021.3-Product Build 20210617 for Intel(R) 64 architecture applications
	mkl_get_max_threads() : 8
Intel(R) MKL-DNN v2.1.2 (Git Hash 98be7e8afa711dc9b66c8ff3504129cb82013cdb)
std::thread::hardware_concurrency() : 16
Environment variables:
	OMP_NUM_THREADS : [not set]
	MKL_NUM_THREADS : [not set]
ATen parallel backend: OpenMP



## Optimization: TorchScript

TorchScript is a way to create serializable and optimizable models from PyTorch code

load model as torchscript

According to the [Hugging Face docs](https://huggingface.co/transformers/torchscript.html), the `torchscript` flag is needed for the following reason:

>This flag is necessary because most of the language models in this repository have tied weights between their Embedding layer and their Decoding layer. 
>
>TorchScript does not allow the export of models that have tied weights, it is therefore necessary to untie the weights beforehand.
>
>This implies that models instantiated with the torchscript flag have their Embedding layer and Decoding layer separate, which means that they should not be trained down the line. Training would de-synchronize the two layers, leading to unexpected results.
>
>This is not the case for models that do not have a Language Model head, as those do not have tied weights. 
These models can be safely exported without the torchscript flag.

In [31]:
# `torchscript=True` is a model-config flag that prepares the model for export.
script_model = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_dir, torchscript=True, return_dict=False
)
# `torchscript` is not a tokenizer option, so it is not passed here.
script_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)

create a dummy input to pass to our model

In [32]:
text = "men shoes"
# Call the tokenizer directly (`encode_plus` is deprecated), matching how the
# dataset is tokenized earlier in this notebook.
res = script_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)

In [33]:
tokens_tensor = res['input_ids']
masks_tensors = res['attention_mask']

dummy_input = [tokens_tensor, masks_tensors]

In [34]:
dummy_input

[tensor([[ 101, 2273, 6007,  102,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0, 

Creating the trace by passing a dummy input

In [35]:
script_model = script_model.to(device)

tokens_tensor = tokens_tensor.to(device)
masks_tensors = masks_tensors.to(device)

traced_model = torch.jit.trace(script_model, [tokens_tensor, masks_tensors])

Creating the trace of quantized model by passing a dummy input

In [36]:
# Dynamic quantization only runs on CPU, so move the model there first.
script_model = script_model.to('cpu')
quantized_model_int8 = torch.quantization.quantize_dynamic(
    script_model, {torch.nn.Linear}, dtype=torch.qint8
)

# Trace with CPU copies of the example inputs so they match the model's device.
dummy_input = (tokens_tensor.to('cpu'), masks_tensors.to('cpu'))
traced_model_int8 = torch.jit.trace(quantized_model_int8, dummy_input)

Let's compute how long it takes to predict with the traced model

In [37]:
time_model_evaluation(traced_model, device='cpu')
#prediction_batch(traced_model,test_dataset )

100%|██████████| 4/4 [00:10<00:00,  2.64s/it]

{'accuracy': 0.97}
Evaluate total time (ms): 13128.4





In [38]:
%%timeit -r 3 -n 5
time_model_evaluation_single(traced_model, device='cpu')

124 ms ± 1.68 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)


In [39]:
time_model_evaluation(traced_model, device='cuda')

100%|██████████| 4/4 [00:01<00:00,  2.05it/s]

{'accuracy': 0.97}
Evaluate total time (ms): 3444.9





In [40]:
%%timeit -r 3 -n 5
time_model_evaluation_single(traced_model, device='cuda')

17.6 ms ± 1.43 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)


In [41]:
%%timeit -r 3 -n 5
time_model_evaluation_single(traced_model_int8, device='cpu')

84 ms ± 2.75 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)


In [42]:
utils.prediction( model=traced_model_int8
                 , tokens_tensor=tokens_tensor
                 , masks_tensors=masks_tensors 
                 , id2label_str=id2label_str)

[{'SHOES': 0.8049256801605225,
  'SANDAL': 0.06582888960838318,
  'GROCERY': 0.060891248285770416,
  'HEALTH_PERSONAL_CARE': 0.030060164630413055,
  'BOOT': 0.02072826586663723}]

| Model | device | Accuracy | Time (ms) |
| --- | --- | --- |--- |
| original fp32 | cpu | .918 | 130 |
| original fp32 (trace) | cpu | .918 | 124 |
| original fp32 | gpu | .918 | 22 |
| original fp32 (trace) | gpu | .918 | 18 |
| quantized int8 | cpu | .919 | 104 |
| quantized int8 (trace) | cpu | .919 | 89.5 |

Save / load  traced model

In [43]:
model_dir_trace = f'{model_dir}__trace'

In [44]:
# Create the output directory from Python instead of a shell escape, so it works on any OS.
os.makedirs(model_dir_trace, exist_ok=True)

In [45]:
torch.jit.save(traced_model, f"{model_dir_trace}/traced_model.pt")

In [46]:
loaded_model = torch.jit.load( f"{model_dir_trace}/traced_model.pt")
loaded_model.eval()

RecursiveScriptModule(
  original_name=DistilBertForSequenceClassification
  (distilbert): RecursiveScriptModule(
    original_name=DistilBertModel
    (embeddings): RecursiveScriptModule(
      original_name=Embeddings
      (word_embeddings): RecursiveScriptModule(original_name=Embedding)
      (position_embeddings): RecursiveScriptModule(original_name=Embedding)
      (LayerNorm): RecursiveScriptModule(original_name=LayerNorm)
      (dropout): RecursiveScriptModule(original_name=Dropout)
    )
    (transformer): RecursiveScriptModule(
      original_name=Transformer
      (layer): RecursiveScriptModule(
        original_name=ModuleList
        (0): RecursiveScriptModule(
          original_name=TransformerBlock
          (attention): RecursiveScriptModule(
            original_name=MultiHeadSelfAttention
            (dropout): RecursiveScriptModule(original_name=Dropout)
            (q_lin): RecursiveScriptModule(original_name=Linear)
            (k_lin): RecursiveScriptModule(origi

## Save Artifacts

For the next section, let us save the class label

In [47]:
with open(f'{model_dir}/index_to_name.json','w') as f:
    json.dump(id2label_str,f)

In [48]:
with open(f'{model_dir_trace}/index_to_name.json','w') as f:
    json.dump(id2label_str,f)

In a separate JSON config, let us store things like:
- was lower case used by the tokenizer
- max length of tokenizer
- num labels
- is the model a jit

In [49]:
tokenizer.model_max_length

512

In [50]:
tokenizer.max_len_single_sentence

510

In [51]:
tokenizer.save_pretrained(model_dir_trace)

('../artifacts/model/distilbert-base-uncased__trace/tokenizer_config.json',
 '../artifacts/model/distilbert-base-uncased__trace/special_tokens_map.json',
 '../artifacts/model/distilbert-base-uncased__trace/vocab.txt',
 '../artifacts/model/distilbert-base-uncased__trace/added_tokens.json',
 '../artifacts/model/distilbert-base-uncased__trace/tokenizer.json')

In [57]:
setup_config = {
 "model_name":"pt-original",
 "do_lower_case": tokenizer.do_lower_case,
 "num_labels":len(id2label_str),
 "save_mode":"original",
 "max_length":tokenizer.model_max_length,
 "captum_explanation": True,
 "base_model": base_model,
 "top_k": 5   

}

setup_config



{'model_name': 'pt-original',
 'do_lower_case': True,
 'num_labels': 31,
 'save_mode': 'original',
 'max_length': 512,
 'captum_explanation': True,
 'base_model': 'distilbert-base-uncased',
 'top_k': 5}

In [58]:
with open(f'{model_dir}/setup_config.json','w') as f:
    json.dump(setup_config,f)

In [59]:
setup_config_trace = {**setup_config}
setup_config_trace['model_name'] = "pt-jit"
setup_config_trace['captum_explanation'] = False
setup_config_trace['save_mode'] = 'jit'
setup_config_trace

{'model_name': 'pt-jit',
 'do_lower_case': True,
 'num_labels': 31,
 'save_mode': 'jit',
 'max_length': 512,
 'captum_explanation': False,
 'base_model': 'distilbert-base-uncased',
 'top_k': 5}

In [60]:
with open(f'{model_dir_trace}/setup_config.json','w') as f:
    json.dump(setup_config_trace,f)