# Optimizing Model

## About

- Sample code to load artifacts / run inference
- Quantizing model
- Converting the Bert model with torch script

## Import

In [55]:
import pathlib
import sklearn
import datasets
import pandas as pd
import torch

import numpy as np
import transformers
import os
import json
from ts.utils.util  import map_class_to_label
from tqdm import tqdm, trange
import time

## Load model/tokenizer artifact

In [2]:
model_dir ='../artifacts/model/amazon/'

load model

In [3]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)

In [4]:
model.config

DistilBertConfig {
  "_name_or_path": "../artifacts/model/amazon/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ABIS_DRUGSTORE",
    "1": "ABIS_LAWN_AND_GARDEN",
    "2": "ACCESSORY",
    "3": "ACCESSORY_OR_PART_OR_SUPPLY",
    "4": "AUTO_ACCESSORY",
    "5": "BABY_PRODUCT",
    "6": "BACKPACK",
    "7": "BATTERY",
    "8": "BEAUTY",
    "9": "BED",
    "10": "BED_FRAME",
    "11": "BENCH",
    "12": "BISS",
    "13": "BOOT",
    "14": "BRACELET",
    "15": "BREAD",
    "16": "CABINET",
    "17": "CELLULAR_PHONE_CASE",
    "18": "CHAIR",
    "19": "CHARGING_ADAPTER",
    "20": "CLEANING_AGENT",
    "21": "CLOCK",
    "22": "CLOTHES_HANGER",
    "23": "COFFEE",
    "24": "COMPUTER_ADD_ON",
    "25": "COMPUTER_COMPONENT",
    "26": "DAIRY_BASED_DRINK",
    "27": "DESK",
    "28": "DRINKING_CUP",
    "29": "EARRING",
    "30": "EDIBLE_

The label-name → id and id → label-name mappings are stored as part of the model artifact.

In [5]:
labels = model.config.label2id
labels

{'ABIS_DRUGSTORE': 0,
 'ABIS_LAWN_AND_GARDEN': 1,
 'ACCESSORY': 2,
 'ACCESSORY_OR_PART_OR_SUPPLY': 3,
 'AUTO_ACCESSORY': 4,
 'BABY_PRODUCT': 5,
 'BACKPACK': 6,
 'BATTERY': 7,
 'BEAUTY': 8,
 'BED': 9,
 'BED_FRAME': 10,
 'BENCH': 11,
 'BISS': 12,
 'BOOT': 13,
 'BRACELET': 14,
 'BREAD': 15,
 'CABINET': 16,
 'CELLULAR_PHONE_CASE': 17,
 'CHAIR': 18,
 'CHARGING_ADAPTER': 19,
 'CLEANING_AGENT': 20,
 'CLOCK': 21,
 'CLOTHES_HANGER': 22,
 'COFFEE': 23,
 'COMPUTER_ADD_ON': 24,
 'COMPUTER_COMPONENT': 25,
 'DAIRY_BASED_DRINK': 26,
 'DESK': 27,
 'DRINKING_CUP': 28,
 'EARRING': 29,
 'EDIBLE_OIL_VEGETABLE': 30,
 'ELECTRONIC_ADAPTER': 31,
 'FILE_FOLDER': 32,
 'FINEEARRING': 33,
 'FINENECKLACEBRACELETANKLET': 34,
 'FINEOTHER': 35,
 'FINERING': 36,
 'FLAT_SCREEN_DISPLAY_MOUNT': 37,
 'FLAT_SHEET': 38,
 'FOOD_SERVICE_SUPPLY': 39,
 'FURNITURE': 40,
 'FURNITURE_COVER': 41,
 'GROCERY': 42,
 'HANDBAG': 43,
 'HARDWARE': 44,
 'HARDWARE_HANDLE': 45,
 'HAT': 46,
 'HEADBOARD': 47,
 'HEADPHONES': 48,
 'HEALTH_PERSON

In [6]:
id2label = model.config.id2label

In [7]:
id2label

{0: 'ABIS_DRUGSTORE',
 1: 'ABIS_LAWN_AND_GARDEN',
 2: 'ACCESSORY',
 3: 'ACCESSORY_OR_PART_OR_SUPPLY',
 4: 'AUTO_ACCESSORY',
 5: 'BABY_PRODUCT',
 6: 'BACKPACK',
 7: 'BATTERY',
 8: 'BEAUTY',
 9: 'BED',
 10: 'BED_FRAME',
 11: 'BENCH',
 12: 'BISS',
 13: 'BOOT',
 14: 'BRACELET',
 15: 'BREAD',
 16: 'CABINET',
 17: 'CELLULAR_PHONE_CASE',
 18: 'CHAIR',
 19: 'CHARGING_ADAPTER',
 20: 'CLEANING_AGENT',
 21: 'CLOCK',
 22: 'CLOTHES_HANGER',
 23: 'COFFEE',
 24: 'COMPUTER_ADD_ON',
 25: 'COMPUTER_COMPONENT',
 26: 'DAIRY_BASED_DRINK',
 27: 'DESK',
 28: 'DRINKING_CUP',
 29: 'EARRING',
 30: 'EDIBLE_OIL_VEGETABLE',
 31: 'ELECTRONIC_ADAPTER',
 32: 'FILE_FOLDER',
 33: 'FINEEARRING',
 34: 'FINENECKLACEBRACELETANKLET',
 35: 'FINEOTHER',
 36: 'FINERING',
 37: 'FLAT_SCREEN_DISPLAY_MOUNT',
 38: 'FLAT_SHEET',
 39: 'FOOD_SERVICE_SUPPLY',
 40: 'FURNITURE',
 41: 'FURNITURE_COVER',
 42: 'GROCERY',
 43: 'HANDBAG',
 44: 'HARDWARE',
 45: 'HARDWARE_HANDLE',
 46: 'HAT',
 47: 'HEADBOARD',
 48: 'HEADPHONES',
 49: 'HEALTH_PE

A utility method provided by TorchServe requires the label ids (the dictionary keys) to be strings as well.

In [8]:
id2label_str = {str(key): value for key, value in id2label.items()}
id2label_str

{'0': 'ABIS_DRUGSTORE',
 '1': 'ABIS_LAWN_AND_GARDEN',
 '2': 'ACCESSORY',
 '3': 'ACCESSORY_OR_PART_OR_SUPPLY',
 '4': 'AUTO_ACCESSORY',
 '5': 'BABY_PRODUCT',
 '6': 'BACKPACK',
 '7': 'BATTERY',
 '8': 'BEAUTY',
 '9': 'BED',
 '10': 'BED_FRAME',
 '11': 'BENCH',
 '12': 'BISS',
 '13': 'BOOT',
 '14': 'BRACELET',
 '15': 'BREAD',
 '16': 'CABINET',
 '17': 'CELLULAR_PHONE_CASE',
 '18': 'CHAIR',
 '19': 'CHARGING_ADAPTER',
 '20': 'CLEANING_AGENT',
 '21': 'CLOCK',
 '22': 'CLOTHES_HANGER',
 '23': 'COFFEE',
 '24': 'COMPUTER_ADD_ON',
 '25': 'COMPUTER_COMPONENT',
 '26': 'DAIRY_BASED_DRINK',
 '27': 'DESK',
 '28': 'DRINKING_CUP',
 '29': 'EARRING',
 '30': 'EDIBLE_OIL_VEGETABLE',
 '31': 'ELECTRONIC_ADAPTER',
 '32': 'FILE_FOLDER',
 '33': 'FINEEARRING',
 '34': 'FINENECKLACEBRACELETANKLET',
 '35': 'FINEOTHER',
 '36': 'FINERING',
 '37': 'FLAT_SCREEN_DISPLAY_MOUNT',
 '38': 'FLAT_SHEET',
 '39': 'FOOD_SERVICE_SUPPLY',
 '40': 'FURNITURE',
 '41': 'FURNITURE_COVER',
 '42': 'GROCERY',
 '43': 'HANDBAG',
 '44': 'HARDWARE'

load tokenizer

In [9]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
                model_dir
            )

use gpu if available

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## Test Prediction

query we are predicting on

In [11]:
query = 'men shoes for work'
query  = "men vitamins"
query  = "herbal TEA"

get input and attention mask from the tokenizer

In [12]:
res = tokenizer.encode_plus(query, return_tensors="pt",  padding="max_length", truncation=True)
res

{'input_ids': tensor([[  101, 27849,  5572,   102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [14]:
%%time
model_res = model(**res)
model_res

CPU times: user 86.9 ms, sys: 25.9 ms, total: 113 ms
Wall time: 15.2 ms


SequenceClassifierOutput(loss=None, logits=tensor([[ -7.6677,  -8.9506,  -7.0845,  -5.2786,  -7.1656,  -6.1171,  -9.9482,
          -9.1212,  -2.7499,  -7.2937,  -8.6023,  -8.4928,  -5.4319,  -8.9995,
          -5.5602,  -1.6690,  -6.5178,  -8.3231,  -7.7803,  -9.7673,  -6.5894,
          -8.0236, -10.5098,   0.4534,  -8.5425,  -8.0849,  -3.6514,  -8.6150,
          -3.0538,  -7.3485,  -1.2094,  -8.9694, -10.4731,  -6.9120,  -5.5614,
          -6.1381,  -8.4838,  -9.5964,  -9.0998,  -4.2063,  -5.1919,  -8.1918,
           4.2027,  -7.9540,  -6.4338,  -8.7222,  -7.1057,  -9.4445,  -9.8354,
          -2.9263,   0.4510,  -5.4933,  -5.2048,  -6.2273,  -5.9204,  -8.1743,
         -10.3333,  -7.3300,  -8.0885,  -5.3360, -11.0123,  -7.2948,  -2.0672,
          -8.7549,  -8.7169,  -8.9316,  -4.8772,  -6.4113,  -4.3336, -11.0367,
          -7.4192,  -9.4832,  -6.4128,  -3.0107,  -5.8093,  -6.4888,  -7.1651,
          -7.2252,  -9.9273,  -9.3188,  -7.8169,  -7.1867,  -7.4152,  -9.8344,
         

decode the predictions

In [15]:
topk = 5
ps = torch.nn.functional.softmax(model_res.logits, dim=1)
probs, classes = torch.topk(ps, topk, dim=1)
probs = probs.tolist()
classes = classes.tolist()

In [16]:
probs, classes

([[0.7232240438461304,
   0.25969767570495605,
   0.006111669819802046,
   0.0060967495664954185,
   0.0011588569032028317,
   0.000731807027477771,
   0.0005764953093603253,
   0.0004914401797577739,
   0.0002766131074167788,
   0.00024830058100633323,
   0.00020814224262721837,
   0.00019130650616716594,
   0.00018323035328648984,
   0.00010350668162573129,
   0.00010080193169414997,
   9.626983955968171e-05,
   5.7873698096955195e-05,
   5.095693632028997e-05,
   2.9587761673610657e-05,
   2.1597934392048046e-05]],
 [[101,
   42,
   23,
   50,
   30,
   15,
   90,
   62,
   89,
   8,
   49,
   73,
   28,
   106,
   26,
   84,
   39,
   68,
   66,
   40]])

helper method from torchserve

In [17]:
map_class_to_label(probs, id2label_str, classes)

[{'TEA': 0.7232240438461304,
  'GROCERY': 0.25969767570495605,
  'COFFEE': 0.006111669819802046,
  'HERB': 0.0060967495664954185,
  'EDIBLE_OIL_VEGETABLE': 0.0011588569032028317,
  'BREAD': 0.000731807027477771,
  'SKIN_MOISTURIZER': 0.0005764953093603253,
  'LEGUME': 0.0004914401797577739,
  'SKIN_CLEANING_AGENT': 0.0002766131074167788,
  'BEAUTY': 0.00024830058100633323,
  'HEALTH_PERSONAL_CARE': 0.00020814224262721837,
  'PANTRY': 0.00019130650616716594,
  'DRINKING_CUP': 0.00018323035328648984,
  'VITAMIN': 0.00010350668162573129,
  'DAIRY_BASED_DRINK': 0.00010080193169414997,
  'SAUCE': 9.626983955968171e-05,
  'FOOD_SERVICE_SUPPLY': 5.7873698096955195e-05,
  'NUTRITIONAL_SUPPLEMENT': 5.095693632028997e-05,
  'MEDICATION': 2.9587761673610657e-05,
  'FURNITURE': 2.1597934392048046e-05}]

## Load dataset

In [20]:
dataset_path = '../artifacts/dataset_processed/amazon'

In [21]:
raw_datasets = datasets.load_from_disk(dataset_path)

In [22]:
columns = set(raw_datasets['test'].column_names ) - set(['text','label'])
columns

{'brand', 'item_id', 'item_name', 'main_image_id', 'node'}

In [23]:
set(raw_datasets.column_names )

{'test', 'train', 'valid'}

tokenize the entire dataset

In [24]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook's ``tokenizer``.

    Sequences are padded with ``padding="max_length"`` and truncated when too
    long, so every row comes back with the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=columns)

  0%|          | 0/84 [00:00<?, ?ba/s]

Loading cached processed dataset at ../artifacts/dataset_processed/amazon/test/cache-07925e711c10b453.arrow
Loading cached processed dataset at ../artifacts/dataset_processed/amazon/valid/cache-07925e711c10b453.arrow


In [25]:
tokenized_datasets['test'][0].keys()

dict_keys(['attention_mask', 'input_ids', 'label', 'text'])

In [None]:
# size of input length of model

In [26]:
len ( tokenized_datasets['test'][0]['input_ids'] )

512

create a subset of the test dataset.
feel free to use the full dataset if running on GPU

In [27]:
# Number of test rows to evaluate on; set to None to use the full test split
# (practical when running on a GPU).
subset = 1_000

# Never ask for more rows than the split contains, otherwise `select` fails.
num_test_rows = tokenized_datasets["test"].num_rows
subset = num_test_rows if subset is None else min(subset, num_test_rows)

# Fixed seed so the same subset is drawn on every run.
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(subset))
test_dataset.set_format(type='torch')

In [28]:
test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text'],
    num_rows: 1000
})

## Predicting on Test dataset

Predicting using the hugging face trainer object.     

Optimized Batch inference workflow

In [29]:
# Define test trainer
test_trainer = transformers.Trainer(model) 
# Make prediction
raw_predictions, raw_label_ids, raw_metrics = test_trainer.predict(test_dataset) 
# Preprocess raw predictions
y_pred = np.argmax(raw_predictions, axis=1)

The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


In [30]:
metric_accuracy = datasets.load_metric('accuracy')

In [31]:
metric_accuracy.compute(predictions = y_pred, references = test_dataset['label'])

{'accuracy': 0.918}

In [32]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

Predicting using the model `__call__` dunder method

need to make sure batches can fit in memory

In [33]:
model( input_ids = test_dataset['input_ids'][0:5].to(device) 
      , attention_mask = test_dataset['attention_mask'][0:5].to(device) )

SequenceClassifierOutput(loss=None, logits=tensor([[ -5.0694,  -5.1765,  -8.7472,  -0.8668,  -5.7447,  -6.1302,  -7.2811,
          -8.6489,  -5.1904,  -4.6450,  -4.6303,  -2.8763,  -6.6729,  -4.6171,
          -6.1057,  -6.2088,  -5.0244,  11.6435,  -4.6505,  -6.0849,  -9.2568,
          -4.3728,  -8.5708,  -5.4817,  -4.7679,  -5.2058,  -9.7425,  -4.9433,
          -6.2621,  -7.7707,  -6.8868,  -9.0698,  -9.1490,  -6.0963,  -5.7131,
          -7.2520,  -7.7580,  -6.5021,  -4.8559, -10.4029,  -2.2170,  -4.1551,
          -4.8708,  -6.6222,  -8.4910,  -8.9385, -10.5362,  -9.2823,  -4.3964,
          -4.1102, -10.8039,  -2.9993,  -2.7675,  -3.8705,  -7.8226,  -4.2509,
          -8.1575, -11.0981,  -6.9510,  -6.2460,  -9.7539, -10.3530, -10.3915,
          -8.3798,  -8.6267,  -6.7258,  -5.2902,  -5.5456,  -7.4572,  -6.7893,
          -6.6350,  -5.5757,  -6.1036,  -8.0627,  -5.0114,  -5.9468,  -7.6404,
          -5.8532,  -4.6515,  -8.7958,  -2.2132,  -5.7307, -11.2271,  -8.5976,
         

Helper method that we can use to compute accuracy on our full dataset.

We can't use the HF Trainer object here, because it is hard to use with a TorchScript model.

In [35]:
def prediction_batch(model, dataset, device = device, batch_size = 8):
    """Run batched inference over ``dataset`` and return its accuracy.

    The logits are read positionally (``res[0]``) rather than through a
    ``.logits`` attribute, because the output of a TorchScript model does not
    have that property.

    Args:
        model: callable accepting ``input_ids`` and ``attention_mask`` keyword
            tensors and returning a sequence whose first element is the logits.
        dataset: torch-formatted dataset exposing the ``input_ids``,
            ``attention_mask`` and ``label`` columns.
        device: device the model and every batch are moved to.
        batch_size: number of rows per forward pass; keep it small enough for
            one batch to fit in memory.

    Returns:
        Whatever ``metric_accuracy.compute`` returns for the whole dataset.
    """
    num_rows = len(dataset)
    all_y_preds = []

    # make sure model is in eval mode
    model.eval()

    # move the model to the cpu/gpu device
    model = model.to(device)

    # Read each column once instead of on every iteration: the column lookup
    # is loop-invariant and may materialise the whole column on each access.
    all_input_ids = dataset['input_ids']
    all_attention_masks = dataset['attention_mask']

    # inference only: no gradients needed for any batch
    with torch.no_grad():
        # iterate our dataset in batches
        for ndx in trange(0, num_rows, batch_size):

            # take the precomputed input ids and attention masks for this batch
            input_ids = all_input_ids[ndx:ndx+batch_size].to(device)
            attention_mask = all_attention_masks[ndx:ndx+batch_size].to(device)

            res = model( input_ids = input_ids, attention_mask = attention_mask )

            # output of torchscript model doesn't have logits property,
            # so take the first element instead of `res.logits`
            logits = res[0].detach().cpu().numpy()

            y_preds = np.argmax(logits, axis=1)

            all_y_preds.extend(y_preds)

    # accuracy on whole dataset
    accuracy = metric_accuracy.compute(predictions = all_y_preds, references = dataset['label'])

    return accuracy

In [51]:
prediction_batch(model,test_dataset,device='cpu' )

100%|██████████| 125/125 [01:49<00:00,  1.14it/s]


{'accuracy': 0.918}

## Optimization: Quantizing model

PyTorch supports three types of quantization:
1. Dynamic Quantization
2. Static Quantization
3. Quantization Aware Training

In this notebook, we look at Dynamic Quantization.

**Dynamic Quantization** quantizes the weights ahead of time. The activations are quantized on the fly.

Currently PyTorch doesn't support dynamic quantization on GPU.

In [41]:
model.to('cpu')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In the below code, we quantize all the `Linear` layers to int8 from float32.

In [47]:
quantized_model_int8 = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model_int8


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (k_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (v_lin): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
            (out_lin): DynamicQuantizedLinear(in_features=768, out_features=768,

In [None]:
# How much file storage did we gain?

In [49]:
def print_size_of_model(model):
    """Print the size, in megabytes, of ``model``'s serialized state dict.

    The state dict is saved to a throw-away file inside a private temporary
    directory. This keeps the working directory untouched (an existing
    ``temp.p`` there is neither overwritten nor deleted) and guarantees the
    file is cleaned up even if saving fails.

    Args:
        model: any object exposing ``state_dict()`` that ``torch.save`` can
            serialize.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "temp.p")
        torch.save(model.state_dict(), tmp_path)
        print('Size (MB):', os.path.getsize(tmp_path)/1e6)

print_size_of_model(model)
print_size_of_model(quantized_model_int8)

Size (MB): 268.197041
Size (MB): 268.225625
Size (MB): 138.796889


In [None]:
# The quantized model roughly halves the file size, which also translates to lower memory use and faster execution.

### Benchmark Quantization

Let's measure how much speedup we gained from quantization, and how much accuracy we may have lost.

In [60]:
def time_model_evaluation(model, dataset=test_dataset,device:str='cpu'):
    """Evaluate ``model`` on ``dataset`` and print its accuracy and run time.

    Args:
        model: model passed straight through to ``prediction_batch``.
        dataset: dataset to evaluate on; defaults to the notebook's
            ``test_dataset`` as bound when this function was defined.
        device: device string forwarded to ``prediction_batch``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    eval_start_time = time.perf_counter()

    result = prediction_batch(model , dataset, device=device )

    eval_duration_time = time.perf_counter() - eval_start_time
    print(result)
    print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))

Evaluate the original FP32 BERT model on gpu

In [65]:
time_model_evaluation(model, device='cuda')

100%|██████████| 125/125 [00:16<00:00,  7.57it/s]

{'accuracy': 0.918}
Evaluate total time (seconds): 16.6





Evaluate the original FP32 BERT model on cpu

In [63]:
time_model_evaluation(model, device='cpu')

100%|██████████| 125/125 [01:49<00:00,  1.15it/s]

{'accuracy': 0.918}
Evaluate total time (seconds): 109.3





Evaluate the quantized int8 BERT model

In [67]:
time_model_evaluation(quantized_model_int8, device='cpu')

100%|██████████| 125/125 [01:32<00:00,  1.35it/s]

{'accuracy': 0.919}
Evaluate total time (seconds): 92.8





Dynamic int8 quantization is not supported on GPU

In [3]:
# time_model_evaluation(quantized_model_int8, device='cuda')
"""
Could not run 'quantized::linear_dynamic' with arguments from the 'CUDA' backend. 
This could be because the operator doesn't exist for this backend, or was omitted during 
the selective/custom build process (if using custom build). 
If you are a Facebook employee using PyTorch on mobile, 
please visit https://fburl.com/ptmfixes for possible resolutions. 
'quantized::linear_dynamic' is only available for these backends: [CPU, BackendSelect, Named, 
ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, 
AutogradMLC, Tracer, Autocast, Batched, VmapMode].
"""

"\nCould not run 'quantized::linear_dynamic' with arguments from the 'CUDA' backend. \nThis could be because the operator doesn't exist for this backend, or was omitted during \nthe selective/custom build process (if using custom build). \nIf you are a Facebook employee using PyTorch on mobile, \nplease visit https://fburl.com/ptmfixes for possible resolutions. \n'quantized::linear_dynamic' is only available for these backends: [CPU, BackendSelect, Named, \nADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, \nAutogradMLC, Tracer, Autocast, Batched, VmapMode].\n"

| Model | device | Accuracy | Time |
| --- | --- | --- |--- |
| original fp32 | cpu | .918 | 109.3 |
| original fp32 | gpu | .918 | 16.6 |
| quantized int8 | cpu | .919 | 92.8 |


The above results were measured with the configuration below.  
You might need to reduce the number of OpenMP threads if you introduce parallelism in your application.

In [69]:
print ( torch.__config__.parallel_info())

ATen/Parallel:
	at::get_num_threads() : 8
	at::get_num_interop_threads() : 8
OpenMP 201511 (a.k.a. OpenMP 4.5)
	omp_get_max_threads() : 8
Intel(R) oneAPI Math Kernel Library Version 2021.3-Product Build 20210617 for Intel(R) 64 architecture applications
	mkl_get_max_threads() : 8
Intel(R) MKL-DNN v2.1.2 (Git Hash 98be7e8afa711dc9b66c8ff3504129cb82013cdb)
std::thread::hardware_concurrency() : 16
Environment variables:
	OMP_NUM_THREADS : [not set]
	MKL_NUM_THREADS : [not set]
ATen parallel backend: OpenMP



## Optimization: TorchScript

TorchScript is a way to create serializable and optimizable models from PyTorch code

load model as torchscript

According to the [Hugging Face docs](https://huggingface.co/transformers/torchscript.html), the `torchscript` flag is needed for the following reason:

>This flag is necessary because most of the language models in this repository have tied weights between their Embedding layer and their Decoding layer. 
>
>TorchScript does not allow the export of models that have tied weights, it is therefore necessary to untie the weights beforehand.
>
>This implies that models instantiated with the torchscript flag have their Embedding layer and Decoding layer separate, which means that they should not be trained down the line. Training would de-synchronize the two layers, leading to unexpected results.
>
>This is not the case for models that do not have a Language Model head, as those do not have tied weights. 
These models can be safely exported without the torchscript flag.

In [None]:
script_model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, torchscript=True)
script_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir , torchscript=True)

create a dummy input to pass to our model

In [None]:
text = query
res = script_tokenizer.encode_plus(text, return_tensors="pt", padding="max_length",truncation=True)
res

In [None]:
tokens_tensor = res['input_ids']
masks_tensors = res['attention_mask']

dummy_input = [tokens_tensor, masks_tensors]

In [None]:
dummy_input

In [None]:
tokens_tensor.shape

Creating the trace by passing a dummy input

In [None]:

script_model = script_model.to(device)

tokens_tensor = tokens_tensor.to(device)
masks_tensors = masks_tensors.to(device)

traced_model = torch.jit.trace(script_model, [tokens_tensor, masks_tensors])


prediction using traced model

In [None]:
%%time
res = traced_model(tokens_tensor, segments_tensors)
res

In [None]:
# Pair each probability with its label through the index-keyed `id2label`
# mapping, so the pairing does not depend on the key order of `labels`.
probabilities = torch.softmax(res[0], dim=1).tolist()[0]
predictions = [(id2label[class_id], prob) for class_id, prob in enumerate(probabilities)]
# most likely class first
predictions = sorted(predictions, key=lambda x: x[1], reverse=True)
predictions

Let's compute how long it takes to predict with the traced model

In [None]:
time_model_evaluation(traced_model, device='cpu')
#prediction_batch(traced_model,test_dataset )

In [None]:
# PyTorch names the GPU backend 'cuda'; 'gpu' is not a valid device string.
time_model_evaluation(traced_model, device='cuda')


**TODO:** update the table below with the traced-model results.

| Model | device | Accuracy | Time |
| --- | --- | --- |--- |
| original fp32 | cpu | .918 | 109.3 |
| original fp32 | gpu | .918 | 16.6 |
| quantized int8 | cpu | .919 | 92.8 |

Save / load  traced model

In [None]:
model_dir_trace ='../artifacts/model/amazon_trace/'

In [None]:
# Make sure the target directory exists before saving; nothing earlier in the
# notebook creates it.
os.makedirs(model_dir_trace, exist_ok=True)
torch.jit.save(traced_model, os.path.join(model_dir_trace, "traced_model.pt"))

In [None]:
loaded_model = torch.jit.load( f"{model_dir_trace}/traced_model.pt")
loaded_model.eval()

## Save Artifacts

For the next section, let us save the class label

In [None]:
with open(f'{model_dir}index_to_name.json','w') as f:
    json.dump(id2label_str,f)

In [None]:
with open(f'{model_dir_trace}index_to_name.json','w') as f:
    json.dump(id2label_str,f)

In a separate JSON config, let us store things like
- was lower case used by the tokenizer
- max length of tokenizer
- num labels
- is the model a jit

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.max_len_single_sentence

In [None]:
tokenizer.save_pretrained(model_dir_trace)

In [None]:
# Settings describing the original (non-traced) model artifact: tokenizer
# casing and maximum length, number of labels, and how the model was saved.
# NOTE(review): the loaded model is a DistilBertForSequenceClassification, so
# "embedding_name": "bert" may not match the model's embedding attribute when
# captum explanations are enabled -- confirm against the serving handler.
setup_config = {
 "model_name":"pt-original",
 "do_lower_case": tokenizer.do_lower_case,
 "num_labels":len(id2label),
 "save_mode":"original",
 "max_length":tokenizer.model_max_length,
 "captum_explanation": True,
 "embedding_name": "bert",
 "top_k": 5   

}

setup_config



In [None]:
with open(f'{model_dir}setup_config.json','w') as f:
    json.dump(setup_config,f)

In [None]:
setup_config_trace = {**setup_config}
setup_config_trace['model_name'] = "pt-jit"
setup_config_trace['captum_explanation'] = False
setup_config_trace['save_mode'] = 'jit'
setup_config_trace

In [None]:
with open(f'{model_dir_trace}setup_config.json','w') as f:
    json.dump(setup_config_trace,f)