# Packaging Model

## About

- Package the given model using Torch Model Archive
- Write a custom handler to support pre processing and post processing

## Working directory

The original model and the traced model we saved earlier.

In [1]:
# List the saved model artifact directories.
!ls ../artifacts/model/

distilbert-base-uncased  distilbert-base-uncased__trace


The directory contains the tokenizer, vocabulary, and PyTorch model files.

In [2]:
# Files saved for the original (non-traced) model.
!ls ../artifacts/model/distilbert-base-uncased 

config.json	    setup_config.json	     tokenizer_config.json
index_to_name.json  special_tokens_map.json  training_args.bin
pytorch_model.bin   tokenizer.json	     vocab.txt


In [3]:
# Files saved for the traced model; the packaging cell below reads from this directory.
!ls ../artifacts/model/distilbert-base-uncased__trace

index_to_name.json  special_tokens_map.json  traced_model.pt
model_store	    tokenizer.json	     vocab.txt
setup_config.json   tokenizer_config.json


## Torch Model Archiver

TorchServe requires the model and its dependent artifacts to be packaged in a single file.

[torch-model-archiver](https://pypi.org/project/torch-model-archiver/) is a Python package that bundles these artifacts into a `.mar` file.

In [4]:
%%bash 

# Print the archiver's command-line options.
torch-model-archiver --help

usage: torch-model-archiver [-h] --model-name MODEL_NAME
                            [--serialized-file SERIALIZED_FILE]
                            [--model-file MODEL_FILE] --handler HANDLER
                            [--extra-files EXTRA_FILES]
                            [--runtime {python,python2,python3}]
                            [--export-path EXPORT_PATH]
                            [--archive-format {tgz,no-archive,default}] [-f]
                            -v VERSION [-r REQUIREMENTS_FILE]

Torch Model Archiver Tool

optional arguments:
  -h, --help            show this help message and exit
  --model-name MODEL_NAME
                        Exported model name. Exported file will be named as
                        model-name.mar and saved in current working directory if no --export-path is
                        specified, else it will be saved under the export path
  --serialized-file SERIALIZED_FILE
                        Path to .pt or .pth file containing state_dic

Package the model artifacts together with the handler code.

In [5]:
%%bash
# Fail fast: abort on the first failing command, unset variable, or failed
# pipeline stage, so the archiver never runs from an unexpected directory.
set -euo pipefail

# All paths below are relative to the repository root (one level above this notebook).
cd ..
pwd

ARTIFACT_BASE_DIR="artifacts/model/distilbert-base-uncased__trace"

MODEL_NAME="pt_classifier"
MODEL_VERSION="1.0"
MODEL_STORE="${ARTIFACT_BASE_DIR}/model_store"
MODEL_SERIALIZED_FILE="${ARTIFACT_BASE_DIR}/traced_model.pt"

# --extra-files takes a single comma-separated list, so the tokenizer files and
# the handler's JSON configs are joined into one string.
TOKENIZER_FILES="${ARTIFACT_BASE_DIR}/tokenizer_config.json,${ARTIFACT_BASE_DIR}/special_tokens_map.json,${ARTIFACT_BASE_DIR}/vocab.txt,${ARTIFACT_BASE_DIR}/tokenizer.json"
MODEL_EXTRA_FILES="${ARTIFACT_BASE_DIR}/index_to_name.json,${ARTIFACT_BASE_DIR}/setup_config.json,${TOKENIZER_FILES}"

mkdir -p "${MODEL_STORE}"

# --force overwrites an existing archive of the same name in the export path.
torch-model-archiver --model-name "${MODEL_NAME}" \
--version "${MODEL_VERSION}" \
--serialized-file "${MODEL_SERIALIZED_FILE}" \
--export-path "${MODEL_STORE}" \
--extra-files "${MODEL_EXTRA_FILES}" \
--handler ./serving/handler.py \
--force



/home/jupyter/tutorials/personal/pydata_bert




In [6]:
# %load ../serving/handler.py
import json
import logging
import os
import time
from abc import ABC
from collections.abc import Iterable
import transformers
import ast
import torch

import numpy as np
from ts.metrics.dimension import Dimension

logger = logging.getLogger(__name__)

from ts.torch_handler.base_handler import BaseHandler

from ts.utils.util import map_class_to_label

import time


# Module-level logger used by the handler; the installed transformers version
# is logged once when this module is imported.
logger = logging.getLogger(__name__)
logger.info("Transformers version %s",transformers.__version__)

class CustomHandler(BaseHandler, ABC):
    """
    Transformers handler class for sequence classification.

    The handler loads a model (TorchScript or a regular ``transformers``
    checkpoint, selected by ``save_mode`` in ``setup_config.json``), tokenizes
    the ``text`` field of every incoming request, runs the model on the whole
    batch and returns the ``top_k`` classes with their probabilities for each
    request.
    """

    def __init__(self):
        super(CustomHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """Load the model, tokenizer and label mapping from the model directory.

        Args:
            ctx: Context object exposing ``manifest`` and ``system_properties``
                (``model_dir`` and optionally ``gpu_id`` are read from the latter).

        Raises:
            FileNotFoundError: If ``setup_config.json`` is not in the model directory.
            ValueError: If ``save_mode`` in the setup config is neither
                ``"jit"`` nor ``"original"``.
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)

        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )

        # Read configs for the mode, top_k, max_length, etc. from setup_config.json.
        # Every later step depends on this file, so fail here with a clear message
        # instead of crashing further down on a missing attribute.
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        if not os.path.isfile(setup_config_path):
            raise FileNotFoundError(
                "Missing the setup_config.json file in %s" % model_dir
            )
        with open(setup_config_path) as setup_config_file:
            self.setup_config = json.load(setup_config_file)

        # Load the model according to the configured save mode.
        save_mode = self.setup_config["save_mode"]
        if save_mode == "jit":
            self.model = torch.jit.load(model_pt_path, map_location=self.device)
        elif save_mode == "original":
            self.model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
            self.model.to(self.device)
        else:
            raise ValueError(
                "Unsupported save_mode %r in setup_config.json; expected 'jit' or 'original'."
                % (save_mode,)
            )

        self.top_k = self.setup_config["top_k"]
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_dir,
            do_lower_case=self.setup_config["do_lower_case"],
            torchscript=True,
        )

        self.model.eval()

        logger.info(
            "Transformer model from path %s loaded successfully", model_dir
        )

        # Read the mapping file, index to object name.
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")
        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            # Keep the attribute defined so inference does not fail on a missing name.
            # NOTE(review): assumes map_class_to_label accepts mapping=None and falls
            # back to raw class indices; confirm against the installed ts version.
            self.mapping = None
            logger.warning("Missing the index_to_name.json file.")

        self.initialized = True

    def preprocess(self, requests):
        """Tokenize the ``text`` field of every request into one padded batch.

        Args:
            requests (list): One dict per request. The payload is read from the
                ``data`` key, falling back to ``body``. It may be an already
                parsed dict, or JSON text as ``bytes``/``bytearray``/``str``.

        Returns:
            tuple: ``(input_ids_batch, attention_mask_batch)`` tensors on
            ``self.device``, one row per request, each padded or truncated to
            ``max_length`` from the setup config.

        Raises:
            ValueError: If ``requests`` is empty.
        """
        max_length = int(self.setup_config["max_length"])
        input_ids_rows = []
        attention_mask_rows = []

        for data in requests:
            request = data.get("data")
            if request is None:
                request = data.get("body")
            if isinstance(request, (bytes, bytearray)):
                request = request.decode('utf-8')
            # A raw payload is JSON text; parse it so the "text" field can be read.
            if isinstance(request, str):
                request = json.loads(request)

            input_text = request['text']
            logger.info("Received text: '%s'", input_text)

            # Explicit padding/truncation replaces the deprecated pad_to_max_length
            # flag and the implicit truncation that triggered a tokenizer warning.
            inputs = self.tokenizer.encode_plus(
                input_text,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                add_special_tokens=True,
                return_tensors='pt',
            )
            input_ids_rows.append(inputs["input_ids"])
            # Attention masks are passed for cases where input tokens are padded.
            attention_mask_rows.append(inputs["attention_mask"])

        if not input_ids_rows:
            raise ValueError("preprocess received an empty batch of requests.")

        # Concatenate once and move to the device once, instead of per request.
        input_ids_batch = torch.cat(input_ids_rows, 0).to(self.device)
        attention_mask_batch = torch.cat(attention_mask_rows, 0).to(self.device)

        return (input_ids_batch, attention_mask_batch)

    def inference(self, input_batch):
        """Run the model on a batch and map the top-k classes to labels.

        Args:
            input_batch (tuple): ``(input_ids_batch, attention_mask_batch)`` as
                returned by ``preprocess``.

        Returns:
            list: One entry per request; each entry is a single-element list
            holding the label/probability mapping produced by
            ``map_class_to_label`` for that request.
        """
        input_ids_batch, attention_mask_batch = input_batch

        # Gradients are never needed when serving predictions.
        with torch.no_grad():
            predictions = self.model(input_ids_batch, attention_mask_batch)

        # The first element of the model output holds the logits for the whole
        # batch (one row per request); the output itself must not be indexed per row.
        logits = predictions[0]
        ps = torch.nn.functional.softmax(logits, dim=1)
        probs, classes = torch.topk(ps, self.top_k, dim=1)

        friendly_labels = map_class_to_label(probs.tolist(), self.mapping, classes.tolist())

        # Wrap each row so every request's response keeps the original
        # single-element list shape.
        return [[row_labels] for row_labels in friendly_labels]

    def postprocess(self, inference_output):
        """Return the inference output unchanged."""
        return inference_output

    def handle(self, data, context):
        """Run preprocess, inference and postprocess and record the elapsed time.

        Args:
            data (list): Batch of requests, passed to ``preprocess``.
            context: Request context; its ``metrics`` object receives the
                ``HandlerTime`` measurement in milliseconds.

        Returns:
            list: The output of ``postprocess``.
        """
        # It can be used for pre or post processing if needed as additional request
        # information is available in context
        start_time = time.time()

        self.context = context
        metrics = self.context.metrics

        data_preprocess = self.preprocess(data)
        data_inference = self.inference(data_preprocess)
        data_postprocess = self.postprocess(data_inference)

        stop_time = time.time()
        metrics.add_time('HandlerTime', round((stop_time - start_time) * 1000, 2), None, 'ms')

        return data_postprocess


If you would like to serve through Docker, let's copy the `model_store` artifacts to a location relative to the Dockerfile folder.

In [7]:
%%bash
# Fail fast: if `cd ..` fails, the `rm -rf` below must not run from the wrong directory.
set -euo pipefail
cd ..

# Recreate the target so stale archives from an earlier run do not linger.
rm -rf serving/model_store
mkdir -p serving/model_store

cp artifacts/model/distilbert-base-uncased__trace/model_store/* serving/model_store/
cp artifacts/model/distilbert-base-uncased__trace/setup_config.json serving/model_store/

## Torchserve

> TorchServe is a performant, flexible and easy to use tool for serving PyTorch eager mode and torschripted models.

Ref: [TorchServe Docs](https://pytorch.org/serve/)

The command below starts TorchServe.

In [8]:
%%bash --bg
# --bg: IPython runs this script in the background, so the cell returns while the server keeps running.
cd ..
# The model store given here is the serving/model_store directory filled by the copy cell above.
# NOTE(review): --ncs presumably disables config snapshots; confirm with `torchserve --help`.
torchserve --ts-config ./serving/config.properties \
--start --model-store ./serving/model_store --ncs



In [9]:
# List the log files under ../logs/.
!ls ../logs/

access_log.log	model_log.log  model_metrics.log  ts_log.log  ts_metrics.log


In [10]:
# Show the last lines of the model log, which carries the handler's own log messages.
!tail ../logs/model_log.log

2021-10-25 17:52:13,676 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformers version 4.11.1
2021-10-25 17:52:35,354 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Transformer model from path /tmp/models/84448fbb0cf64f8fa122a52b62531894 loaded successfully
2021-10-25 17:53:10,874 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Received text: 'herbal tea'
2021-10-25 17:53:10,874 [WARN ] W-9000-pt_classifier_1.0-stderr MODEL_LOG - Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021-10-25 17:53:19,419 [INFO ] W-9000-pt_classifier_1.0-stdout MODEL_LOG - Received text: 'herbal tea'
2021-10-25 17:53:23,218 [INFO ] W-9000-pt_classifier_1.0-stdout M

In [11]:
# Print the configuration file passed to torchserve via --ts-config.
!cat ../serving/config.properties 

load_models=all
inference_address=http://0.0.0.0:9080
management_address=http://0.0.0.0:9081
metrics_address=http://0.0.0.0:9082
model_store=model_store
async_logging=true

The command below stops TorchServe.

In [12]:
#torchserve --stop

List all the models loaded

In [13]:
# Port 9081 is the management address set in config.properties.
!curl "http://localhost:9081/models"

{
  "models": [
    {
      "modelName": "pt_classifier",
      "modelUrl": "pt_classifier.mar"
    }
  ]
}


get details on the model `pt_classifier`

In [14]:
# Management API: describe one registered model, including its workers.
!curl http://localhost:9081/models/pt_classifier

[
  {
    "modelName": "pt_classifier",
    "modelVersion": "1.0",
    "modelUrl": "pt_classifier.mar",
    "runtime": "python",
    "minWorkers": 1,
    "maxWorkers": 1,
    "batchSize": 1,
    "maxBatchDelay": 100,
    "loadedAtStartup": true,
    "workers": [
      {
        "id": "9000",
        "startTime": "2021-10-25T17:52:12.170Z",
        "status": "READY",
        "memoryUsage": 3148378112,
        "pid": 6903,
        "gpu": true,
        "gpuUsage": "gpuId::0 utilization.gpu [%]::0 % utilization.memory [%]::0 % memory.used [MiB]::1640 MiB"
      }
    ]
  }
]


sample prediction

In [15]:
# POST a JSON payload to the inference address (port 9080); -w prints curl's total request time.
! curl -X POST http://localhost:9080/predictions/pt_classifier \
        -H 'Content-Type: application/json' \
        -d '{"text":"herbal tea","request_id":"test_id"}' \
        -w  "\nelasped time (sec):%{time_total}\n"

[
  {
    "GROCERY": 0.9971381425857544,
    "HEALTH_PERSONAL_CARE": 0.002372726332396269,
    "PET_SUPPLIES": 0.00013340394070837647,
    "KITCHEN": 8.001828246051446e-05,
    "SHOES": 3.6940335121471435e-05
  }
]
elasped time (sec):0.036406


sample prediction from a file

In [16]:
%%bash
cd ..
curl -X POST http://localhost:9080/predictions/pt_classifier \
        -H 'Content-Type: application/json' \
        -d @serving/sample_input.json \
        -w "\nelasped time (sec):%{time_total}\n"

[
  {
    "GROCERY": 0.9971381425857544,
    "HEALTH_PERSONAL_CARE": 0.002372726332396269,
    "PET_SUPPLIES": 0.00013340394070837647,
    "KITCHEN": 8.001828246051446e-05,
    "SHOES": 3.6940335121471435e-05
  }
]
elasped time (sec):0.034849


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   265  100   214  100    51   6154   1466 --:--:-- --:--:-- --:--:--  7571
100   265  100   214  100    51   6140   1463 --:--:-- --:--:-- --:--:--  7571


In [None]:
import requests

In [None]:
payload = {"text":"herbal tea","request_id":"test_id"}

endpoint = "http://localhost:9080/predictions/pt_classifier"

# Bound the wait so the cell cannot hang if the server is down, and raise on an
# HTTP error status instead of trying to parse an error body as a prediction.
res = requests.post(endpoint, json=payload, timeout=10)
res.raise_for_status()

res.json()