# Inference with ONNX runtime
In this notebook, we explore ONNX (Open Neural Network Exchange) Runtime and the conversion of models to the ONNX format. [ONNX Runtime](https://onnxruntime.ai/) is a cross-platform machine-learning accelerator for inference and training. We explore it for quantization and for running inference with smaller, pruned models. For quantization, ONNX Runtime supports converting 32-bit floating-point (FP32) values to 8-bit integer (INT8) values. We try that with our DistilBERT model.

In [None]:
import os
import pathlib
from dotenv import load_dotenv
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoModelForSequenceClassification
from src.data.s3_communication import S3Communication
import config
from transformers import AutoTokenizer
from torch import cuda
import transformers
from onnxruntime.quantization import quantize_dynamic, QuantType
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path
import onnxruntime as ort
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
# Pick the GPU when torch reports one is available, otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Directory from which the tokenizer and the classification model are loaded.
local_model_path = '/opt/app-root/src/aicoe-osc-demo/models/transformers/RELEVANCE'

In [2]:
# Load credentials
# Directory lookup order: CREDENTIAL_DOTENV_DIR, then PWD, then a fixed default.
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR")
if dotenv_dir is None:
    dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
# Only load the file when it is present; values in it override existing env vars.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [3]:
# init s3 connector
# All connection settings are read from the environment (a missing variable yields None).
s3c = S3Communication(
    s3_endpoint_url=os.environ.get("S3_ENDPOINT"),
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.environ.get("S3_BUCKET"),
)

## Retrieve the test dataset and the trained models

In [4]:
# Fetch the files stored under the train/test dataset prefix into the local
# processed-data directory (both locations come from `config`).
# NOTE(review): exact download semantics are defined by S3Communication — confirm there.
s3c.download_files_in_prefix_to_dir(
    config.BASE_TRAIN_TEST_DATASET_S3_PREFIX,
    config.BASE_PROCESSED_DATA)

In [5]:
# Read the test split (first CSV column is the index) and give the two text
# columns the names used by the rest of the notebook.
test_data_path = str(config.BASE_PROCESSED_DATA) + '/rel_test_split.csv'
test_data = pd.read_csv(test_data_path, index_col=0).rename(
    columns={'text': 'question', 'text_b': 'sentence'}
)

# Same treatment for the train split.
train_data_path = str(config.BASE_PROCESSED_DATA) + '/rel_train_split.csv'
train_data = pd.read_csv(train_data_path, index_col=0).rename(
    columns={'text': 'question', 'text_b': 'sentence'}
)

In [6]:
# Wrap both pandas frames as `Dataset` objects; the test split is stored
# without its `label` column.
trds = Dataset.from_pandas(train_data)
teds = Dataset.from_pandas(test_data.drop(columns='label'))

climate_dataset = DatasetDict({'train': trds, 'test': teds})

In [35]:
def create_batches(data_df, batch_size=32, encode_fn=None):
    """Tokenize the question/sentence pairs of a data frame in fixed-size batches.

    Args:
        data_df: pandas DataFrame with ``question`` and ``sentence`` columns.
        batch_size: maximum number of pairs per batch; must be at least 1.
        encode_fn: callable that encodes one batch (a list of
            ``[question, sentence]`` pairs). When omitted, the notebook-level
            ``tokenizer`` is used; it is looked up at call time, so it must
            have been created before this function is called.

    Returns:
        A list with one encoded batch per ``batch_size`` rows, in row order.
        The last batch may be shorter. An empty frame yields an empty list.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Resolve the encoder lazily so callers (and tests) can inject their own.
    encode = tokenizer if encode_fn is None else encode_fn

    pairs = [
        [question, sentence]
        for question, sentence in zip(data_df['question'], data_df['sentence'])
    ]

    # Slice the pairs into consecutive chunks and pad/truncate each chunk on
    # its own, so every batch is only as long as its longest member.
    return [
        encode(pairs[start:start + batch_size],
               truncation=True,
               return_tensors='pt',
               padding=True)
        for start in range(0, len(pairs), batch_size)
    ]


# Tokenize the whole test split in batches.
# NOTE(review): create_batches relies on the notebook-level `tokenizer`, which is
# only assigned in the next cell; on a fresh kernel that cell must run before this call.
encoded_dataset = create_batches(test_data)

In [19]:
# Load the (fast) tokenizer and the sequence-classification model from the
# same local directory.
tokenizer = AutoTokenizer.from_pretrained(local_model_path, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
# Keep the model on the CPU.
model = model.to('cpu')

In [21]:
# Bundle model and tokenizer into a text-classification pipeline.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Convert to ONNX

In [25]:
# Destination of the exported (full-precision) ONNX graph.
output_path = Path("onnx-model/classifier.onnx")
# The export writes directly to `output_path`, so make sure the target folder
# exists; without this a fresh checkout fails because `onnx-model/` is missing.
output_path.parent.mkdir(parents=True, exist_ok=True)
onnx_convert.convert_pytorch(pipeline, opset=11, output=output_path, use_external_format=False)

Using framework PyTorch: 1.6.0
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
head_mask is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']


  position_ids = self.position_ids[:, :seq_length]
  mask, torch.tensor(torch.finfo(scores.dtype).min)


In [26]:
# Dynamically quantize the exported graph, storing weights as unsigned 8-bit
# integers, and write the result next to the original model.
output_int8_path = Path("onnx-model/classifier_int8.onnx")
quantize_dynamic(
    output_path,
    output_int8_path,
    weight_type=QuantType.QUInt8,
)

Ignore MatMul due to non constant B: /[MatMul_47]
Ignore MatMul due to non constant B: /[MatMul_60]
Ignore MatMul due to non constant B: /[MatMul_129]
Ignore MatMul due to non constant B: /[MatMul_142]
Ignore MatMul due to non constant B: /[MatMul_211]
Ignore MatMul due to non constant B: /[MatMul_224]
Ignore MatMul due to non constant B: /[MatMul_293]
Ignore MatMul due to non constant B: /[MatMul_306]
Ignore MatMul due to non constant B: /[MatMul_375]
Ignore MatMul due to non constant B: /[MatMul_388]
Ignore MatMul due to non constant B: /[MatMul_457]
Ignore MatMul due to non constant B: /[MatMul_470]


In [87]:
# One ONNX Runtime session per model file: the original export first, then
# the quantized variant.
session, session_int8 = (
    ort.InferenceSession(model_file.as_posix())
    for model_file in (output_path, output_int8_path)
)

In [88]:
def predict(encoded_dataset):
    """Run every encoded batch through both ONNX sessions.

    Args:
        encoded_dataset: iterable of batches; each batch is indexable by
            ``'input_ids'`` and ``'attention_mask'`` and the values expose
            ``.numpy()``.

    Returns:
        Tuple ``(out, out_int8)`` of flat lists holding the rows of the
        ``output_0`` output for each example, from the notebook-level
        ``session`` and ``session_int8`` respectively.
    """
    rows, rows_int8 = [], []
    for encoded in encoded_dataset:
        # Both sessions are fed the exact same numpy inputs.
        feed = {
            name: encoded[name].numpy()
            for name in ("input_ids", "attention_mask")
        }
        result = session.run(input_feed=feed, output_names=['output_0'])
        rows.extend(result[0])
        result_int8 = session_int8.run(input_feed=feed, output_names=['output_0'])
        rows_int8.extend(result_int8[0])
    return rows, rows_int8

In [89]:
# Outputs of the original ONNX model (`out`) and of the quantized one (`out_int8`)
# for every example in the encoded test set.
out, out_int8 = predict(encoded_dataset)

In [91]:
# The predicted class is the index of the largest value along the last axis
# of each model output row.
test_data["pred"] = np.asarray(out).argmax(axis=-1)
test_data["pred_int8"] = np.asarray(out_int8).argmax(axis=-1)

In [92]:
def score(test_data, pred_type="pred"):
    """Return the mean per-question F1 score of one prediction column.

    Args:
        test_data: DataFrame with ``question`` and ``label`` columns plus the
            prediction column named by ``pred_type``.
        pred_type: name of the column holding the predictions to evaluate.

    Returns:
        The unweighted mean of the F1 scores computed separately for every
        distinct ``question``; NaN when ``test_data`` has no rows.
    """
    # evaluate performance separately for every question
    scores = {}
    for group, data in test_data.groupby("question"):
        pred = data[pred_type]
        true = data.label
        scores[group] = {
            "accuracy": accuracy_score(true, pred),
            # zero_division=0 yields the same value the default ("warn") falls
            # back to, but without one warning per ill-defined group.
            "f1_score": f1_score(true, pred, zero_division=0),
            "recall_score": recall_score(true, pred, zero_division=0),
            "precision_score": precision_score(true, pred, zero_division=0),
            "support": len(pred),
        }

    # Nothing to average: avoid a KeyError on the missing 'f1_score' row.
    if not scores:
        return float("nan")

    # kpi wise performance metrics (rows: metrics, columns: questions)
    scores_df = pd.DataFrame(scores)
    return scores_df.loc['f1_score'].mean()

In [93]:
# Mean per-question F1 for the predictions of the original (non-quantized) ONNX model.
score(test_data)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.9179571080911388

In [94]:
# Mean per-question F1 for the predictions of the int8-quantized ONNX model.
score(test_data, pred_type='pred_int8')

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.9068992132430389

The quantized DistilBERT model reaches a slightly lower F1 score of about 90.69%, compared with 91.80% for the original model. However, the quantized model is only 65.1 MB, compared with 255.4 MB for the original. For a loss of about 1.1 percentage points in F1 score, we get almost a 4x reduction in model size.