In [None]:
import os
import getpass

import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
import tensorrt as trt
import fsspec
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

from tensorrt_inference.deberta.model import DebertaV2Model
from tensorrt_inference.deberta.onnx import convert_to_onnx
from tensorrt_inference.deberta.backend import build_engine, TRTModel

## Load original model

In [None]:
# Checkpoint identifier shared by the model and its tokenizer so the two stay in sync.
model_name = 'microsoft/mdeberta-v3-base'
# Project-local DeBERTa implementation (tensorrt_inference.deberta.model), not the stock transformers class.
model = DebertaV2Model.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)  

## Convert to ONNX

In [None]:
# Batch size handed to the ONNX export below.
# NOTE(review): `batch_size` is rebound to 512 in the "Convert to TensorRT" section, so
# re-running the export cell after that one would silently export with 512 instead of 200.
batch_size = 200

In [None]:
# Destination of the exported ONNX graph: /home/<current user>/mdeberta.onnx.
# NOTE(review): assumes home directories live directly under /home — confirm on the target machine.
onnx_model_filename = os.path.join("/home", getpass.getuser(), "mdeberta.onnx")

In [None]:
# Export the loaded model to ONNX at `onnx_model_filename`.
# seq_len=77 matches the max_length=77 that TextDataset pads/truncates to further down.
# NOTE(review): whether the exported graph has a dynamic batch axis depends on
# convert_to_onnx, which is not visible here — confirm, since the TensorRT engine
# below is built with a different batch size.
convert_to_onnx(
    output_file=onnx_model_filename,
    model=model,
    seq_len=77,
    batch_size=batch_size
)

## Convert to TensorRT

In [None]:
# Destination of the serialized TensorRT engine: /home/<current user>/trt_mdeberta.
# NOTE(review): assumes home directories live directly under /home — confirm on the target machine.
trt_model_filename = os.path.join("/home", getpass.getuser(), "trt_mdeberta")

In [None]:
# Batch size used for the TensorRT engine build and for the inference DataLoader below.
# NOTE(review): this rebinds the `batch_size` that the ONNX export cell set to 200;
# cells above now see 512 if they are re-run without restarting the kernel.
batch_size = 512

In [None]:
# Build a TensorRT engine from the ONNX file and write it to `trt_model_filename`.
# min, optimal and max batch size are all set to the same value, so the engine is
# presumably specialised for exactly `batch_size` rows per call — confirm in build_engine.
build_engine(
    output_trt_model_file=trt_model_filename,
    onnx_model_file=onnx_model_filename,
    min_batch_size=batch_size,
    optimal_batch_size=batch_size,
    max_batch_size=batch_size,
    precision="fp16",
    log_level=trt.Logger.INFO
)

## Run inference

### Dataset

In [None]:
# Read the sampled test set from HDFS into a DataFrame; the handle is opened in
# text mode and closed as soon as pandas has consumed it.
fs = fsspec.filesystem('hdfs')
with fs.open('/user/g.racic/golden_test_set_sampled.csv', 'r') as csv_handle:
    df = pd.read_csv(csv_handle)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes the title/description text of one DataFrame row.

    Args:
        df: DataFrame with a ``title`` column and a ``description`` column.
        tokenizer: Callable used as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` entries.
        max_length: Length every sample is padded/truncated to. Defaults to 77,
            the ``seq_len`` used for the ONNX export in this notebook.
    """

    def __init__(self, df, tokenizer, max_length=77):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return ``{'input_ids': ..., 'att_mask': ...}``.

        The text is ``"<title> <description>"``, or just the title when the
        description is missing or empty.
        """
        row = self.df.iloc[index]
        title = str(row["title"])
        description = row["description"]
        # pandas reads empty CSV cells as NaN, and NaN is truthy: a bare
        # `if description` would append the literal string "nan" to the title.
        # pd.notna is checked first so a missing value never reaches the truth test.
        if pd.notna(description) and description:
            text = title + ' ' + str(description)
        else:
            text = title
        txt_tokens = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Tokenizer outputs are returned untouched (shape is whatever the tokenizer
        # produces for return_tensors="pt"); the consumer is responsible for reshaping.
        return {
            'input_ids': txt_tokens["input_ids"],
            'att_mask': txt_tokens["attention_mask"],
        }

### Inference

In [None]:
# Wrap the engine file written by build_engine; the benchmark loop below calls it
# as `trt_model(inputs)` with a list of [input_ids, attention_mask] CUDA tensors.
trt_model = TRTModel(trt_model_filename)

In [None]:
# Benchmark the TensorRT engine over the whole dataset, timing each batch with
# CUDA events. The first `nb_warmup_batches` batches are executed but not recorded.
timings = []  # per-batch latency in milliseconds, warm-up batches excluded
nb_warmup_batches = 10
dataset = TextDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=8)
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
# len(dataloader) is the real batch count; `n // batch_size + 1` over-counts by one
# whenever the dataset size is an exact multiple of batch_size.
for i, data in enumerate(tqdm(dataloader, total=len(dataloader))):
    # Each sample is tokenized with return_tensors="pt", so the collated batch carries
    # an extra singleton axis at dim 1. Squeeze only that axis: a bare .squeeze() would
    # also drop the batch axis when a batch holds a single sample.
    # NOTE(review): the engine was built with min == max batch size, so a smaller
    # final batch may be rejected by TRTModel — confirm.
    inputs = [data['input_ids'].squeeze(1).to("cuda"), data['att_mask'].squeeze(1).to("cuda")]
    starter.record()
    outputs = trt_model(inputs)
    ender.record()
    # elapsed_time is only valid once both events have completed on the GPU.
    torch.cuda.synchronize()
    # Strict comparison: exactly nb_warmup_batches batches are discarded, matching the
    # `nb_warmup_batches * batch_size` samples subtracted by the throughput report.
    # (`i <= nb_warmup_batches` dropped one batch too many and inflated throughput.)
    if i >= nb_warmup_batches:
        timings.append(starter.elapsed_time(ender))

In [None]:
# Latency statistics leave out the last recorded batch; throughput uses every
# recorded batch and the sample count left after removing the warm-up batches.
steady_state_ms = timings[:-1]
mean_ms = np.mean(steady_state_ms)
std_ms = np.std(steady_state_ms)
print(f"Average inference time {mean_ms:.2f}+/-{std_ms:.2f} ms")

timed_samples = df.shape[0] - (nb_warmup_batches * batch_size)
samples_per_second = timed_samples * 1000 / np.sum(timings)
print(f"Throughput: {samples_per_second:.2f} samples per second")