## System Check

In [1]:
# Show the GPUs visible on this machine together with the driver and CUDA versions.
!nvidia-smi

Fri Nov 11 07:15:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   39C    P0    25W /  70W |   1117MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Print the version of the installed CUDA compiler toolchain.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0


## Run Triton Inference Server

The NVIDIA Triton Inference Server image is available at `nvcr.io/nvidia/tritonserver:22.10-py3`.

To run the docker image for Triton server, execute the following.


```
docker run --rm -it \
 -v $(pwd):/workshop \
  --name triton \
  -p 8000:8000 \
  -p 8001:8001 \
  -p 8002:8002 \
  -p 8888:8888 \
  --runtime=nvidia nvcr.io/nvidia/tritonserver:22.10-py3 \
  tritonserver \
  --backend-config=tensorflow,version=2 \
  --model-repository=/workshop/models \
  --exit-on-error=false \
  --repository-poll-secs=20 \
  --model-control-mode="poll"
```

## Triton Client for Inference

To install the client for Triton, run the following commands

```
pip install nvidia-pyindex
pip install tritonclient[all]
```

In [3]:
!pip install nvidia-pyindex
!pip install tritonclient[all]

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25ldone
[?25h  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8418 sha256=9c7311423f22c89175c77a834afde701180fe577cef02604198e16e061d411bf
  Stored in directory: /home/ubuntu/.cache/pip/wheels/0e/62/68/8bb6aafc3cb47e3468055aebc10d004b55da43563d748aac9c
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tritonclient[all]
  Downloading tritonclient-2.27.0-py3-none-manylinux1_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m219.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting python-rapidjson>=

Check whether the Triton server is ready:

In [4]:
# Query Triton's HTTP readiness endpoint. The response body is empty, so -v is
# used to print the response headers and make the status code visible.
!curl -v localhost:8000/v2/health/ready

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Length: 0
< Content-Type: text/plain
< 
* Connection #0 to host localhost left intact


## Create Torchscript Model

In [4]:
# Template (intentionally left commented out): trace a model to TorchScript and
# save it as version 1 of the "pt" model. `torch.nn.Module(...)` is a
# placeholder -- substitute the real model before uncommenting.
# NOTE(review): this writes under /workshop/models, while the config cell below
# writes under /home/ubuntu/work/models -- confirm both point at the same
# model repository.
# import torch
# import os
# from os.path import join, dirname

# torch.cuda.set_per_process_memory_fraction(0.4, 0)
# torch.cuda.empty_cache()

# folder = "/workshop/models/pt/1"
# os.makedirs(folder, exist_ok=True)

# model = torch.nn.Module(...)
# sample_img = torch.zeros([1, 3, 32, 32], dtype=torch.float32)
# traced_model = torch.jit.trace(model.eval(), sample_img, strict=True)
# traced_model.save(join(folder, "model.pt"))



## Create Configuration for the PyTorch Model

In [15]:
from pathlib import Path

# Triton model configuration (protobuf text format) for the "pt" model.
# Batching is disabled (max_batch_size 0); the client sends a [3, 32, 32]
# tensor and the `reshape` entry adds the leading dimension so the model
# receives [1, 3, 32, 32].
configuration = """
name: "pt"
platform: "pytorch_libtorch"
max_batch_size : 0
input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 32, 32 ]
    reshape { shape: [ 1, 3, 32, 32 ] }
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ 10 ]
    reshape { shape: [ 10 ] }
  }
]
parameters: {
key: "INFERENCE_MODE"
    value: {
    string_value: "true"
    }
}
"""

config_path = Path('/home/ubuntu/work/models/pt/config.pbtxt')
# Create the model directory when it is missing; otherwise the write below
# fails with FileNotFoundError on a fresh checkout.
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(configuration)

Get information regarding models

In [2]:
# Ask Triton for the metadata of the "pt" model (name, versions, inputs, outputs).
!curl -v localhost:8000/v2/models/pt

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/models/pt HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Type: application/json
< Content-Length: 191
< 
* Connection #0 to host localhost left intact
{"name":"pt","versions":["1"],"platform":"pytorch_libtorch","inputs":[{"name":"input__0","datatype":"FP32","shape":[3,32,32]}],"outputs":[{"name":"output__0","datatype":"FP32","shape":[10]}]}

## Make Inference Request

In [3]:
import tritonclient.http as tritonhttpclient
# --- Connection settings ---
url = 'localhost:8000'
VERBOSE = False

# --- Model served by Triton ---
model_name = 'pt'
model_version = '1'

# --- Tensor names and layout, matching the model's config.pbtxt ---
# Note: despite its name, `model_label` holds the *input* tensor name.
model_label = 'input__0'
output_name = 'output__0'
input_shape = ( 3, 32, 32)
input_dtype = 'FP32'

triton_client = tritonhttpclient.InferenceServerClient(verbose=VERBOSE, url=url)

# Fetch what the server reports for this model/version.
model_metadata = triton_client.get_model_metadata(
    model_version=model_version,
    model_name=model_name,
)
model_config = triton_client.get_model_config(
    model_version=model_version,
    model_name=model_name,
)


In [6]:
import numpy as np
from torchvision import transforms
from PIL import Image

# preprocessing function
def img_preprocess(img_path="../GTC/img/cat.jpg"):
    """Load an image and turn it into a normalized NumPy array.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The result of ``ToTensor`` followed by ``Normalize``, converted to a
        NumPy array. NOTE(review): the model config expects (3, 32, 32); this
        function does not resize, so the input image is assumed to already be
        32x32 -- confirm.
    """
    # Per-channel mean/std; presumably the CIFAR-10 statistics -- confirm.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2471, 0.2435, 0.2616]),
    ])
    # The context manager closes the underlying file handle; convert("RGB")
    # guarantees three channels so grayscale/RGBA/palette images do not break
    # the 3-channel Normalize step.
    with Image.open(img_path) as img:
        rgb_img = img.convert("RGB")
    return preprocess(rgb_img).numpy()

transformed_img = img_preprocess()

In [107]:
# Describe the request tensor and attach the preprocessed image to it.
input0 = tritonhttpclient.InferInput(
    model_label,
    transformed_img.shape,
    datatype="FP32",
)
input0.set_data_from_numpy(transformed_img, binary_data=False)

# class_count=10 requests classification-style results for the output tensor.
output = tritonhttpclient.InferRequestedOutput(
    output_name,
    class_count=10,
    binary_data=False,
)

response = triton_client.infer(
    model_name,
    inputs=[input0],
    outputs=[output],
    model_version=model_version,
)

In [108]:
# Decode the response entry for `output_name` into a NumPy array and display it.
output_label = response.as_numpy(output_name)
output_label


array(['4.948598:3', '0.938896:5', '0.684368:7', '-0.359170:4',
       '-2.540236:2', '-3.929959:6', '-6.445944:0', '-6.604012:9',
       '-7.473110:8', '-8.512840:1'], dtype=object)

Triton returned the logits as `logit:class_index` strings, sorted from the highest logit to the lowest. To convert them into a dictionary keyed by class name, we take the following steps:

In [7]:
# Read the class names, one per line; a line's position is used later as the
# class index when looking names up.
with open("../GTC/cifar10_classes.txt", "r") as class_file:
    categories = list(map(str.strip, class_file))

In [110]:
# Build {class name: logit} from the "logit:class_index" strings, keeping the
# order in which the server returned them.
results = {}
for entry in output_label:
    fields = entry.split(":")
    cat = int(fields[1])
    conf = float(fields[0])
    results[categories[cat]] = conf
logits = [*results.values()]


In [113]:
import torch
import torch.nn.functional as F
# torch.as_tensor builds the same tensor from the Python list on the first run,
# and returns an existing tensor unchanged on later runs. That keeps this cell
# re-runnable: torch.tensor(<tensor>) would emit a copy-construct UserWarning.
logits = torch.as_tensor(logits)
# Softmax over the class axis, scaled to percentages.
preds = (F.softmax(logits, dim=-1) * 100).numpy()
preds

In [115]:
# Overwrite each stored logit with its softmax percentage. `preds` was built
# from results.values(), so position i in `preds` belongs to the i-th key.
for position, label in enumerate(results):
    results[label] = preds[position]

In [116]:
# Display the class name -> percentage mapping.
results

{'cat': 96.3503,
 'dog': 1.7476794,
 'horse': 1.3549448,
 'deer': 0.47722018,
 'bird': 0.05388822,
 'frog': 0.013425939,
 'airplane': 0.0010845922,
 'truck': 0.0009260152,
 'ship': 0.00038830572,
 'automobile': 0.00013728552}

## TensorRT

Let's convert it to TensorRT.
For this, we can use the PyTorch container from NGC, available at `nvcr.io/nvidia/pytorch:22.10-py3`.

Alternatively, we can (try to) install the pip packages.

In [1]:
# pip install torch-tensorrt==1.2.0 --find-links https://github.com/pytorch/TensorRT/releases/expanded_assets/v1.2.0

In [2]:
import torch
import torch_tensorrt
# NOTE(review): overrides a private torch.hub check so that it always passes.
# Nothing in this cell calls torch.hub, so this looks like a leftover --
# confirm before removing.
torch.hub._validate_not_a_forked_repo=lambda a,b,c: True

# Load the TorchScript model from the current working directory.
model = torch.jit.load("cifar10-script.pt")

# Compile with Torch-TensorRT for a fixed (1, 3, 32, 32) input.
trt_model = torch_tensorrt.compile(model,
    inputs= [torch_tensorrt.Input((1, 3, 32, 32))],
    enabled_precisions= { torch.half} # torch.half is FP16 (half precision), not FP32
)

# Save the compiled module.
# NOTE(review): this writes "model1.pt" to the working directory; presumably it
# still has to be placed at <model repository>/trt/1/model.pt for Triton to
# serve it -- confirm.
torch.jit.save(trt_model, "model1.pt")

ImportError: /opt/conda/envs/pytorch/lib/python3.9/site-packages/torch_tensorrt/lib/libtorchtrt.so: undefined symbol: _ZN2at4_ops4view4callERKNS_6TensorEN3c108ArrayRefIlEE

Create config file

In [1]:
from pathlib import Path

# Triton model configuration (protobuf text format) for the "trt" model.
# Batching is disabled (max_batch_size 0); the client sends a [3, 32, 32]
# tensor and the `reshape` entry adds the leading dimension so the model
# receives [1, 3, 32, 32].
configuration = """
name: "trt"
platform: "pytorch_libtorch"
max_batch_size : 0
input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    format: FORMAT_NCHW
    dims: [ 3, 32, 32 ]
    reshape { shape: [ 1, 3, 32, 32 ] }
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ 10 ]
    reshape { shape: [ 10 ] }
  }
]
"""

config_path = Path('/home/ubuntu/work/models/trt/config.pbtxt')
# Create the model directory when it is missing; otherwise the write below
# fails with FileNotFoundError on a fresh checkout.
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(configuration)

In [47]:
# Ask Triton for the metadata of the "trt" model (name, versions, inputs, outputs).
!curl -v localhost:8000/v2/models/trt

*   Trying 127.0.0.1:8000...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 8000 (#0)
> GET /v2/models/trt HTTP/1.1
> Host: localhost:8000
> User-Agent: curl/7.68.0
> Accept: */*
> 
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< Content-Type: application/json
< Content-Length: 192
< 
* Connection #0 to host localhost left intact
{"name":"trt","versions":["1"],"platform":"pytorch_libtorch","inputs":[{"name":"input__0","datatype":"FP32","shape":[3,32,32]}],"outputs":[{"name":"output__0","datatype":"FP32","shape":[10]}]}

In [48]:
import numpy as np
from torchvision import transforms
from PIL import Image

# preprocessing function
def img_preprocess(img_path="../GTC/img/cat.jpg"):
    """Load an image and turn it into a normalized NumPy array.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The result of ``ToTensor`` followed by ``Normalize``, converted to a
        NumPy array. NOTE(review): the model config expects (3, 32, 32); this
        function does not resize, so the input image is assumed to already be
        32x32 -- confirm.
    """
    # Per-channel mean/std; presumably the CIFAR-10 statistics -- confirm.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2471, 0.2435, 0.2616]),
    ])
    # The context manager closes the underlying file handle; convert("RGB")
    # guarantees three channels so grayscale/RGBA/palette images do not break
    # the 3-channel Normalize step.
    with Image.open(img_path) as img:
        rgb_img = img.convert("RGB")
    return preprocess(rgb_img).numpy()

transformed_img = img_preprocess()

In [49]:
import tritonclient.http as tritonhttpclient
# --- Connection settings ---
url = 'localhost:8000'
VERBOSE = False

# --- Model served by Triton ---
model_name = 'trt'
model_version = '1'

# --- Tensor names and layout, matching the model's config.pbtxt ---
# Note: despite its name, `model_label` holds the *input* tensor name.
model_label = 'input__0'
output_name = 'output__0'
input_shape = ( 3, 32, 32)
input_dtype = 'FP32'

triton_client = tritonhttpclient.InferenceServerClient(verbose=VERBOSE, url=url)

# Fetch what the server reports for this model/version.
model_metadata = triton_client.get_model_metadata(
    model_version=model_version,
    model_name=model_name,
)
model_config = triton_client.get_model_config(
    model_version=model_version,
    model_name=model_name,
)


In [50]:
# Describe the request tensor and attach the preprocessed image to it.
input0 = tritonhttpclient.InferInput(
    model_label,
    transformed_img.shape,
    datatype="FP32",
)
input0.set_data_from_numpy(transformed_img, binary_data=False)

# class_count=10 requests classification-style results for the output tensor.
output = tritonhttpclient.InferRequestedOutput(
    output_name,
    class_count=10,
    binary_data=False,
)

response = triton_client.infer(
    model_name,
    inputs=[input0],
    outputs=[output],
    model_version=model_version,
)

In [51]:
# Decode the response entry for `output_name` into a NumPy array and display it.
output_label = response.as_numpy(output_name)
output_label


array(['4.949219:3', '0.936035:5', '0.686035:7', '-0.360596:4',
       '-2.539062:2', '-3.929688:6', '-6.449219:0', '-6.601562:9',
       '-7.472656:8', '-8.515625:1'], dtype=object)

In [52]:
# Build {class name: logit} from the "logit:class_index" strings, keeping the
# order in which the server returned them. `categories` is the class-name list
# loaded earlier in the notebook.
results = {}
for entry in output_label:
    fields = entry.split(":")
    cat = int(fields[1])
    conf = float(fields[0])
    results[categories[cat]] = conf
logits = [*results.values()]


In [53]:
import torch
import torch.nn.functional as F
# torch.as_tensor builds the same tensor from the Python list on the first run,
# and returns an existing tensor unchanged on later runs. That keeps this cell
# re-runnable: torch.tensor(<tensor>) would emit a copy-construct UserWarning.
logits = torch.as_tensor(logits)
# Softmax over the class axis, scaled to percentages.
preds = (F.softmax(logits, dim=-1) * 100).numpy()
preds

array([9.6355713e+01, 1.7417017e+00, 1.3564386e+00, 4.7627077e-01,
       5.3921040e-02, 1.3421994e-02, 1.0804347e-03, 9.2776271e-04,
       3.8826271e-04, 1.3682646e-04], dtype=float32)

In [54]:
# Overwrite each stored logit with its softmax percentage. `preds` was built
# from results.values(), so position i in `preds` belongs to the i-th key.
for position, label in enumerate(results):
    results[label] = preds[position]

In [55]:
# Display the class name -> percentage mapping.
results

{'cat': 96.35571,
 'dog': 1.7417017,
 'horse': 1.3564386,
 'deer': 0.47627077,
 'bird': 0.05392104,
 'frog': 0.013421994,
 'airplane': 0.0010804347,
 'truck': 0.0009277627,
 'ship': 0.0003882627,
 'automobile': 0.00013682646}