In [1]:
# Matthew Giarra <matthew.giarra@jhuapl.edu>

import torch
import torchvision
from torch2trt import torch2trt
import time # for timing execution

In [2]:
# This function times inference for a model
def time_inference(model, data, niter):
    
    # Warm-up inference loop.
    # Inference is often slowest on first iteration.
    # "Warm up" takes care of that.
    with torch.no_grad(): 
        for t in range(3):
            output = model(data)
    
    # Timed inference loop
    tic = time.perf_counter() # Start a timer
    with torch.no_grad():  # torch.no_grad() turns off gradient calculations for faster performance
        for t in range(niter): # Loop niter times
            output = model(data) # RUN THE INFERENCE
    toc = time.perf_counter() # Stop the timer
    exe_sec = toc-tic # Seconds elapsed
    FPS = niter / (exe_sec) # Frames per second
    return exe_sec, FPS

In [6]:
# Make input data
input_tensor_cpu = torch.rand([1,3,224,224])
input_tensor_gpu_full  = input_tensor_cpu.to('cuda')
input_tensor_gpu_half  = input_tensor_gpu_full.half()

In [3]:
# # # # # Get models  # # # # #  

# Pytorch, Float32
model_pytorch_fp32 = torchvision.models.resnet50(pretrained=False).eval().to('cuda')

# Pytorch, Float16
model_pytorch_fp16 = torchvision.models.resnet50(pretrained=False).eval().to('cuda').half()

# # # # # Convert models to TensorRT # # # # #

# TensorRT, Float32
model_trt_fp32 = torch2trt(model_pytorch_fp32, [input_tensor_gpu_full], fp16_mode=False)

# TensorRT, Float16
model_trt_fp16 = torch2trt(model_pytorch_fp16, [input_tensor_gpu_half], fp16_mode=True)

In [4]:
# Vectors to hold results
times_list = []
fps_list = []
names_list = []

# number of iterations per inference trial
niter = 500

# Do all the inferences
sec, fps = time_inference(model_pytorch_fp32, input_tensor_gpu_full, niter); times_list.append(sec); fps_list.append(fps); names_list.append("PyTorch  (FP32)")
sec, fps = time_inference(model_pytorch_fp16, input_tensor_gpu_half, niter); times_list.append(sec); fps_list.append(fps); names_list.append("PyTorch  (FP16)") 
sec, fps = time_inference(model_trt_fp32, input_tensor_gpu_full, niter); times_list.append(sec); fps_list.append(fps); names_list.append("TensorRT (FP32)")
sec, fps = time_inference(model_trt_fp16, input_tensor_gpu_half, niter); times_list.append(sec); fps_list.append(fps); names_list.append("TensorRT (FP16)")

# Print results
print("\nResults summary (%d images)\n===============" % (niter) )
for name, exe_sec, fps in zip(names_list, times_list, fps_list):
    print("%s: %0.2f seconds  (%d FPS),  %0.2fx PyTorch FP32 speed" % (name, exe_sec, fps, fps/fps_list[0]))


Results summary (500 images)
PyTorch  (FP32): 13.06 seconds  (38 FPS),  1.00x PyTorch FP32 speed
PyTorch  (FP16): 7.16 seconds  (69 FPS),  1.82x PyTorch FP32 speed
TensorRT (FP32): 4.24 seconds  (117 FPS),  3.08x PyTorch FP32 speed
TensorRT (FP16): 1.46 seconds  (342 FPS),  8.95x PyTorch FP32 speed


## Current results using Jetson AGX Xavier

### Configuration
- Platform: NVIDIA Jetson AGX Xavier
- Jetpack SDK: 4.4 ([L4T R32.4.3](https://developer.nvidia.com/embedded/jetpack))
- Power mode: [MAXN](https://www.jetsonhacks.com/2018/10/07/nvpmodel-nvidia-jetson-agx-xavier-developer-kit/) (`$ sudo nvpmodel -m 0`) 
- Docker source image: `nvcr.io/nvidia/l4t-ml:r32.4.3-py3`  ([link](https://ngc.nvidia.com/catalog/containers/nvidia:l4t-ml))
- NVIDIA `torch2trt` for converting PyTorch models to TensorRT ([link](https://github.com/NVIDIA-AI-IOT/torch2trt))

### Results
| Model| Execution time (sec) | Throughput (FPS) | Speed-up (vs. PyTorch FP32) |
|:----------:|:----------------------:|:----------:|:--------:|
|   PyTorch  (FP32) |        13.06        |   38   |     1.00 |
|   PyTorch  (FP16) |        7.16        |   69   |     1.82 |
|   TensorRT (FP32) |        4.24        |   117   |     3.08 |
|   TensorRT (FP16) |        1.46        |   342   |     8.95 |

Our results indicate that inference using `Float16` precision on Resnet50 yields about a 2x speedup compared to inference using `Float32` precision in PyTorch. Moreover, converting the model to TensorRT results in speed-ups of 3x and 9x for `Float32` and `Float16`, respectively, compared to `Float32` in PyTorch. This is significant, but still falls far short of the results posted on the [NVIDIA Developer Blog](https://developer.nvidia.com/blog/jetson-xavier-nx-the-worlds-smallest-ai-supercomputer/) (shown below), which are over 5x faster than our best performance here (1941 FPS vs. our 342 FPS for 224x224 images).  

NVIDIA's results use `int8` precision for inference versus `FP16` used by our fastest model. This discrepancy probably accounts for a large fraction of the difference in performance. I need to read more to figure out how to convert models to `int8`.

![Image](https://developer.download.nvidia.com/devblogs/inferencing-performance.png)

## Caveats
The TensorRT models appear not to run with batch sizes greater than 1. The code doesn't crash, but the inference lines execute immediately for any batch size > 1. I verified via `tegrastats` that the device is not running out of memory. I currently don't know the cause of this issue, and it would be interesting to compare inference performance using larger batches.  