In [1]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import time
from torch2trt import torch2trt

# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load a pretrained ResNet18 model
model = models.resnet18(pretrained=True).eval().to(device)

# Create a dummy input
x = torch.randn((1, 3, 224, 224)).to(device)

# ========================
# Torch2TRT Conversion
# ========================
model_trt = torch2trt(model, [x])

# ========================
# Timing Utility
# ========================
def time_model(model, x, runs=100):
    with torch.no_grad():
        # Warm-up
        for _ in range(10):
            _ = model(x)

        torch.cuda.synchronize()
        start = time.time()
        for _ in range(runs):
            _ = model(x)
        torch.cuda.synchronize()
        end = time.time()
    return (end - start) / runs * 1000  # ms per inference

# ========================
# Compare Inference Times
# ========================
torch_time = time_model(model, x)
trt_time = time_model(model_trt, x)

print(f"PyTorch Inference Time: {torch_time:.3f} ms")
print(f"TensorRT Inference Time: {trt_time:.3f} ms")
print(f"Speedup: {torch_time / trt_time:.2f}x")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
52.3%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



PyTorch Inference Time: 1.463 ms
TensorRT Inference Time: 0.662 ms
Speedup: 2.21x


In [4]:
import torch
import torchvision.models as models
import time
import onnx
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # Needed for CUDA context

# ========================
# 1. Load PyTorch Model
# ========================
device = torch.device("cuda")
model = models.resnet18(pretrained=True).eval().to(device)

# Dummy input for tracing
x = torch.randn((1, 3, 224, 224)).to(device)

# ========================
# 2. Export to ONNX
# ========================
onnx_file = "resnet18.onnx"
torch.onnx.export(
    model, x, onnx_file,
    input_names=["input"],
    output_names=["output"],
    opset_version=11,
    do_constant_folding=True
)

# Check the exported model
onnx_model = onnx.load(onnx_file)
onnx.checker.check_model(onnx_model)
print("ONNX export succeeded.")

# ========================
# 3. Build TensorRT Engine
# ========================
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_file_path, fp16=False):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:

        # Parse ONNX
        with open(onnx_file_path, 'rb') as f:
            if not parser.parse(f.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                raise RuntimeError("ONNX parsing failed")

        # Create config and set workspace size and fp16
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30  # 1 GB

        if fp16:
            config.set_flag(trt.BuilderFlag.FP16)

        return builder.build_engine(network, config)


engine = build_engine(onnx_file,fp16=True)
print("TensorRT engine built.")

# ========================
# 4. Run Inference with TensorRT
# ========================
class TRTModule:
    def __init__(self, engine):
        self.engine = engine
        self.context = engine.create_execution_context()
        self.inputs = []
        self.outputs = []
        self.bindings = []

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            self.bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                self.input_shape = engine.get_binding_shape(binding)
                self.inputs.append((host_mem, device_mem))
            else:
                self.outputs.append((host_mem, device_mem))

    def infer(self, input_tensor):
        np.copyto(self.inputs[0][0], input_tensor.ravel())
        cuda.memcpy_htod(self.inputs[0][1], self.inputs[0][0])
        self.context.execute_v2(bindings=self.bindings)
        cuda.memcpy_dtoh(self.outputs[0][0], self.outputs[0][1])
        return self.outputs[0][0]

# ========================
# 5. Benchmark PyTorch vs TensorRT
# ========================
def time_pytorch(model, x, runs=100):
    with torch.no_grad():
        for _ in range(10): model(x)
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(runs):
            model(x)
        torch.cuda.synchronize()
        end = time.time()
    return (end - start) / runs * 1000

def time_tensorrt(trt_model, x, runs=100):
    input_np = x.cpu().numpy().astype(np.float32)
    for _ in range(10): trt_model.infer(input_np)
    start = time.time()
    for _ in range(runs):
        trt_model.infer(input_np)
    end = time.time()
    return (end - start) / runs * 1000

# Run and compare
trt_model = TRTModule(engine)
torch_time = time_pytorch(model, x)
trt_time = time_tensorrt(trt_model, x)

print(f"\nPyTorch Inference Time: {torch_time:.2f} ms")
print(f"TensorRT Inference Time: {trt_time:.2f} ms")
print(f"Speedup: {torch_time / trt_time:.2f}x")




ONNX export succeeded.


  config.max_workspace_size = 1 << 30  # 1 GB
  return builder.build_engine(network, config)


[07/12/2025-20:02:37] [TRT] [W] TensorRT encountered issues when converting weights between types and that could affect accuracy.
[07/12/2025-20:02:37] [TRT] [W] If this is not the desired behavior, please modify the weights or retrain with regularization to adjust the magnitude of the weights.
[07/12/2025-20:02:37] [TRT] [W] Check verbose logs for the list of affected weights.
[07/12/2025-20:02:37] [TRT] [W] - 23 weights are affected by this issue: Detected subnormal FP16 values.
TensorRT engine built.
[07/12/2025-20:02:37] [TRT] [W] - 12 weights are affected by this issue: Detected values less than smallest positive FP16 subnormal value and converted them to the FP16 minimum subnormalized value.

PyTorch Inference Time: 1.47 ms
TensorRT Inference Time: 0.32 ms
Speedup: 4.64x


  size = trt.volume(engine.get_binding_shape(binding))
  dtype = trt.nptype(engine.get_binding_dtype(binding))
  if engine.binding_is_input(binding):
  self.input_shape = engine.get_binding_shape(binding)
