In [1]:
import torch
from torch import nn

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()        
        self.conv1 = nn.Sequential(         
            nn.Conv2d(
                in_channels=1,              
                out_channels=16,            
                kernel_size=5,              
                stride=1,                   
                padding=2,                  
            ),                              
            nn.ReLU(),                      
            nn.MaxPool2d(kernel_size=2),    
        )
        self.conv2 = nn.Sequential(         
            nn.Conv2d(16, 32, 5, 1, 2),     
            nn.ReLU(),                      
            nn.MaxPool2d(2),                
        )        # fully connected layer, output 10 classes
        self.out = nn.Linear(32 * 7 * 7, 10)    
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)        # flatten the output of conv2 to (batch_size, 32 * 7 * 7)
        x = x.view(x.size(0), -1)       
        output = self.out(x)
        return output, x    # return x for visualization

In [2]:
# load MNIST dataset
from torchvision import datasets, transforms
import torch.utils.data as Data

train_data = datasets.MNIST(
    root='./mnist',
    train=True,                                     
    transform=transforms.ToTensor(),                
    download=True,                                  
)
test_data = datasets.MNIST(root='./mnist', train=False)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# train model
model = CNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

BATCH_SIZE = 64
train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(5):
    for step, (b_x, b_y) in enumerate(train_loader):
        b_x, b_y = b_x.to(device), b_y.to(device)
        output = model(b_x)[0]               
        loss = loss_func(output, b_y)        
        optimizer.zero_grad()               
        loss.backward()                     
        optimizer.step()                    

        if step % 100 == 0:
            test_output, last_layer = model(test_data.test_data.to(device).view(-1, 1, 28, 28).float())
            pred_y = torch.max(test_output, 1)[1].data.squeeze()
            accuracy = (pred_y == test_data.test_labels.to(device)).sum().item() / float(test_data.test_labels.size(0))
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.cpu().numpy(), '| test accuracy: %.2f' % accuracy)


  return F.conv2d(input, weight, bias, self.stride,


Epoch:  0 | train loss: 2.3012 | test accuracy: 0.10
Epoch:  0 | train loss: 0.1905 | test accuracy: 0.91
Epoch:  0 | train loss: 0.2159 | test accuracy: 0.95
Epoch:  0 | train loss: 0.1983 | test accuracy: 0.96
Epoch:  0 | train loss: 0.1715 | test accuracy: 0.97
Epoch:  0 | train loss: 0.1688 | test accuracy: 0.97
Epoch:  0 | train loss: 0.2440 | test accuracy: 0.97
Epoch:  0 | train loss: 0.0654 | test accuracy: 0.98
Epoch:  0 | train loss: 0.1117 | test accuracy: 0.98
Epoch:  0 | train loss: 0.0451 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0539 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0157 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0749 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0195 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0408 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0159 | test accuracy: 0.98
Epoch:  1 | train loss: 0.1830 | test accuracy: 0.98
Epoch:  1 | train loss: 0.1128 | test accuracy: 0.98
Epoch:  1 | train loss: 0.0917 | test accuracy

### Converse to tensorrt via onnx

In [4]:
import onnx

# generate ONNX model
torch.onnx.export(model, torch.randn(1, 1, 28, 28).to(device), "mnist.onnx", 
                  verbose=True, input_names=['input'], output_names=['output', 'hidden'])
onnx_model = onnx.load("mnist.onnx")

Exported graph: graph(%input : Float(1, 1, 28, 28, strides=[784, 784, 28, 1], requires_grad=0, device=cuda:0),
      %conv1.0.weight : Float(16, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=1, device=cuda:0),
      %conv1.0.bias : Float(16, strides=[1], requires_grad=1, device=cuda:0),
      %conv2.0.weight : Float(32, 16, 5, 5, strides=[400, 25, 5, 1], requires_grad=1, device=cuda:0),
      %conv2.0.bias : Float(32, strides=[1], requires_grad=1, device=cuda:0),
      %out.weight : Float(10, 1568, strides=[1568, 1], requires_grad=1, device=cuda:0),
      %out.bias : Float(10, strides=[1], requires_grad=1, device=cuda:0)):
  %/conv1/conv1.0/Conv_output_0 : Float(1, 16, 28, 28, strides=[12544, 784, 28, 1], requires_grad=0, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1], onnx_name="/conv1/conv1.0/Conv"](%input, %conv1.0.weight, %conv1.0.bias), scope: __main__.CNN::/torch.nn.modules.container.Sequential::conv1/torch.nn.module

The build phase

In [1]:
import tensorrt as trt

# create builder and network
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)

# parse onnx
parser = trt.OnnxParser(network, logger)
success = parser.parse_from_file("mnist.onnx")

config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20)
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 1, 28, 28), (1, 1, 28, 28), (1, 1, 28, 28))

# create engine
engine = builder.build_serialized_network(network, config)

with open("mnist.engine", "wb") as f:
    f.write(engine)

[05/25/2024-11:22:31] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


Deserialize a plan

In [2]:
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(engine)

Perform inference

In [12]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

context = engine.create_execution_context()

# populate input buffer
input_shape = (1, 1, 28, 28)
input_data = torch.randn(input_shape).to(device)
context.set_tensor_address("input", input_data.data_ptr())

# populate output buffer
output_shape = (1, 10)
output_data = torch.zeros(output_shape).to(device)
context.set_tensor_address("output", output_data.data_ptr())

# get pointer to CUDA stream
stream = torch.cuda.current_stream().cuda_stream

# start inference
context.execute_v2(
    bindings=[input_data.data_ptr(), output_data.data_ptr()],
    # stream_handle=stream
)

[05/25/2024-11:39:54] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading
[05/25/2024-11:39:55] [TRT] [E] 1: [executionContext.cpp::executeInternal::1011] Error Code 1: Cuda Runtime (an illegal memory access was encountered)


False