In [None]:
!pip install tensorrt

Collecting tensorrt
  Downloading tensorrt-10.9.0.34.tar.gz (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_cu12==10.9.0.34 (from tensorrt)
  Downloading tensorrt_cu12-10.9.0.34.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_cu12_libs==10.9.0.34 (from tensorrt_cu12==10.9.0.34->tensorrt)
  Downloading tensorrt_cu12_libs-10.9.0.34.tar.gz (704 bytes)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tensorrt_cu12_bindings==10.9.0.34 (from tensorrt_cu12==10.9.0.34->tensorrt)
  Downloading tensorrt_cu12_bindings-10.9.0.34-cp311-none-manylinux_2_28_x86_6

1. Model Training and TensorRT Engine Generation

In [None]:
# ========================================
# 🔧 PyTorch Training on CIFAR-10 Dataset
# ========================================
import torch
import torchvision
from torch.utils.data import DataLoader

# CIFAR-10 training split (downloaded into ./data on first use); every sample
# is passed through ToTensor before batching.
transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
    ]
)
trainset = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
# Shuffled mini-batches of 32 samples
trainloader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)

# Define a simple CNN model
class SimpleCNN(torch.nn.Module):
    """Two-stage convolutional classifier that emits 10 logits per image.

    Each stage is a 3x3 convolution with padding 1, followed by ReLU and a
    2x2 max-pool. The linear head expects 32 channels on an 8x8 grid, which
    is what a 3x32x32 input is reduced to after the two pooling steps.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=10)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch of images."""
        F = torch.nn.functional
        for conv in (self.conv1, self.conv2):
            # convolution -> ReLU -> halve the spatial resolution
            x = F.max_pool2d(F.relu(conv(x)), 2)
        # Collapse channels and spatial dims into one feature vector per row
        flat = x.view(-1, self.fc.in_features)
        return self.fc(flat)

# Seed the global RNG before the model is built so that both the weight
# initialisation and the DataLoader's shuffling order repeat across runs.
SEED = 42
NUM_EPOCHS = 2
torch.manual_seed(SEED)

# Initialize model and training components
model = SimpleCNN()
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

# Train for NUM_EPOCHS passes over the training set
model.train()
for epoch in range(NUM_EPOCHS):
    for inputs, labels in trainloader:
        # Clear gradients left over from the previous step before backprop
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} completed")

100%|██████████| 170M/170M [00:03<00:00, 44.5MB/s]


Epoch 1/2 completed
Epoch 2/2 completed


In [None]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [None]:
# ================================
# 📤 Export Trained Model to ONNX
# ================================
# A single 3x32x32 sample is used as the example input. Axis 0 of both the
# input and the output is declared dynamic under the name "batch_size".
example_input = torch.randn(1, 3, 32, 32)
batch_axis = {0: "batch_size"}

torch.onnx.export(
    model,
    example_input,
    "cifar10.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": dict(batch_axis), "output": dict(batch_axis)},
)

In [None]:
# ===============================================
# ⚙️ Convert ONNX to TensorRT Engine (FP16 Mode)
# ===============================================
# Failures raise RuntimeError instead of calling exit(): inside a notebook
# exit() tears down the kernel rather than reporting the problem in the cell.
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# Parse ONNX model
with open("cifar10.onnx", "rb") as f:
    onnx_parsed = parser.parse(f.read())
if not onnx_parsed:
    parse_errors = "\n".join(str(parser.get_error(i)) for i in range(parser.num_errors))
    raise RuntimeError(f"ONNX parsing errors:\n{parse_errors}")

# Configure engine
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB

# Set dynamic input shapes (batch size): the engine accepts 1..64 images per
# call and is tuned for batches of 32.
profile = builder.create_optimization_profile()
input_name = network.get_input(0).name
profile.set_shape(input_name, min=(1, 3, 32, 32), opt=(32, 3, 32, 32), max=(64, 3, 32, 32))
config.add_optimization_profile(profile)

# Build and serialize engine
serialized_engine = builder.build_serialized_network(network, config)
if not serialized_engine:
    raise RuntimeError("Engine build failed!")

# Round-trip through the runtime to confirm the serialized plan actually loads
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    raise RuntimeError("Engine build failed: serialized engine could not be deserialized")

# Save engine to file
with open("cifar10.engine", "wb") as f:
    f.write(serialized_engine)

print("Engine build successful!")

Engine build successful!


In [None]:
mv /content/cifar10.engine /content/project/cifar10.engine

In [None]:
mv /content/cifar10.onnx /content/project/cifar10.onnx

In [None]:
!pip install fastapi
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.2-py3-none-any.whl.metadata (3.0 kB)
Downloading pytools-2025.1.2-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2025.1-cp311-cp311-linux_x86_64.whl size=660425 sha256=b6d4764b2ef07f549546aa812d89f0302ef88be53a1863dbe4000375f11136ec
  Stored in directory: /root/.cache/pip/wheels/77/7e/6c/d2d1451ea6424cdc3d67b36c

2. FastAPI App for Inference

In [None]:
# ===============================
# 📄 FastAPI App (app.py)
# ===============================
# Save this as /content/project/app.py
%%writefile /content/project/app.py
from fastapi import FastAPI, File, UploadFile
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # Automatically initialize CUDA driver

app = FastAPI()

# Load TensorRT engine
def load_engine():
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open("cifar10.engine", "rb") as f:
        runtime = trt.Runtime(TRT_LOGGER)
        return runtime.deserialize_cuda_engine(f.read())

engine = load_engine()
context = engine.create_execution_context()

# Allocate memory
h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

@app.post("/predict/")
async def predict(file: UploadFile = File(...)):
    img = np.frombuffer(await file.read(), dtype=np.uint8)
    img = cv2.imdecode(img, cv2.IMREAD_COLOR)
    img = cv2.resize(img, (32, 32)).astype(np.float32)
    img = np.transpose(img, (2, 0, 1)) / 255.0
    np.copyto(h_input, img.ravel())

    # Run inference
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    return {"class": int(np.argmax(h_output))}

Overwriting /content/project/app.py


3. Dockerfile to Containerize the App

In [None]:
%%writefile /content/project/Dockerfile
# ===============================
# 🐳 Dockerfile for FastAPI + TensorRT
# ===============================
# Generated from the notebook with %%writefile; saved as /content/project/Dockerfile
FROM nvcr.io/nvidia/pytorch:23.12-py3

# Set working directory
WORKDIR /app

# Install required packages before copying the project, so these layers stay
# cached when only app.py or the engine file changes.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

# python-multipart is required by FastAPI to parse UploadFile form data.
# tensorrt is pinned to the version that built cifar10.engine: a serialized
# engine can only be loaded by the TensorRT version that produced it.
# NOTE(review): the engine is also specific to the GPU model it was built on -
# confirm the deployment GPU matches, or rebuild the engine on the target.
RUN pip install --no-cache-dir fastapi uvicorn python-multipart numpy opencv-python-headless pycuda tensorrt==10.9.0.34

COPY . .

# Expose the app port
EXPOSE 8000

# Start the FastAPI server
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

Writing /content/project/Dockerfile


4. Zip the Project Folder for Download

In [None]:
!zip -r /content/project.zip /content/project

  adding: content/project/ (stored 0%)
  adding: content/project/Dockerfile (deflated 28%)
  adding: content/project/app.py (deflated 52%)
  adding: content/project/cifar10.onnx (deflated 8%)
  adding: content/project/cifar10.engine (deflated 29%)
