In [None]:
import torch
import torchvision
from torch.utils.data import DataLoader

# CIFAR-10 training split (166 MB), stored under ./data and fetched with
# download=True. Images are converted with ToTensor and served in shuffled
# batches of 32.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()]
)
trainset = torchvision.datasets.CIFAR10(
    "./data",
    transform=transform,
    train=True,
    download=True,
)
trainloader = DataLoader(dataset=trainset, shuffle=True, batch_size=32)

# Define model
class SimpleCNN(torch.nn.Module):
    """Small two-stage convolutional classifier producing 10 class logits.

    Each stage is a 3x3 convolution (padding 1) followed by ReLU and a 2x2
    max-pool, so a 3x32x32 input is reduced to 32 channels of 8x8 before the
    final linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = torch.nn.Conv2d(16, 32, 3, padding=1)
        self.fc = torch.nn.Linear(32 * 8 * 8, 10)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: tensor of shape (batch, 3, 32, 32).

        Returns:
            Tensor of shape (batch, 10) with one row of logits per input image.

        Raises:
            RuntimeError: if the per-sample feature count after the second
                pooling step is not 32*8*8 (raised by the linear layer).
        """
        x = torch.nn.functional.relu(self.conv1(x))
        x = torch.nn.functional.max_pool2d(x, 2)
        x = torch.nn.functional.relu(self.conv2(x))
        x = torch.nn.functional.max_pool2d(x, 2)
        # Flatten everything except the batch axis. A view(-1, 32*8*8) would
        # silently spill extra spatial features into the batch dimension for
        # inputs of another size; flatten(1) keeps one row per sample so a
        # size mismatch is reported by self.fc instead.
        x = x.flatten(1)
        return self.fc(x)

# Network, loss and optimizer (Adam with its default hyperparameters).
model = SimpleCNN()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Demo run: two full passes over the training loader.
for epoch in range(2):
    for batch in trainloader:
        images, targets = batch
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/2 completed")

# Export the trained network to cifar10.onnx. Axis 0 of both the input and the
# output is declared dynamic under the name "batch_size"; the 1x3x32x32 random
# tensor only serves as the example input for tracing.
dummy_input = torch.randn(1, 3, 32, 32)
io_names = ("input", "output")
torch.onnx.export(
    model,
    dummy_input,
    "cifar10.onnx",
    input_names=[io_names[0]],
    output_names=[io_names[1]],
    dynamic_axes={name: {0: "batch_size"} for name in io_names},
)

# Build a TensorRT engine from the ONNX model, with dynamic batch-size support
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
# Explicit-batch network definition that the ONNX parser fills in.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# Parse ONNX model
with open("cifar10.onnx", "rb") as f:
    onnx_bytes = f.read()

if not parser.parse(onnx_bytes):
    # Raise rather than call exit(1): an exception aborts the cell and carries
    # the parser's messages, instead of trying to terminate the
    # interpreter/kernel.
    parse_errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
    raise RuntimeError("ONNX parsing errors:\n" + "\n".join(parse_errors))

# Builder configuration: set the FP16 builder flag and cap the workspace
# memory pool at 1 << 30 bytes.
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
config.set_flag(trt.BuilderFlag.FP16)

# Look up the network's first input; its name keys the optimization profile.
network_input = network.get_input(0)
input_name = network_input.name
input_shape = network_input.shape  # not referenced again in this cell

# One optimization profile covering batch sizes 1..64 (tuned for 32); the
# remaining dimensions are fixed at 3x32x32.
profile = builder.create_optimization_profile()
chw = (3, 32, 32)
profile.set_shape(
    input_name,
    min=(1, *chw),
    opt=(32, *chw),
    max=(64, *chw),
)
config.add_optimization_profile(profile)

# Build engine
serialized_engine = builder.build_serialized_network(network, config)

if not serialized_engine:
    # Raise instead of exit(1) so the failure stops the cell with a traceback.
    raise RuntimeError("Engine build failed!")

# Deserialize once as a sanity check that the serialized plan loads, and fail
# loudly if it does not (deserialize_cuda_engine signals failure with None).
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    raise RuntimeError("Engine deserialization failed!")

# Persist the serialized plan only after it has been validated above.
with open("cifar10.engine", "wb") as f:
    f.write(serialized_engine)

print("Engine build successful!")

In [None]:
%%writefile /content/project/app.py
from fastapi import FastAPI, File, UploadFile
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # Automatically initialize CUDA driver

app = FastAPI()

# Load TensorRT engine
def load_engine():
    """Deserialize the TensorRT engine stored in ``cifar10.engine``.

    The path is relative to the process working directory.

    Returns:
        The deserialized TensorRT engine.

    Raises:
        OSError: if the engine file cannot be opened.
        RuntimeError: if TensorRT cannot deserialize the file contents.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open("cifar10.engine", "rb") as f:
        engine_bytes = f.read()
    runtime = trt.Runtime(TRT_LOGGER)
    loaded = runtime.deserialize_cuda_engine(engine_bytes)
    if loaded is None:
        # Without this check the failure would only surface later as an
        # AttributeError on None when the execution context is created.
        raise RuntimeError("Failed to deserialize TensorRT engine from cifar10.engine")
    return loaded

engine = load_engine()
context = engine.create_execution_context()

# Allocate memory
# The engine is built with a dynamic batch axis, so the input binding reports
# -1 for that dimension until a concrete shape is selected on the context.
# Sizing buffers from the unresolved shape would yield a negative volume, so
# pin the input to a single 3x32x32 image first (the endpoint handles one
# image per request).
REQUEST_INPUT_SHAPE = (1, 3, 32, 32)
if -1 in tuple(context.get_binding_shape(0)):
    if not context.set_binding_shape(0, REQUEST_INPUT_SHAPE):
        raise RuntimeError(f"Engine rejected input shape {REQUEST_INPUT_SHAPE}")

# Page-locked host buffers plus matching device buffers for binding 0 (input)
# and binding 1 (output); all transfers and inference share one CUDA stream.
h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

@app.post("/predict/")
async def predict(file: UploadFile = File(...)):
    """Classify one uploaded image with the TensorRT engine.

    The upload is decoded, converted to RGB, resized to 32x32, scaled to
    [0, 1] and laid out channel-first before being copied into the shared
    input buffer.

    Args:
        file: the uploaded image file.

    Returns:
        A dict ``{"class": <index>}`` holding the argmax over the output buffer.

    Raises:
        HTTPException: with status 400 if the upload is empty or cannot be
            decoded as an image.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    payload = await file.read()
    # cv2.imdecode rejects an empty buffer and returns None for bytes it
    # cannot decode; report both as a client error instead of a server crash.
    img = None
    if payload:
        img = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=400, detail="Could not decode uploaded file as an image")

    # OpenCV decodes to BGR, but the model was trained on torchvision CIFAR-10
    # tensors, which are RGB; swap the channels so they match training.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (32, 32)).astype(np.float32)
    img = np.transpose(img, (2, 0, 1)) / 255.0
    np.copyto(h_input, img.ravel())

    # Host -> device copy, inference, device -> host copy, all queued on the
    # same stream; synchronize before reading the result.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()

    return {"class": int(np.argmax(h_output))}

In [None]:
%%writefile /content/project/Dockerfile
FROM nvcr.io/nvidia/pytorch:23.12-py3

WORKDIR /app

# Install dependencies before copying the project so these layers stay cached
# when only application files change. The apt package lists are removed in the
# same layer to keep the image smaller.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

# python-multipart is required by FastAPI for File/UploadFile endpoints.
RUN pip install --no-cache-dir \
    fastapi \
    uvicorn \
    python-multipart \
    numpy \
    opencv-python-headless \
    pycuda \
    tensorrt

# NOTE(review): app.py opens "cifar10.engine" relative to /app, so the engine
# file must be part of the build context copied here - confirm.
COPY . .

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
