In [1]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import time
from PIL import Image


# Used to load the pre-trained model (Faster R-CNN)
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image
from torch import Tensor
import torchvision.transforms.functional as F

In [2]:
@dataclass(frozen=True)
class ModuleInferenceTime:
    inference_time: float
    module_name: str


In [3]:
def profile_layer_wise(module_name: str, 
                       profiling_results: list[ModuleInferenceTime], 
                       device: torch.device,
                       func: callable) -> callable:
    def wrapper(*args, **kwargs):
        # Synchronize GPU operations if available
        if device.type == 'cuda':
            torch.cuda.synchronize()
        
        start_time = time.time()
        
        # Call the original layer function
        output = func(*args, **kwargs)

        if device.type == 'cuda':
            torch.cuda.synchronize()
        end_time = time.time()
        
        # Calculate elapsed time in milliseconds
        elapsed_time = (end_time - start_time) * 1000

        profiling_results.append(
            ModuleInferenceTime(inference_time=elapsed_time,
                                module_name=module_name)
        )
        
        return output
    return wrapper

def pretty_description(original_code_name: str, module_description: str):
    return original_code_name + "\n" + module_description


# Create a function to apply the decorator to all layers of a model
def apply_profiling_decorator(model: nn.Module, 
                              profiling_results: list[ModuleInferenceTime],
                              device: torch.device,
                              recursive: bool=False):
    # Iterate over all the modules (layers) in the model
    for name, module in model.named_children():
        original_code_name = name

        if isinstance(module, nn.Module):
            module_description = str(module)
            full_description = pretty_description(original_code_name, module_description)
            module.forward = profile_layer_wise(full_description, 
                                                profiling_results, 
                                                device, 
                                                module.forward)
            if recursive:
                apply_profiling_decorator(module, profiling_results, device)


def small_warmup(model: nn.Module, device: torch.device, num_iterations: int = 10):
    # Warm-up the
    model.eval()
    with torch.no_grad():
        for _ in range(num_iterations):
            model(torch.rand(1, 3, 224, 224).to(device))


def get_mean_inference_time_per_layer(profiling_results: list[list[ModuleInferenceTime]]) -> list[ModuleInferenceTime]:
    mean_inference_times = []
    mapping = {}
    for profiling_result in profiling_results:
        for module_inference_time in profiling_result:
            if module_inference_time.module_name not in mapping:
                mapping[module_inference_time.module_name] = []
            mapping[module_inference_time.module_name].append(module_inference_time.inference_time)
    
    for module_name, inference_times in mapping.items():
        mean_inference_time = sum(inference_times) / len(inference_times)
        mean_inference_times.append(ModuleInferenceTime(inference_time=mean_inference_time,
                                                        module_name=module_name))
    return mean_inference_times


# Load the model (Faster-RCNN) and preprocessing (Original)

### Link to the pre-processing: https://github.com/pytorch/vision/blob/main/torchvision/transforms/_presets.py

#### Note, that in the original implementation of authors, each model has the preprocessing, i.e, transforms inside the model, thus it is not needed to additionally add it here. 

In [4]:
class ObjectDetectionPreprocess(nn.Module):
    def forward(self, img: Tensor) -> Tensor:
        if not isinstance(img, Tensor):
            img = F.pil_to_tensor(img)
        return F.convert_image_dtype(img, torch.float)

    def __repr__(self) -> str:
        return self.__class__.__name__ + "()"

    def describe(self) -> str:
        return (
            "Accepts ``PIL.Image``, batched ``(B, C, H, W)`` and single ``(C, H, W)`` image ``torch.Tensor`` objects. "
            "The images are rescaled to ``[0.0, 1.0]``."
        )


In [5]:
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
preprocess: callable = ObjectDetectionPreprocess()

## Implement Custom dataloader

In [6]:
import os
from torch.utils.data import Dataset, DataLoader

# InferenceDataset: Only loads images without annotations
class InferenceDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.image_filenames = os.listdir(image_dir)  # Get list of image filenames
        self.transform = transform

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Load the image
        img_path = os.path.join(self.image_dir, self.image_filenames[idx])
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, self.image_filenames[idx]  # Return the image and filename for reference

# Create a DataLoader/Dataset for the inference only
inference_dataset = InferenceDataset(image_dir="./data", transform=preprocess)
inference_dataloader = DataLoader(inference_dataset, batch_size=1, shuffle=False)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
model.to(device)

for batch, filenames in inference_dataloader:
    batch = batch.to(device)
    outputs = model(batch)
    boxes = []
    for i, output in enumerate(outputs):
        labels = [weights.meta["categories"][i] for i in output["labels"]]
        box = draw_bounding_boxes(batch[i], boxes=output["boxes"],
                            labels=labels,
                            colors="red",
                            width=4, font_size=30)
        boxes.append(box)
        
box = boxes[0]
im = to_pil_image(box.detach())
im.save("output.png")



## Finally, profiling the network - CPU

Non-recursive (for child modules) for now

In [8]:
# Per image
profiling_results_cpu_list: list[list[ModuleInferenceTime]] = []

In [9]:
device = torch.device("cpu")

model.eval()
model.to(device)
small_warmup(model, device)

profiling_results_cpu_list = []

for batch, filenames in inference_dataloader:
    profiling_results_cpu = []
    apply_profiling_decorator(model, profiling_results_cpu, device)
    batch = batch.to(device)
    start_time = time.time()
    outputs = model(batch)
    end_time = time.time()
    profiling_results_cpu_list.append(profiling_results_cpu)

profiling_results_cpu = get_mean_inference_time_per_layer(profiling_results_cpu_list)
print("Last Total inference time: ", (end_time - start_time) * 1e3, "ms")

Last Total inference time:  689.4416809082031 ms


Getting transform info from the model

In [10]:
results_transform = list(filter(lambda x: "transform" in x.module_name, profiling_results_cpu))

for result in results_transform:
    print(result.module_name, result.inference_time)


transform
GeneralizedRCNNTransform(
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    Resize(min_size=(800,), max_size=1333, mode='bilinear')
) 0.6180604298909506


Getting pure modular info from model

In [11]:
results_cpu = list(filter(lambda x: "transform" not in x.module_name, profiling_results_cpu))

print("Inference times for all layers, as a SUM:", sum([result.inference_time for result in results_cpu]))
for result in results_cpu:
    print(result.module_name, result.inference_time)

Inference times for all layers, as a SUM: 686.8317127227783
backbone
BackboneWithFPN(
  (body): IntermediateLayerGetter(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_

## Finally, profiling the network - GPU

Recursive (for child modules) - the sum inference time per operation is expected to be higher than total

In general, the best approach will be to visualize it as a graph of operations (submodules), but it requires additional libs

In [12]:
profiling_results_gpu_list: list[list[ModuleInferenceTime]] = []

In [13]:
device = torch.device("cuda")
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)

model.eval()
model.to(device)
small_warmup(model, device)

# Add recursive=True to profile all layers

for batch, filenames in inference_dataloader:
    profiling_results_gpu = []
    apply_profiling_decorator(model, profiling_results_gpu, device, recursive=True)
    batch = batch.to(device)
    start_time = time.time()
    outputs = model(batch)
    # Important to synchronize CUDA operations
    torch.cuda.synchronize()
    end_time = time.time()
    profiling_results_gpu_list.append(profiling_results_gpu)

profiling_results_gpu = get_mean_inference_time_per_layer(profiling_results_gpu_list)
print("Total inference time: ", (end_time - start_time) * 1e3, "ms")

Total inference time:  21.309375762939453 ms


In [14]:
results_gpu = list(filter(lambda x: "transform" not in x.module_name, profiling_results_gpu))

In [15]:
print("Cumulative sum for layers path, as a SUM:", sum([result.inference_time for result in results_gpu]))


Cumulative sum for layers path, as a SUM: 37.78918584187826


In [16]:
for result in results_gpu:
    print(result.module_name, result.inference_time)

body
IntermediateLayerGetter(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_si

# Summary

This approach allows profiling information to be post-processed and used in various table formats.

### Pros:
- Works with any model based on `nn.Module`.
- Easily customizable profiling wrappers.

### Cons:
- Accurately measuring GPU time using Python alone is challenging due to the need for `cuda.synchronize()`, which introduces overhead.
- If `model.forward` calls functions not registered as `nn.Module`, their execution time won't be captured. To address this, consider using `torch.profiler` or `cProfile`.
