# Goal of the POC

This POC aims to profile different nodes of DNN execution, which is a crucial step in evaluating our framework's effectiveness. Initially, we will focus on gathering execution time and memory usage data, storing it in a Profile DB for further downstream processes.

Our initial goal is to isolate nodes within a deployment target device. For example, if the device has a single workstation with a CPU (two cores) and a GPU, we will profile a sample of the DNN forward pass on a layer-by-layer basis for each DNN task model. These profiles will then be stored during the initialization phase as discussed in the development plan.

**Note - This memory usage also means the memory transfers involved.

After familiarizing myself with the PyTorch profiler, I've devised two approaches which I have discussed below.

# First Approach

n this approach, I utilized the latest PyTorch profiler. Despite its recent updates, the profiler still faces limitations, such as the stack_trace and module heiarchy are there are as flags but still do not give proper stack_traces refer [Openissues](https://github.com/pytorch/pytorch/issues/100253)
Due to which I even though I was able to get all the nitty gritty operations in the DNN forward pass profiled I was not able to link it to the named_modules(i.e the main layers involved in it.)

In [10]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset
from torchvision.models import vit_b_16

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN, CNN, or ViT)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64
    elif model_type == 'ViT':
        # Data for Vision Transformer (ViT) uses images of size 224x224
        input_shape = (64, 3, 224, 224)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Function to format and print profiler events in a table
def print_profiler_events(profiler):
    # Prepare header
    print(f"{'Name':<40} {'Self CPU (ms)':<15} {'CPU Total (ms)':<15} {'CUDA Total (ms)':<15} {'Self CPU Mem (KB)':<20} {'Self CUDA Mem (KB)':<20}")
    print("-" * 150)

    # Initialize cumulative sums for CPU/GPU times and memory
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    # Iterate through each event and print the results
    for event in profiler.events():
        cpu_time = event.self_cpu_time_total / 1000  # Convert to ms
        cuda_time = event.cuda_time_total / 1000  # Convert to ms
        cpu_mem = (event.self_cpu_memory_usage or 0) / 1024  # Convert to KB
        cuda_mem = (event.self_cuda_memory_usage or 0) / 1024  # Convert to KB

        # Add to cumulative sums
        total_cpu_time += cpu_time
        total_cuda_time += cuda_time
        total_cpu_mem += cpu_mem
        total_cuda_mem += cuda_mem

        # Print each event in tabular format
        print(f"{event.name:<40} {cpu_time:<15.2f} {event.cpu_time_total / 1000:<15.2f} {cuda_time:<15.2f} {cpu_mem:<20.2f} {cuda_mem:<20.2f}")

    # Print cumulative totals
    print("-" * 150)
    print(f"{'Total':<40} {total_cpu_time:<15.2f} {'-':<15} {total_cuda_time:<15.2f} {total_cpu_mem:<20.2f} {total_cuda_mem:<20.2f}")
    print("-" * 150)

# Function to profile the forward pass and print all recorded events in tabular format
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True, 
        with_stack=False,      
        with_flops=True,
        with_modules=True
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()

    # Print the profiling events in a table format
    print_profiler_events(profiler)

# Run profiling on CPU and GPU for FFN, CNN, and ViT
def main():
    for model_type in ['CNN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with 6 hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=2)
        elif model_type == 'CNN':
            model = CNN()
        elif model_type == 'ViT':
            # Load the Vision Transformer (ViT) model from torchvision
            model = vit_b_16(pretrained=False)

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling CNN on CPU...
Name                                     Self CPU (ms)   CPU Total (ms)  CUDA Total (ms) Self CPU Mem (KB)    Self CUDA Mem (KB)  
------------------------------------------------------------------------------------------------------------------------------------------------------
aten::empty                              0.01            0.01            0.00            0.01                 0.00                
aten::random_                            0.01            0.01            0.00            0.00                 0.00                
aten::item                               0.01            0.01            0.00            0.00                 0.00                
aten::_local_scalar_dense                0.00            0.00            0.00            0.00                 0.00                
[memory]                                 0.00            0.00            0.00            -0.01                0.00                
enumerate(DataLoader)#_SingleProcessDat

  cuda_time = event.cuda_time_total / 1000  # Convert to ms
  cuda_mem = (event.self_cuda_memory_usage or 0) / 1024  # Convert to KB


Profiling CNN on GPU...
Name                                     Self CPU (ms)   CPU Total (ms)  CUDA Total (ms) Self CPU Mem (KB)    Self CUDA Mem (KB)  
------------------------------------------------------------------------------------------------------------------------------------------------------
aten::empty                              0.02            0.02            0.00            0.01                 0.00                
aten::random_                            0.02            0.02            0.00            0.00                 0.00                
aten::item                               0.01            0.01            0.00            0.00                 0.00                
aten::_local_scalar_dense                0.00            0.00            0.00            0.00                 0.00                
[memory]                                 0.00            0.00            0.00            -0.01                0.00                
enumerate(DataLoader)#_SingleProcessDat

# Second approach

In this approach I have used the older methods of hooks to manually calculate the profiling time required on each layer,but still needs some more refining to do ,as it does not still correctly capture cuda timings.

In [2]:
import torch
import torch.nn as nn
import time
from collections import defaultdict
from torchvision.models import vit_b_16

class Profiler:
    def __init__(self, model):
        self.model = model
        self.execution_times = defaultdict(float)
        self.memory_usage = defaultdict(lambda: {'input': 0, 'output': 0})
        self.start_times = {}
        self._register_hooks()

    def _register_hooks(self):
        for name, layer in self.model.named_modules():
            # Skip the root module to avoid duplicate entries
            if not name:
                continue
            # Register forward pre-hook to capture start time and input memory
            layer.register_forward_pre_hook(self._get_pre_hook(name))
            # Register forward hook to capture end time and output memory
            layer.register_forward_hook(self._get_forward_hook(name))

    def _get_pre_hook(self, layer_name):
        def pre_hook(module, input):
            # Record the start time
            self.start_times[layer_name] = time.time()
            # Record memory usage for input tensors
            self.memory_usage[layer_name]['input'] = sum(
                i.element_size() * i.nelement() for i in input if torch.is_tensor(i))
        return pre_hook

    def _get_forward_hook(self, layer_name):
        def forward_hook(module, input, output):
            # Synchronize CUDA to ensure accurate timing
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            # Record the end time and calculate execution time
            start_time = self.start_times.pop(layer_name, None)
            if start_time is not None:
                exec_time = time.time() - start_time
                self.execution_times[layer_name] += exec_time
            else:
                print(f"Warning: Start time for {layer_name} not recorded.")

            # Record memory usage for output tensors
            if isinstance(output, torch.Tensor):
                self.memory_usage[layer_name]['output'] = output.element_size() * output.nelement()
            elif isinstance(output, (tuple, list)):
                self.memory_usage[layer_name]['output'] = sum(
                    o.element_size() * o.nelement() for o in output if torch.is_tensor(o))
        return forward_hook

    def print_report(self):
        print("Layer-by-Layer Profiling Report:")
        print(f"{'Layer':<30} {'Execution Time (s)':<20} {'Input Memory (bytes)':<20} {'Output Memory (bytes)':<20}")
        for layer_name in self.execution_times:
            exec_time = self.execution_times[layer_name]
            input_mem = self.memory_usage[layer_name]['input']
            output_mem = self.memory_usage[layer_name]['output']
            print(f"{layer_name:<30} {exec_time:<20.6f} {input_mem:<20} {output_mem:<20}")

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = x.view(-1, 64 * 7 * 7)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x



class ViT(nn.Module):
    def __init__(self):
        super(ViT, self).__init__()
        self.vit = vit_b_16(pretrained=True)

    def forward(self, x):
        x = self.vit(x)
        return x

# Instantiate model and profiler
model = ViT()
profiler = Profiler(model)

# # Run the profiler on a sample input
# input_shape = (64, 1, 28, 28) 
# inputs = torch.randn(input_shape)
# output = model(inputs)

# # Print the profiling report
# profiler.print_report()


#  size for ViT models is 224x224
input_shape = (64, 3, 224, 224)  # Batch size of 64, 3 color channels (RGB)
inputs = torch.randn(input_shape)
output = model(inputs)

# Print the profiling report
profiler.print_report()


Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:02<00:00, 125MB/s]


Layer-by-Layer Profiling Report:
Layer                          Execution Time (s)   Input Memory (bytes) Output Memory (bytes)
vit.conv_proj                  0.457063             38535168             38535168            
vit.encoder.dropout            0.000500             38731776             38731776            
vit.encoder.layers.encoder_layer_0.ln_1 0.038357             38731776             38731776            
vit.encoder.layers.encoder_layer_0.self_attention 1.594063             116195328            38731776            
vit.encoder.layers.encoder_layer_0.dropout 0.000093             38731776             38731776            
vit.encoder.layers.encoder_layer_0.ln_2 0.057830             38731776             38731776            
vit.encoder.layers.encoder_layer_0.mlp.0 0.951166             38731776             154927104           
vit.encoder.layers.encoder_layer_0.mlp.1 0.206404             154927104            154927104           
vit.encoder.layers.encoder_layer_0.mlp.2 0.000077  

## Which method should I follow I intend to get the results somewhat like  [torchprof](https://pypi.org/project/torchprof/).But the library is depreceated as it uses older versions of PyTorch.