In [None]:
import torch
import torch.nn as nn
import torch.profiler
import os
import psutil  # For detecting CPU cores
import pandas as pd  # For displaying the results in a tabular format

# Define a simple model for profiling
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(100, 50)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Create a model instance and generate input data
model = SimpleModel()
input_data = torch.randn(1, 100)

# Function to set CPU affinity (bind the task to a specific CPU core)
def set_cpu_affinity(core_id):
    pid = os.getpid()
    os.sched_setaffinity(pid, {core_id})

# Function to reset CPU affinity (allow the system to use all available cores)
def reset_cpu_affinity():
    pid = os.getpid()
    os.sched_setaffinity(pid, set(range(psutil.cpu_count(logical=True))))  # Reset to all available cores

# Profiling function for the model
def profile_model(model, input_data, device_type="cpu", core_id=None):
    if device_type == "cpu" and core_id is not None:
        set_cpu_affinity(core_id)  # Set CPU affinity for specific core
    elif device_type == "gpu":
        reset_cpu_affinity()  # Reset CPU affinity for GPU profiling
        model = model.cuda()  # Move the model to the GPU
        input_data = input_data.cuda()  # Move the input data to the GPU

    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=1, warmup=0, active=3, repeat=1),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    ) as prof:
        # Synchronize before starting the profiling
        if device_type == "gpu":
            torch.cuda.synchronize()

        # Forward pass
        for _ in range(5):
            with torch.profiler.record_function("model_inference"):
                output = model(input_data)

            # Synchronize the GPU to ensure that profiling catches all operations
            if device_type == "gpu":
                torch.cuda.synchronize()
            prof.step()  # Ensure the profiler steps through the operations

    # Extract the profiling information into a list of dictionaries
    result = []
    for event in prof.key_averages():
        result.append({
            "Name": event.key,
            "Self CPU time": event.self_cpu_time_total,
            "Total CPU time": event.cpu_time_total,
            "Self CUDA time": getattr(event, "self_cuda_time_total", "-"),  # Handle missing attribute
            "Total CUDA time": getattr(event, "cuda_time_total", "-"),  # Handle missing attribute
            "CPU Memory Usage": getattr(event, "cpu_memory_usage", "-"),  # Handle missing attribute
            "CUDA Memory Usage": getattr(event, "cuda_memory_usage", "-"),  # Handle missing attribute
        })
    return result

# Automatically detect the number of CPU cores (performance cores)
def get_num_cpu_cores():
    return psutil.cpu_count(logical=False)  # Number of physical cores (not hyperthreads)

# Automatically detect the number of GPUs available
def get_num_gpus():
    return torch.cuda.device_count()  # Number of available GPUs

# Main profiling function
def main():
    num_cpu_cores = get_num_cpu_cores()  # Get number of CPU cores
    num_gpus = get_num_gpus()  # Get number of GPUs

    print(f"Detected {num_cpu_cores} CPU cores")
    print(f"Detected {num_gpus} GPU(s)")

    profiling_results = []

    # Profile on each CPU core
    for core_id in range(num_cpu_cores):
        print(f"\nProfiling on CPU core {core_id}...")
        cpu_results = profile_model(model, input_data, device_type="cpu", core_id=core_id)
        for result in cpu_results:
            profiling_results.append({
                "Node": f"CPU Core {core_id}",
                "Layer": result["Name"],
                "WCET (CPU)": f"{result['Total CPU time'] / 1e3:.4f} ms" if result['Total CPU time'] else "-",
                "WCET (GPU)": "-",
                "CPU Memory": f"{result['CPU Memory Usage'] / 1024:.1f} KB" if isinstance(result['CPU Memory Usage'], (int, float)) else "-",
                "GPU Memory": "-"
            })

    # Profile on each GPU (if available)
    if num_gpus > 0:
        for gpu_id in range(num_gpus):
            print(f"\nProfiling on GPU {gpu_id}")
            gpu_results = profile_model(model, input_data, device_type="gpu")
            for result in gpu_results:
                profiling_results.append({
                    "Node": f"GPU {gpu_id}",
                    "Layer": result["Name"],
                    "WCET (CPU)": "-",
                    "WCET (GPU)": f"{result['Total CUDA time'] / 1e3:.4f} ms" if isinstance(result['Total CUDA time'], (int, float)) else "-",
                    "CPU Memory": "-",
                    "GPU Memory": f"{result['CUDA Memory Usage'] / 1024:.1f} KB" if isinstance(result['CUDA Memory Usage'], (int, float)) else "-"
                })

    # Convert profiling results to a pandas DataFrame for display
    df_profiling = pd.DataFrame(profiling_results)

    # Display the results
    print("\nProfiling results:")
    print(df_profiling)

    # Optionally, you can save the profiling results to a CSV file
    df_profiling.to_csv("profiling_results.csv", index=False)

# Run the main function
if __name__ == "__main__":
    main()


Detected 1 CPU cores
Detected 1 GPU(s)

Profiling on CPU core 0...

Profiling on GPU 0

Profiling results:
          Node                                              Layer WCET (CPU)  \
0   CPU Core 0                                      ProfilerStep*  0.9285 ms   
1   CPU Core 0                                    model_inference  0.7666 ms   
2   CPU Core 0                                       aten::linear  0.2660 ms   
3   CPU Core 0                                            aten::t  0.0857 ms   
4   CPU Core 0                                    aten::transpose  0.0411 ms   
5   CPU Core 0                                   aten::as_strided  0.0147 ms   
6   CPU Core 0                                        aten::addmm  0.1564 ms   
7   CPU Core 0                                       aten::expand  0.0192 ms   
8   CPU Core 0                                        aten::copy_  0.0245 ms   
9   CPU Core 0                                 aten::resolve_conj  0.0017 ms   
10  CPU Core 

  warn("Profiler won't be using warmup, this can skew profiler results")


In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Define a simple Feedforward Neural Network
class FFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Set up data for the network
input_size = 784  # Example input size (like for MNIST dataset)
hidden_size = 128
output_size = 10  # Number of output classes

model = FFN(input_size, hidden_size, output_size)

# Generate some random data for demonstration
batch_size = 64  # Increased batch size
inputs = torch.randn(batch_size, input_size)
targets = torch.randint(0, output_size, (batch_size,))

# Create a function to profile each layer on CPU/GPU
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=True,
        with_flops=True  # Enable FLOP counting
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = nn.CrossEntropyLoss()(outputs, targets)
            loss.backward()

            # Ensure all GPU work is done before stepping the profiler
            torch.cuda.synchronize()  # Synchronize CUDA events before logging
            profiler.step()

    # Print profiling results
    for event in profiler.events():
          print(f"Name: {event.name}, CPU Time: {event.cpu_time_total}us, "
                f"CUDA Time: {event.cuda_time_total}us, Memory Used: {event.self_cpu_memory_usage} bytes")

# Step 4: Run profiling on CPU and GPU
def main():
    dataset = TensorDataset(inputs, targets)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    # Profile on CPU
    print("Profiling on CPU...")
    profile_layer_execution(model, device='cpu', dataloader=dataloader)

    # Profile on GPU
    if torch.cuda.is_available():
        print("Profiling on GPU...")
        profile_layer_execution(model, device='cuda', dataloader=dataloader)

if __name__ == "__main__":
    main()


Profiling on CPU...
Name: cudaDeviceSynchronize, CPU Time: 16.489000000000033us, CUDA Time: 0us, Memory Used: 0 bytes
Profiling on GPU...
Name: cudaDeviceSynchronize, CPU Time: 9.926000000000045us, CUDA Time: 0us, Memory Used: 0 bytes


  f"CUDA Time: {event.cuda_time_total}us, Memory Used: {event.self_cpu_memory_usage} bytes")


In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Define a Convolutional Neural Network
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x


# Define a simple Feedforward Neural Network (FFN)
class FFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Step 2: Set up data for the FFN
batch_size = 64
input_size = 100  # Example input size
hidden_size = 50
output_size = 10

# Random input data and target labels
inputs = torch.randn(batch_size, input_size)
targets = torch.randint(0, output_size, (batch_size,))


# Step 2: Set up data for the CNN
batch_size = 64  # Larger batch size to increase workload
input_shape = (batch_size, 1, 28, 28)  # Example input size (like MNIST)

inputs = torch.randn(input_shape)  # Random input data
targets = torch.randint(0, 10, (batch_size,))  # Random target labels

# Step 3: Create a function to profile each layer on CPU/GPU
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=False,
        with_flops=True  # Enable FLOP counting
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch for simplicity
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = nn.CrossEntropyLoss()(outputs, targets)
            loss.backward()

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()
            profiler.step()

    # Print profiling results
    print(profiler.key_averages().table(
        sort_by="cuda_time_total", row_limit=10))

# Step 4: Run profiling on CPU and GPU
def main():
    dataset = TensorDataset(inputs, targets)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model = CNN()

    # Profile on CPU
    print("Profiling on CPU...")
    profile_layer_execution(model, device='cpu', dataloader=dataloader)

    # Profile on GPU
    if torch.cuda.is_available():
        print("Profiling on GPU...")
        profile_layer_execution(model, device='cuda', dataloader=dataloader)
    else:
        print("CUDA is not available on this system.")

if __name__ == "__main__":
    main()

Profiling on CPU...
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total KFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         2.65%       1.342ms        99.94%      50.674ms      50.674ms       1.80 Mb      -9.22 Mb             1            --  
                                            aten::empty         0.20%     102.767us         0.20%     102.767us       7.905us      10.85 Mb      10.85 Mb            13            --  
                                          aten::random_     

  warn("Profiler won't be using warmup, this can skew profiler results")


In [None]:
import torch
import torch.profiler
import torchvision.models as models

# Step 1: Set up the model and dummy input data
model = models.resnet18(pretrained=False)  # Using a ResNet18 model
model.eval()  # Set the model to evaluation (inference) mode
input_data = torch.randn(1, 3, 224, 224)  # Dummy input with batch size 1, image size 224x224

# Step 2: Define the profiler with layer-wise profiling and output to TensorBoard
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],  # Profile on CPU and GPU (if available)
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3,repeat=1),  # Profile immediately (no warmup, active for 1 step)
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log'),  # Save traces to TensorBoard for visualization
    record_shapes=True,  # Record shapes of the input tensors
    profile_memory=True,  # Track memory usage
    with_stack=True  # Capture stack traces to see which parts of the code are running
) as profiler:
    # Step 3: Run the inference
    with torch.no_grad():  # No need to compute gradients during inference
        for _ in range(10):  # Run 10 inference steps
            output = model(input_data)
            profiler.step()  # Step the profiler after each iteration

# Step 4: Visualize or print profiling results
print(profiler.key_averages().table(sort_by="cpu_time_total", row_limit=10))


---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                    ProfilerStep*         5.71%      10.747ms        99.98%     188.321ms      62.774ms       1.54 Mb     -57.62 Mb             3  
                     aten::conv2d         0.19%     362.201us        74.50%     140.323ms       2.339ms      28.42 Mb           0 b            60  
                aten::convolution         0.24%     456.707us        74.30%     139.961ms       2.333ms      28.42 Mb           0 b            60  
               aten::_convolution         0.31%     581.752us        74.06%     139.504ms       2.325ms      28.

In [None]:
!tensorboard --logdir=./log

2024-09-10 07:46:42.662334: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 07:46:42.682614: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 07:46:42.688587: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 07:46:42.703320: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1725954405.181061    2914 cuda_ex

In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Define a simple Feedforward Neural Network (FFN)
class FFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Function to profile the forward pass on CPU/GPU
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=False,
        with_flops=True  # Enable FLOP counting
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch for simplicity
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()

    # Print profiling results (sorted by CUDA time to capture copies and forward pass)
    print(profiler.key_averages().table(
        sort_by="cuda_time_total"))

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['FFN', 'CNN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            model = FFN(input_size=100, hidden_size=50, output_size=10)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN on CPU...
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total KFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         0.41%      13.053us         0.41%      13.053us      13.053us           8 b           8 b             1            --  
                                          aten::random_         1.30%      41.431us         1.30%      41.431us      41.431us           0 b           0 b             1            --  
                                             aten::item 

In [None]:
# Python program to explain os.sched_setaffinity() method

# importing os module
import os

# Get the number of CPUs
# in the system
# using os.cpu_count() method
print("Number of CPUs:", os.cpu_count())

# Get the set of CPUs
# on which the calling process
# is eligible to run. using
# os.sched_getaffinity() method
# 0 as PID represents the
# calling process
pid = 0
affinity = os.sched_getaffinity(pid)

# Print the result
print("Process is eligible to run on:", affinity)




Number of CPUs: 2
Process is eligible to run on: {0, 1}


In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Define a simple Feedforward Neural Network (FFN)
class FFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Function to profile the forward pass
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=True,      # Enable stack traces to trace back to source code
        with_flops=True
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch for simplicity
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()

    # Print the profiling results
    # print(profiler.key_averages(group_by_input_shape = True, group_by_stack_n = 1).table(
    #     sort_by="cuda_time_total"))
    # print(profiler.action_map)

    # Print stack trace if needed
    # for event in profiler.key_averages():
    #     print(f"\nStack Trace for {event.key}:")
    #     print(event.stack)
    #     print("-" * 80)

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['FFN', 'CNN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            model = FFN(input_size=100, hidden_size=50, output_size=10)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN on CPU...
{(<ProfilerAction.NONE: 0>, <ProfilerAction.NONE: 0>): [], (<ProfilerAction.NONE: 0>, <ProfilerAction.WARMUP: 1>): [<bound method _KinetoProfile.prepare_trace of <torch.profiler.profiler.profile object at 0x7e9eaadc7b20>>], (<ProfilerAction.NONE: 0>, <ProfilerAction.RECORD: 2>): [<bound method _KinetoProfile.prepare_trace of <torch.profiler.profiler.profile object at 0x7e9eaadc7b20>>, <bound method _KinetoProfile.start_trace of <torch.profiler.profiler.profile object at 0x7e9eaadc7b20>>], (<ProfilerAction.NONE: 0>, <ProfilerAction.RECORD_AND_SAVE: 3>): [<bound method _KinetoProfile.prepare_trace of <torch.profiler.profiler.profile object at 0x7e9eaadc7b20>>, <bound method _KinetoProfile.start_trace of <torch.profiler.profiler.profile object at 0x7e9eaadc7b20>>], (<ProfilerAction.WARMUP: 1>, <ProfilerAction.NONE: 0>): [functools.partial(<built-in function warn>, 'Incorrect schedule: WARMUP followed by NONE'), <bound method _KinetoProfile.start_trace of <torch.pro

In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Function to profile the forward pass and print all recorded events
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=True,      # Enable stack traces to trace back to source code
        with_flops=True
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch for simplicity
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()

    # Print each event recorded in the profiler
    for event in profiler.events():
        print(f"Name: {event.name}, CPU Time: {event.cpu_time_total:.4f}us, "
              f"CUDA Time: {event.cuda_time_total:.4f}us, "
              f"Memory Used: {event.self_cpu_memory_usage} bytes")
        # Optionally, print the stack trace for each event
        print(f"Stack Trace for {event.name}: {event.stack}")
        print("-" * 80)

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['FFN', 'CNN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with 6 hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=2)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN on CPU...
Name: aten::empty, CPU Time: 13.9270us, CUDA Time: 0.0000us, Memory Used: 8 bytes
Stack Trace for aten::empty: []
--------------------------------------------------------------------------------
Name: aten::random_, CPU Time: 13.7860us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::random_: []
--------------------------------------------------------------------------------
Name: aten::item, CPU Time: 10.9340us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::item: []
--------------------------------------------------------------------------------
Name: aten::_local_scalar_dense, CPU Time: 3.1510us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::_local_scalar_dense: []
--------------------------------------------------------------------------------
Name: [memory], CPU Time: 0.0000us, CUDA Time: 0.0000us, Memory Used: -8 bytes
Stack Trace for [memory]: []
-----------------------------------------------------------

  f"CUDA Time: {event.cuda_time_total:.4f}us, "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Stack Trace for aten::as_strided: []
--------------------------------------------------------------------------------
Name: aten::select, CPU Time: 2.6950us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::select: []
--------------------------------------------------------------------------------
Name: aten::as_strided, CPU Time: 0.3910us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::as_strided: []
--------------------------------------------------------------------------------
Name: aten::select, CPU Time: 2.7080us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::select: []
--------------------------------------------------------------------------------
Name: aten::as_strided, CPU Time: 0.4420us, CUDA Time: 0.0000us, Memory Used: 0 bytes
Stack Trace for aten::as_strided: []
--------------------------------------------------------------------------------
Name: aten::selec

In [None]:
import torch
import torch.profiler
import torchvision.models as models
import pandas as pd

# Step 1: Set up the model and dummy input data
model = models.resnet18(pretrained=False)  # Using a ResNet18 model
model.eval()  # Set the model to evaluation (inference) mode
input_data = torch.randn(1, 3, 224, 224)  # Dummy input with batch size 1, image size 224x224

# Step 2: Create a dictionary to map each layer type to a unique index
layer_counts = {}

# Helper function to generate unique layer names
def get_unique_layer_name(layer):
    layer_type = layer.__class__.__name__
    if layer_type not in layer_counts:
        layer_counts[layer_type] = 0
    layer_counts[layer_type] += 1
    return f"{layer_type}-{layer_counts[layer_type]}"

# Step 3: Define the profiler with layer-wise profiling
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],  # Profile on CPU and GPU (if available)
    schedule=torch.profiler.schedule(wait=0, warmup=0, active=1),  # Profile immediately (no warmup, active for 1 step)
    record_shapes=True,  # Record shapes of the input tensors
    profile_memory=True,  # Track memory usage
    with_stack=True  # Capture stack traces to see which parts of the code are running
) as profiler:
    # Step 4: Register hooks to assign unique names to each layer during the forward pass
    layer_names = []
    outputs = []

    def hook_fn(module, input, output):
        unique_name = get_unique_layer_name(module)
        layer_names.append(unique_name)
        outputs.append(profiler.key_averages().table(sort_by="cuda_time_total"))

    # Attach the hook to each module (layer)
    hooks = []
    for layer in model.modules():
        if len(list(layer.children())) == 0:  # Leaf modules only
            hooks.append(layer.register_forward_hook(hook_fn))

    # Step 5: Run the inference
    with torch.no_grad():
        for _ in range(10):  # Run 10 inference steps
            output = model(input_data)
            profiler.step()  # Step the profiler after each iteration

    # Remove all hooks
    for hook in hooks:
        hook.remove()

# Step 6: Collect and print the profiling results in a tabular format
data = {
    'Layer Name': [],
    'CPU Time (ms)': [],
    'CUDA Time (ms)': [],
    'Memory (MB)': []
}

for idx, row in enumerate(profiler.key_averages()):
    data['Layer Name'].append(layer_names[idx])
    data['CPU Time (ms)'].append(row.cpu_time_total / 1000)  # Convert from ns to ms
    data['CUDA Time (ms)'].append(row.cuda_time_total / 1000)  # Convert from ns to ms
    data['Memory (MB)'].append(row.self_cpu_memory_usage / (1024 * 1024))  # Convert from bytes to MB

# Convert to a Pandas DataFrame for better display
df = pd.DataFrame(data)

import ace_tools as tools; tools.display_dataframe_to_user(name="Layer Profiling Results", dataframe=df)


RuntimeError: Profiler didn't finish running

In [None]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Function to format and print profiler events in a table
def print_profiler_events(profiler):
    # Prepare header
    print(f"{'Name':<40} {'Self CPU (us)':<15} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'Self CPU Mem (bytes)':<20} {'Self CUDA Mem (bytes)':<20}")
    print("-" * 150)

    # Initialize cumulative sums for CPU/GPU times and memory
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    # Iterate through each event and print the results
    for event in profiler.events():
        cpu_time = event.self_cpu_time_total
        cuda_time = event.cuda_time_total
        cpu_mem = event.self_cpu_memory_usage or 0
        cuda_mem = event.self_cuda_memory_usage or 0

        # Add to cumulative sums
        total_cpu_time += cpu_time
        total_cuda_time += cuda_time
        total_cpu_mem += cpu_mem
        total_cuda_mem += cuda_mem

        # Print each event in tabular format
        print(f"{event.name:<40} {cpu_time:<15.2f} {event.cpu_time_total:<15.2f} {cuda_time:<15.2f} {cpu_mem:<20} {cuda_mem:<20}")

    # Print cumulative totals
    print("-" * 150)
    print(f"{'Total':<40} {total_cpu_time:<15.2f} {'-':<15} {total_cuda_time:<15.2f} {total_cpu_mem:<20} {total_cuda_mem:<20}")
    print("-" * 150)

# Function to profile the forward pass and print all recorded events in tabular format
def profile_layer_execution(model, device, dataloader):
    model.to(device)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,  # Enable memory profiling
        with_stack=False,      # Enable stack traces to trace back to source code
        with_flops=True
    ) as profiler:
        for step, (inputs, targets) in enumerate(dataloader):
            if step >= 1:
                break  # Limit to one batch for simplicity
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Synchronize CUDA operations before stepping profiler
            if device == 'cuda':
                torch.cuda.synchronize()

    # Print the profiling events in a table format
    print_profiler_events(profiler)

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['FFN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with 6 hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=2)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN on CPU...
Name                                     Self CPU (us)   CPU Total (us)  CUDA Total (us) Self CPU Mem (bytes) Self CUDA Mem (bytes)
------------------------------------------------------------------------------------------------------------------------------------------------------
aten::empty                              9.77            9.77            0.00            8                    0                   
aten::random_                            11.53           11.53           0.00            0                    0                   
aten::item                               6.12            9.60            0.00            0                    0                   
aten::_local_scalar_dense                3.48            3.48            0.00            0                    0                   
[memory]                                 0.00            0.00            0.00            -8                   0                   
enumerate(DataLoader)#_SingleProcessDa

  cuda_time = event.cuda_time_total
  cuda_mem = event.self_cuda_memory_usage or 0


Name                                     Self CPU (us)   CPU Total (us)  CUDA Total (us) Self CPU Mem (bytes) Self CUDA Mem (bytes)
------------------------------------------------------------------------------------------------------------------------------------------------------
aten::empty                              12.05           12.05           0.00            8                    0                   
aten::random_                            12.15           12.15           0.00            0                    0                   
aten::item                               3.67            6.61            0.00            0                    0                   
aten::_local_scalar_dense                2.94            2.94            0.00            0                    0                   
[memory]                                 0.00            0.00            0.00            -8                   0                   
enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__ 56