In [3]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu1(x)
        x = self.pool1(x)           # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu2(x)
        x = self.pool2(x)           # Pooling Layer 2
        x = self.flatten(x)         # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu3(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64


In [4]:
# Function to wrap forward methods of each layer
def wrap_forward_methods(model):
    for idx, (name, module) in enumerate(model.named_modules()):
        # Avoid wrapping the model itself
        if module != model:
            original_forward = module.forward
            layer_name = f"{name}_{idx}"  # Include numbering for uniqueness

            def wrapped_forward(*inputs, original_forward=original_forward, layer_name=layer_name, **kwargs):
                with torch.profiler.record_function(layer_name):
                    return original_forward(*inputs, **kwargs)

            module.forward = wrapped_forward

# Function to profile the model layer by layer
def profile_model_layer_by_layer(model, device, dataloader):
    model.to(device)
    wrap_forward_methods(model)

    # Warm-up
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            model(inputs)
            break  # Only need one batch for warm-up

    # Set up the profiler
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=False,
        profile_memory=True
    ) as profiler:
        # Run one forward pass
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                model(inputs)
                break  # Only need one batch

    # Process the profiler events
    process_profiler_events(profiler)

# Function to process profiler events and print per-layer metrics
def process_profiler_events(profiler):
    # Prepare header
    print(f"{'Layer Name':<40} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'CPU Mem (bytes)':<15} {'CUDA Mem (bytes)':<15}")
    print("-" * 100)

    # Aggregate stats per layer
    aggregated_stats = {}

    for event in profiler.events():
        # Only consider events that we recorded with record_function (i.e., our layers)
        if event.name.startswith("enumerate(DataLoader"):
            continue  # Skip DataLoader events
        if event.name in aggregated_stats:
            # Aggregate stats
            aggregated_stats[event.name]['cpu_time_total'] += event.cpu_time_total
            aggregated_stats[event.name]['cuda_time_total'] += event.cuda_time_total
            aggregated_stats[event.name]['cpu_memory_usage'] += event.cpu_memory_usage
            aggregated_stats[event.name]['cuda_memory_usage'] += event.cuda_memory_usage
        else:
            aggregated_stats[event.name] = {
                'cpu_time_total': event.cpu_time_total,
                'cuda_time_total': event.cuda_time_total,
                'cpu_memory_usage': event.cpu_memory_usage,
                'cuda_memory_usage': event.cuda_memory_usage
            }

    # Print the aggregated stats
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    for layer_name, stats in aggregated_stats.items():
        total_cpu_time += stats['cpu_time_total']
        total_cuda_time += stats['cuda_time_total']
        total_cpu_mem += stats['cpu_memory_usage']
        total_cuda_mem += stats['cuda_memory_usage']
        print(f"{layer_name:<40} {stats['cpu_time_total']:<15.2f} {stats['cuda_time_total']:<15.2f} {stats['cpu_memory_usage']:<15} {stats['cuda_memory_usage']:<15}")

    print("-" * 100)
    print(f"{'Total':<40} {total_cpu_time:<15.2f} {total_cuda_time:<15.2f} {total_cpu_mem:<15} {total_cuda_mem:<15}")
    print("-" * 100)


In [5]:
# def main():
#     for model_type in ['FFN', 'CNN']:
#         print(f"Profiling {model_type} layer by layer...")
#         # Get the model and data
#         if model_type == 'FFN':
#             model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
#         else:
#             model = CNN()

#         inputs, targets, batch_size = get_data_for_model(model_type)
#         dataset = TensorDataset(inputs, targets)
#         dataloader = DataLoader(dataset, batch_size=batch_size)

#         # Profile on CPU
#         profile_model_layer_by_layer(model, device='cpu', dataloader=dataloader)

#         # Profile on GPU if available
#         if torch.cuda.is_available():
#             print(f"Profiling {model_type} layer by layer on GPU...")
#             profile_model_layer_by_layer(model, device='cuda', dataloader=dataloader)
#         else:
#             print(f"CUDA is not available for {model_type}.")

# if __name__ == "__main__":
#     main()


In [6]:
import time

# Function to profile the whole model
def profile_whole_model(model, device, dataloader):
    model.to(device)

    # Warm-up
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            model(inputs)
            break  # Only need one batch for warm-up

    # Time the forward pass using time module
    with torch.no_grad():
        start_time = time.perf_counter()
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            break  # Only need one batch
        if device == 'cuda':
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        total_wall_time = (end_time - start_time) * 1e6  # Convert to microseconds

    print(f"Wall-clock time for the forward pass: {total_wall_time:.2f} us")

    # Set up the profiler
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=False,
        profile_memory=True
    ) as profiler:
        # Run one forward pass
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                model(inputs)
                break  # Only need one batch

    # Process the profiler events
    process_whole_model_profiler_events(profiler, total_wall_time)

# Function to process profiler events for the whole model
def process_whole_model_profiler_events(profiler, wall_time):
    # Sum up the total CPU and CUDA time and memory usage
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    for event in profiler.events():
        total_cpu_time += event.cpu_time_total
        total_cuda_time += event.cuda_time_total
        total_cpu_mem += event.cpu_memory_usage
        total_cuda_mem += event.cuda_memory_usage

    print("-" * 60)
    print(f"{'Metric':<20} {'CPU':<15} {'CUDA':<15}")
    print("-" * 60)
    print(f"{'Total Time (us)':<20} {total_cpu_time:<15.2f} {total_cuda_time:<15.2f}")
    print(f"{'Total Memory (bytes)':<20} {total_cpu_mem:<15} {total_cuda_mem:<15}")
    print("-" * 60)
    total_profiler_time = total_cpu_time + total_cuda_time
    total_profiler_mem = total_cpu_mem + total_cuda_mem
    print(f"Total Profiler Time: {total_profiler_time:.2f} us")
    print(f"Total Profiler Memory Usage: {total_profiler_mem} bytes")
    print(f"Wall-clock Time: {wall_time:.2f} us")
    print(f"Difference between Profiler Time and Wall-clock Time: {abs(total_profiler_time - wall_time):.2f} us")
    print("-" * 60)


In [7]:
def main():
    for model_type in ['FFN', 'CNN']:
        print(f"Profiling {model_type} layer by layer...")
        # Get the model and data
        if model_type == 'FFN':
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile layer by layer
        profile_model_layer_by_layer(model, device='cpu', dataloader=dataloader)

        # Profile whole model
        print(f"Profiling {model_type} as a whole...")
        profile_whole_model(model, device='cpu', dataloader=dataloader)

        # Profile on GPU if available
        if torch.cuda.is_available():
            print(f"Profiling {model_type} layer by layer on GPU...")
            profile_model_layer_by_layer(model, device='cuda', dataloader=dataloader)

            print(f"Profiling {model_type} as a whole on GPU...")
            profile_whole_model(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN layer by layer...


  warn("CUDA is not available, disabling CUDA profiling")


Layer Name                               CPU Total (us)  CUDA Total (us) CPU Mem (bytes) CUDA Mem (bytes)
----------------------------------------------------------------------------------------------------
aten::empty                              266.00          0.00            8               0              
aten::random_                            24.00           0.00            0               0              
aten::item                               13.00           0.00            0               0              
aten::_local_scalar_dense                5.00            0.00            0               0              
[memory]                                 0.00            0.00            -130568         0              
aten::select                             974.00          0.00            0               0              
aten::as_strided                         227.00          0.00            0               0              
aten::stack                              1514.00         0