## Found a new method to profile the layers of the DNN. Unlike the previous methods ,instead of trying to entirely using the hook-based or profiler with stack trace or module heirachy approach , I was able to profile each layer of the any DNN , entirely by using a method were, I attached a hook for every layer and once that hook was triggere a profiler is run. In this , I step through each step to make sure the profiling is accurate.

In [45]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Function to profile the forward pass and print all recorded events in tabular format
def profile_layer_execution(model, device, dataloader):
    model.to(device)
    layer_profiles = register_hooks(model)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True
    ) as profiler:
        # Run one forward pass through the model
        with torch.no_grad():
            for step, (inputs, targets) in enumerate(dataloader):
                if step >= 1:
                    break  # Limit to one batch for simplicity
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                profiler.step()

    # Print the profiling events for each layer
    print_profiler_events(profiler, model)

# Function to register hooks for capturing profiling data per layer
def register_hooks(model):
    def hook_wrapper(layer_name):
        def hook(module, input, output):
            with torch.profiler.record_function(layer_name):
                pass
        return hook

    for idx, (name, layer) in enumerate(model.named_modules()):
        if not isinstance(layer, nn.Sequential) and not isinstance(layer, nn.ModuleList) and layer != model:
            layer.register_forward_hook(hook_wrapper(f"{name}_{idx}"))

# Function to format and print profiler events in a table
def print_profiler_events(profiler, model):
    # Get all named modules (layers) from the model
    layer_names = dict(model.named_modules())

    # Prepare header
    print(f"{'Layer Name':<40} {'Self CPU (us)':<15} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'Self CPU Mem (bytes)':<20} {'Self CUDA Mem (bytes)':<20}")
    print("-" * 150)

    # Iterate through profiler events to aggregate per layer
    for event in profiler.events():
        for name, module in layer_names.items():
            if name in event.name:
                print(f"{name:<40} {event.self_cpu_time_total:<15.2f} {event.cpu_time_total:<15.2f} {event.cuda_time_total:<15.2f} {event.self_cpu_memory_usage:<20} {event.self_cuda_memory_usage:<20}")

    print("-" * 150)

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['FFN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with specified hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=2)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling FFN on CPU...
Layer Name                               Self CPU (us)   CPU Total (us)  CUDA Total (us) Self CPU Mem (bytes) Self CUDA Mem (bytes)
------------------------------------------------------------------------------------------------------------------------------------------------------
                                         10.35           10.35           0.00            8                    0                   
                                         10.74           10.74           0.00            0                    0                   
                                         3.56            6.88            0.00            0                    0                   
                                         3.31            3.31            0.00            0                    0                   
                                         0.00            0.00            0.00            -8                   0                   
                                      

  print(f"{name:<40} {event.self_cpu_time_total:<15.2f} {event.cpu_time_total:<15.2f} {event.cuda_time_total:<15.2f} {event.self_cpu_memory_usage:<20} {event.self_cuda_memory_usage:<20}")
  print(f"{name:<40} {event.self_cpu_time_total:<15.2f} {event.cpu_time_total:<15.2f} {event.cuda_time_total:<15.2f} {event.self_cpu_memory_usage:<20} {event.self_cuda_memory_usage:<20}")


                                         12.16           12.16           0.00            8                    0                   
                                         11.80           11.80           0.00            0                    0                   
                                         3.62            7.04            0.00            0                    0                   
                                         3.42            3.42            0.00            0                    0                   
                                         0.00            0.00            0.00            -8                   0                   
                                         501.20          1042.41         0.00            0                    0                   
                                         15.74           18.26           0.00            0                    0                   
                                         2.53            2.53            0.00      

## Main Code for Profiling , in this update I added a warmup phase , to make sure that the profiler captures only the excuetion profile and not anything else i.e not the gpu/cpu spin-up time,basically this means that it reduces the overhead associated with starting up processes. I also corrected the aggregation function over here, to correctly include all the data

In [46]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset

# Function to profile the forward pass and print all recorded events in tabular format
def profile_layer_execution(model, device, dataloader,warmup_iters=5):
    model.to(device)
    layer_profiles = register_hooks(model)

    # Warmup phase to stabilize the model and GPU (if applicable)
    with torch.no_grad():
        for _ in range(warmup_iters):
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                model(inputs)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True
    ) as profiler:
        # Run one forward pass through the model
        with torch.no_grad():
            for step, (inputs, targets) in enumerate(dataloader):
                if step >= 1:
                    break  # Limit to one batch for simplicity
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                profiler.step()

    # Print the profiling events for each layer
    print_profiler_events(profiler, model)

# Function to register hooks for capturing profiling data per layer
def register_hooks(model):
    def hook_wrapper(layer_name):
        def hook(module, input, output):
            with torch.profiler.record_function(layer_name):
                pass
        return hook

    for idx, (name, layer) in enumerate(model.named_modules()):
        if not isinstance(layer, nn.Sequential) and not isinstance(layer, nn.ModuleList) and layer != model:
            layer.register_forward_hook(hook_wrapper(f"{name}_{idx}"))

# Function to format and print profiler events in a table
def print_profiler_events(profiler, model):
    # Get all named modules (layers) from the model
    layer_names = dict(model.named_modules())

    # Prepare header
    print(f"{'Layer Name':<40} {'Self CPU (us)':<15} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'Self CPU Mem (bytes)':<20} {'Self CUDA Mem (bytes)':<20}")
    print("-" * 150)

    # Dictionary to aggregate profiler events per layer
    aggregated_stats = {}

    # Iterate through profiler events to aggregate per layer
    for event in profiler.events():
        for name, module in layer_names.items():
            if name in event.name:
                if name not in aggregated_stats:
                    aggregated_stats[name] = {
                        'self_cpu_time_total': 0,
                        'cpu_time_total': 0,
                        'cuda_time_total': 0,
                        'self_cpu_memory_usage': 0,
                        'self_cuda_memory_usage': 0
                    }
                aggregated_stats[name]['self_cpu_time_total'] += event.self_cpu_time_total
                aggregated_stats[name]['cpu_time_total'] += event.cpu_time_total
                aggregated_stats[name]['cuda_time_total'] += event.device_time_total  # Updated to use device_time_total
                aggregated_stats[name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
                aggregated_stats[name]['self_cuda_memory_usage'] += event.self_device_memory_usage  # Updated to use self_device_memory_usage

    # Print aggregated stats per layer
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    for name, stats in aggregated_stats.items():
        total_cpu_time += stats['cpu_time_total']
        total_cuda_time += stats['cuda_time_total']
        total_cpu_mem += stats['self_cpu_memory_usage']
        total_cuda_mem += stats['self_cuda_memory_usage']
        print(f"{name:<40} {stats['self_cpu_time_total']:<15.2f} {stats['cpu_time_total']:<15.2f} {stats['cuda_time_total']:<15.2f} {stats['self_cpu_memory_usage']:<20} {stats['self_cuda_memory_usage']:<20}")

    print("-" * 150)
    print(f"{'Total':<40} {total_cpu_time:<15.2f} {'-':<15} {total_cuda_time:<15.2f} {total_cpu_mem:<20} {total_cuda_mem:<20}")
    print("-" * 150)
    print(f"Total Execution Time (CPU + GPU): {total_cpu_time + total_cuda_time:.2f} us")
    print(f"Total Memory Used (CPU + GPU): {total_cpu_mem + total_cuda_mem} bytes")
    print("-" * 150)

# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['CNN', 'FFN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with specified hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling CNN on CPU...
Layer Name                               Self CPU (us)   CPU Total (us)  CUDA Total (us) Self CPU Mem (bytes) Self CUDA Mem (bytes)
------------------------------------------------------------------------------------------------------------------------------------------------------
                                         24909.26        60206.97        0.00            203776               0                   
conv2                                    50.19           5590.66         0.00            0                    0                   
conv1                                    36.21           36.21           0.00            0                    0                   
relu                                     102.60          986.58          0.00            0                    0                   
pool                                     15858.66        31601.18        0.00            2408448              0                   
fc1                                   

## This the code for csv file with the update functionality.

## In this added the code to import the results to a global csv file which acts as profiling db. I also included the functionality to update the each layer if the excuetion time/memory increases thus and updating each layer accordingly in the global db as given by the DART paper. Remember this profiling function would be used during the resource allocation and stage decompostion algoirithm and then in the runtime phase be used as monitoring scirpt to update the global db(the csv file),and use the runtime enforcement logic as discussed in the paper.

#### **Note - The excuetion time increase is priotized over the memory increase and therefore the row(the record) is updated in only two conditions, if the exucetion time is greater or the memory usage is higher and the excuetion time is lower as well

In [51]:
import torch
import torch.nn as nn
import torch.profiler
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import os

# Global dataframe to store profiling results
profiling_db = pd.DataFrame(columns=[
    'Model', 'Layer Name','Compute', 'Self CPU (us)', 'CPU Total (us)', 'CUDA Total (us)',
    'Self CPU Mem (bytes)', 'Self CUDA Mem (bytes)', 'Total Execution Time (us)',
    'Total Memory Used (bytes)',   # Added 'Compute' column
])


# Function to profile the forward pass and print all recorded events in tabular format
def profile_layer_execution(model, device, dataloader, model_name, profiler_output_csv='model_profiling_results.csv', warmup_iters=5):
    global profiling_db
    model.to(device)
    layer_profiles = register_hooks(model)

    # Warm-up phase to stabilize the model and GPU (if applicable)
    with torch.no_grad():
        for _ in range(warmup_iters):
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                model(inputs)

    # If the CSV exists, load it into the global dataframe
    if os.path.exists(profiler_output_csv):
        profiling_db = pd.read_csv(profiler_output_csv)

    # Use PyTorch profiler to capture execution time and memory usage
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        record_shapes=False,
        profile_memory=True
    ) as profiler:
        # Run one forward pass through the model
        with torch.no_grad():
            for step, (inputs, targets) in enumerate(dataloader):
                if step >= 1:
                    break  # Limit to one batch for simplicity
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                profiler.step()

    # Print the profiling events for each layer
    aggregated_stats = print_profiler_events(profiler, model,device)

    # Extract profiling data and save to global profile DB
    profile_and_save(aggregated_stats, model_name, profiler_output_csv)

# Function to register hooks for capturing profiling data per layer
def register_hooks(model):
    def hook_wrapper(layer_name):
        def hook(module, input, output):
            with torch.profiler.record_function(layer_name):
                pass
        return hook

    for idx, (name, layer) in enumerate(model.named_modules()):
        if not isinstance(layer, nn.Sequential) and not isinstance(layer, nn.ModuleList) and layer != model:
            layer.register_forward_hook(hook_wrapper(f"{name}_{idx}"))

# Function to extract and save profiler data to CSV
def profile_and_save(aggregated_stats, model_name, profiler_output_csv):
    global profiling_db
    # Determine the compute type
    # compute_type = "CPU+GPU" if "cuda" in device else "CPU"

    # if "" in aggregated_stats.keys():
    # aggregated_stats["new_key"] = aggregated_stats.pop("")
    # Convert aggregated stats into a DataFrame
    profile_data = {
        'Model': [],
        'Layer Name': [],
        'Self CPU (us)': [],
        'CPU Total (us)': [],
        'CUDA Total (us)': [],
        'Self CPU Mem (bytes)': [],
        'Self CUDA Mem (bytes)': [],
        'Total Execution Time (us)': [],
        'Total Memory Used (bytes)': [],
        'Compute': []
    }

    for layer_name, stats in aggregated_stats.items():
        profile_data['Model'].append(model_name)
        profile_data['Layer Name'].append(layer_name)
        profile_data['Self CPU (us)'].append(stats['self_cpu_time_total'])
        profile_data['CPU Total (us)'].append(stats['cpu_time_total'])
        profile_data['CUDA Total (us)'].append(stats['cuda_time_total'])
        profile_data['Self CPU Mem (bytes)'].append(stats['self_cpu_memory_usage'])
        profile_data['Self CUDA Mem (bytes)'].append(stats['self_cuda_memory_usage'])
        profile_data['Total Execution Time (us)'].append(stats['cpu_time_total'] + stats['cuda_time_total'])
        profile_data['Total Memory Used (bytes)'].append(stats['self_cpu_memory_usage'] + stats['self_cuda_memory_usage'])
        profile_data['Compute'].append(stats['compute'])

    new_results_df = pd.DataFrame(profile_data)

    # Update the profiling DB with new data
    updated_layers = []
    #  updated_layers = []
    for index, row in new_results_df.iterrows():
        existing_row = profiling_db[
            (profiling_db['Model'] == row['Model']) &
            (profiling_db['Layer Name'] == row['Layer Name']) &
            (profiling_db['Compute'] == row['Compute'])
        ]

        if not existing_row.empty:
            # Compare and update if the new results are greater
            update_needed = (
                row['Total Execution Time (us)'] > existing_row['Total Execution Time (us)'].values[0] or
                row['Total Memory Used (bytes)'] > existing_row['Total Memory Used (bytes)'].values[0]
            )

            if update_needed:
                profiling_db = profiling_db[
                    ~((profiling_db['Model'] == row['Model']) &
                      (profiling_db['Layer Name'] == row['Layer Name']) &
                      (profiling_db['Compute'] == row['Compute']))
                ]
                profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
                updated_layers.append(row['Layer Name'])
        else:
            # If the layer doesn't exist, add it
            profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
            updated_layers.append(row['Layer Name'])

    # Save the updated profiling results to CSV
    profiling_db.to_csv(profiler_output_csv, index=False)

    # Print which layers were updated
    print(f"Profiling for model {model_name} completed. Data saved to {profiler_output_csv}.")
    if updated_layers:
        print(f"Updated layers: {', '.join(updated_layers)}")
    else:
        print("No layers were updated.")

def print_profiler_events(profiler, model,device):
    # Get all named modules (layers) from the model
    layer_names = dict(model.named_modules())

    # Prepare header
    print(f"{'Layer Name':<40} {'Self CPU (us)':<15} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'Self CPU Mem (bytes)':<20} {'Self CUDA Mem (bytes)':<20} {'Compute':<10}")
    print("-" * 180)

    # Dictionary to aggregate profiler events per layer
    aggregated_stats = {}

    # Iterate through profiler events to aggregate per layer
    for event in profiler.events():
        for name, module in layer_names.items():
            if name in event.name:
                if name not in aggregated_stats:
                    aggregated_stats[name] = {
                        'self_cpu_time_total': 0,
                        'cpu_time_total': 0,
                        'cuda_time_total': 0,
                        'self_cpu_memory_usage': 0,
                        'self_cuda_memory_usage': 0,
                        'compute': "CPU+GPU" if "cuda" in device else "CPU"
                    }
                aggregated_stats[name]['self_cpu_time_total'] += event.self_cpu_time_total
                aggregated_stats[name]['cpu_time_total'] += event.cpu_time_total
                aggregated_stats[name]['cuda_time_total'] += event.device_time_total
                aggregated_stats[name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
                aggregated_stats[name]['self_cuda_memory_usage'] += event.self_device_memory_usage

    # Print aggregated stats per layer
    total_cpu_time = 0
    total_cuda_time = 0
    total_cpu_mem = 0
    total_cuda_mem = 0

    aggregated_stats["misc"] = aggregated_stats.pop("")

    for name, stats in aggregated_stats.items():
        total_cpu_time += stats['cpu_time_total']
        total_cuda_time += stats['cuda_time_total']
        total_cpu_mem += stats['self_cpu_memory_usage']
        total_cuda_mem += stats['self_cuda_memory_usage']
        print(f"{name:<40} {stats['self_cpu_time_total']:<15.2f} {stats['cpu_time_total']:<15.2f} {stats['cuda_time_total']:<15.2f} {stats['self_cpu_memory_usage']:<20} {stats['self_cuda_memory_usage']:<20} {stats['compute']:<10}")

    print("-" * 180)
    print(f"{'Total':<40} {'-':<15} {total_cpu_time:<15.2f} {total_cuda_time:<15.2f} {total_cpu_mem:<20} {total_cuda_mem:<20}")
    print("-" * 180)
    print(f"Total Execution Time (CPU + GPU): {total_cpu_time + total_cuda_time:.2f} us")
    print(f"Total Memory Used (CPU + GPU): {total_cpu_mem + total_cuda_mem} bytes")
    print("-" * 180)

    return aggregated_stats


# Function to select the correct data based on the model type (FFN or CNN)
def get_data_for_model(model_type):
    if model_type == 'FFN':
        # Data for FFN
        input_size = 100
        batch_size = 64
        inputs = torch.randn(batch_size, input_size)
        targets = torch.randint(0, 10, (batch_size,))
        return inputs, targets, batch_size
    elif model_type == 'CNN':
        # Data for CNN
        input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
        inputs = torch.randn(input_shape)
        targets = torch.randint(0, 10, (64,))
        return inputs, targets, 64

# Define an extended Feedforward Neural Network (FFN) with more layers
class ExtendedFFN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        super(ExtendedFFN, self).__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)

        # Apply hidden layers with ReLU activations
        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = self.relu(x)

        x = self.output_layer(x)
        return x

# Define a Convolutional Neural Network (CNN)
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.conv1(x)           # Convolution Layer 1
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 1
        x = self.conv2(x)           # Convolution Layer 2
        x = self.relu(x)
        x = self.pool(x)            # Pooling Layer 2
        x = x.view(-1, 64 * 7 * 7)  # Flatten
        x = self.fc1(x)             # Fully Connected Layer 1
        x = self.relu(x)
        x = self.dropout(x)         # Dropout for regularization
        x = self.fc2(x)             # Fully Connected Layer 2
        return x

# Run profiling on CPU and GPU for both FFN and CNN
def main():
    for model_type in ['CNN', 'FFN']:
        print(f"Profiling {model_type} on CPU...")

        # Get the correct model and data
        if model_type == 'FFN':
            # Using the extended FFN with specified hidden layers
            model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
        else:
            model = CNN()

        inputs, targets, batch_size = get_data_for_model(model_type)
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size)

        # Profile on CPU
        profile_layer_execution(model, device='cpu', dataloader=dataloader, model_name=model_type)

        # Profile on GPU (if available)
        if torch.cuda.is_available():
            print(f"Profiling {model_type} on GPU...")
            profile_layer_execution(model, device='cuda', dataloader=dataloader, model_name=model_type)
        else:
            print(f"CUDA is not available for {model_type}.")

if __name__ == "__main__":
    main()


Profiling CNN on CPU...
Layer Name                               Self CPU (us)   CPU Total (us)  CUDA Total (us) Self CPU Mem (bytes) Self CUDA Mem (bytes) Compute   
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
conv2                                    44.02           5917.60         0.00            0                    0                    CPU       
conv1                                    36.42           36.42           0.00            0                    0                    CPU       
relu                                     168.43          1152.96         0.00            0                    0                    CPU       
pool                                     15597.80        31092.55        0.00            2408448              0                    CPU       
fc1                                      62.98           62.98           0.00       

## Final updated

In [48]:
# import torch
# import torch.nn as nn
# import torch.profiler
# from torch.utils.data import DataLoader, TensorDataset
# import pandas as pd
# import os

# # Set the seed for consistent profiling
# torch.manual_seed(42)

# # Global dataframe to store profiling results
# profiling_db = pd.DataFrame(columns=[
#     'Model', 'Layer Name', 'CPU Total (us)', 'CUDA Total (us)',
#     'CPU Mem (bytes)', 'CUDA Mem (bytes)', 'Total Execution Time (us)',
#     'Total Memory Used (bytes)', 'Compute Node'
# ])

# # Function to profile the forward pass and print all recorded events in tabular format
# def profile_layer_execution(model, device, dataloader, model_name, profiler_output_csv='model_profiling_results.csv', warmup_iters=5):
#     global profiling_db
#     model.to(device)

#     # Warm-up phase to stabilize the model and GPU (if applicable)
#     with torch.no_grad():
#         for _ in range(warmup_iters):
#             for inputs, targets in dataloader:
#                 inputs, targets = inputs.to(device), targets.to(device)
#                 model(inputs)

#     # If the CSV exists, load it into the global dataframe
#     if os.path.exists(profiler_output_csv):
#         profiling_db = pd.read_csv(profiler_output_csv)

#     # Use PyTorch profiler to capture execution time and memory usage
#     with torch.profiler.profile(
#         activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
#         record_shapes=False,
#         profile_memory=True
#     ) as profiler:
#         # Run one forward pass through the model
#         with torch.no_grad():
#             for step, (inputs, targets) in enumerate(dataloader):
#                 if step >= 1:
#                     break  # Limit to one batch for simplicity
#                 inputs, targets = inputs.to(device), targets.to(device)
#                 outputs = model(inputs)
#                 profiler.step()

#     # Print the profiling events for each layer
#     aggregated_stats = print_profiler_events(profiler, model)

#     # Extract profiling data and save to global profile DB
#     profile_and_save(aggregated_stats, model_name, profiler_output_csv, device)

# # Function to extract and save profiler data to CSV
# def profile_and_save(aggregated_stats, model_name, profiler_output_csv, device):
#     global profiling_db

#     # Compute node based on the device
#     compute_node = "CPU+GPU" if "cuda" in device else "CPU"

#     # Convert aggregated stats into a DataFrame
#     profile_data = {
#         'Model': [],
#         'Layer Name': [],
#         'CPU Total (us)': [],
#         'CUDA Total (us)': [],
#         'CPU Mem (bytes)': [],
#         'CUDA Mem (bytes)': [],
#         'Total Execution Time (us)': [],
#         'Total Memory Used (bytes)': [],
#         'Compute Node': []
#     }

#     for layer_name, stats in aggregated_stats.items():
#         # Replace empty layer names with "misc"
#         layer_name = layer_name.strip() if layer_name.strip() else "misc"

#         total_time = stats['cpu_time_total'] + stats['cuda_time_total']
#         total_memory = stats['cpu_memory_usage'] + stats['cuda_memory_usage']

#         profile_data['Model'].append(model_name)
#         profile_data['Layer Name'].append(layer_name)
#         profile_data['CPU Total (us)'].append(stats['cpu_time_total'])
#         profile_data['CUDA Total (us)'].append(stats['cuda_time_total'])
#         profile_data['CPU Mem (bytes)'].append(stats['cpu_memory_usage'])
#         profile_data['CUDA Mem (bytes)'].append(stats['cuda_memory_usage'])
#         profile_data['Total Execution Time (us)'].append(total_time)
#         profile_data['Total Memory Used (bytes)'].append(total_memory)
#         profile_data['Compute Node'].append(compute_node)

#     new_results_df = pd.DataFrame(profile_data)

#     # Update the profiling DB with new data
#     updated_layers = []
#     for index, row in new_results_df.iterrows():
#         existing_row = profiling_db[
#             (profiling_db['Model'] == row['Model']) &
#             (profiling_db['Layer Name'] == row['Layer Name']) &
#             (profiling_db['Compute Node'] == row['Compute Node'])
#         ]

#         if not existing_row.empty:
#             # Compare and update if the new results are greater
#             update_needed = (
#                 row['Total Execution Time (us)'] > existing_row['Total Execution Time (us)'].values[0] or
#                 row['Total Memory Used (bytes)'] > existing_row['Total Memory Used (bytes)'].values[0]
#             )

#             if update_needed:
#                 profiling_db = profiling_db[
#                     ~((profiling_db['Model'] == row['Model']) &
#                       (profiling_db['Layer Name'] == row['Layer Name']) &
#                       (profiling_db['Compute Node'] == row['Compute Node']))
#                 ]
#                 profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
#                 updated_layers.append(row['Layer Name'])
#         else:
#             # If the layer doesn't exist, add it
#             profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
#             updated_layers.append(row['Layer Name'])

#     # Save the updated profiling results to CSV
#     profiling_db.to_csv(profiler_output_csv, index=False)

#     # Print which layers were updated
#     print(f"Profiling for model {model_name} completed. Data saved to {profiler_output_csv}.")
#     if updated_layers:
#         print(f"Updated layers: {', '.join(updated_layers)}")
#     else:
#         print("No layers were updated.")

# # Function to print profiler events in a table format
# def print_profiler_events(profiler, model):
#     # Get all named modules (layers) from the model
#     layer_names = dict(model.named_modules())

#     # Prepare header
#     print(f"{'Layer Name':<40} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'CPU Mem (bytes)':<20} {'CUDA Mem (bytes)':<20}")
#     print("-" * 120)

#     # Dictionary to aggregate profiler events per layer
#     aggregated_stats = {}

#     for event in profiler.events():
#         for name, module in layer_names.items():
#             if name in event.name:
#                 if name not in aggregated_stats:
#                     aggregated_stats[name] = {
#                         'cpu_time_total': 0,
#                         'cuda_time_total': 0,
#                         'cpu_memory_usage': 0,
#                         'cuda_memory_usage': 0
#                     }
#                 aggregated_stats[name]['cpu_time_total'] += event.cpu_time_total
#                 aggregated_stats[name]['cuda_time_total'] += event.device_time_total
#                 aggregated_stats[name]['cpu_memory_usage'] += event.cpu_memory_usage
#                 aggregated_stats[name]['cuda_memory_usage'] += event.device_memory_usage

#     # Print aggregated stats per layer
#     for name, stats in aggregated_stats.items():
#         layer_name = name.strip() if name.strip() else "misc"
#         print(f"{layer_name:<40} {stats['cpu_time_total']:<15.2f} {stats['cuda_time_total']:<15.2f} {stats['cpu_memory_usage']:<20} {stats['cuda_memory_usage']:<20}")

#     print("-" * 120)
#     return aggregated_stats


# # Add your CNN and FFN definitions and the `main()` function as needed.

# # Function to select the correct data based on the model type (FFN or CNN)
# def get_data_for_model(model_type):
#     if model_type == 'FFN':
#         # Data for FFN
#         input_size = 100
#         batch_size = 64
#         inputs = torch.randn(batch_size, input_size)
#         targets = torch.randint(0, 10, (batch_size,))
#         return inputs, targets, batch_size
#     elif model_type == 'CNN':
#         # Data for CNN
#         input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
#         inputs = torch.randn(input_shape)
#         targets = torch.randint(0, 10, (64,))
#         return inputs, targets, 64

# # Define an extended Feedforward Neural Network (FFN) with more layers
# class ExtendedFFN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
#         super(ExtendedFFN, self).__init__()

#         self.input_layer = nn.Linear(input_size, hidden_size)
#         self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
#         self.output_layer = nn.Linear(hidden_size, output_size)
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = self.input_layer(x)
#         x = self.relu(x)

#         # Apply hidden layers with ReLU activations
#         for hidden_layer in self.hidden_layers:
#             x = hidden_layer(x)
#             x = self.relu(x)

#         x = self.output_layer(x)
#         return x

# # Define a Convolutional Neural Network (CNN)
# class CNN(nn.Module):
#     def __init__(self):
#         super(CNN, self).__init__()
#         self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
#         self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
#         self.fc1 = nn.Linear(64 * 7 * 7, 128)
#         self.fc2 = nn.Linear(128, 10)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.conv1(x)           # Convolution Layer 1
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 1
#         x = self.conv2(x)           # Convolution Layer 2
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 2
#         x = x.view(-1, 64 * 7 * 7)  # Flatten
#         x = self.fc1(x)             # Fully Connected Layer 1
#         x = self.relu(x)
#         x = self.dropout(x)         # Dropout for regularization
#         x = self.fc2(x)             # Fully Connected Layer 2
#         return x

# # Run profiling on CPU and GPU for both FFN and CNN
# def main():
#     for model_type in ['CNN', 'FFN']:
#         print(f"Profiling {model_type} on CPU...")

#         # Get the correct model and data
#         if model_type == 'FFN':
#             # Using the extended FFN with specified hidden layers
#             model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
#         else:
#             model = CNN()

#         inputs, targets, batch_size = get_data_for_model(model_type)
#         dataset = TensorDataset(inputs, targets)
#         dataloader = DataLoader(dataset, batch_size=batch_size)

#         # Profile on CPU
#         profile_layer_execution(model, device='cpu', dataloader=dataloader, model_name=model_type)

#         # Profile on GPU (if available)
#         if torch.cuda.is_available():
#             print(f"Profiling {model_type} on GPU...")
#             profile_layer_execution(model, device='cuda', dataloader=dataloader, model_name=model_type)
#         else:
#             print(f"CUDA is not available for {model_type}.")

# if __name__ == "__main__":
#     main()


In [49]:
# import torch
# import torch.nn as nn
# import torch.profiler
# from torch.utils.data import DataLoader, TensorDataset
# import pandas as pd
# import os

# # Global dataframe to store profiling results
# profiling_db = pd.DataFrame(columns=[
#     'Model', 'Layer Name', 'Self CPU (us)', 'CPU Total (us)', 'CUDA Total (us)',
#     'Self CPU Mem (bytes)', 'Self CUDA Mem (bytes)', 'Total Execution Time (us)',
#     'Total Memory Used (bytes)'
# ])

# # Function to profile the forward pass and print all recorded events in tabular format
# def profile_layer_execution(model, device, dataloader, model_name, profiler_output_csv='model_profiling_results.csv', warmup_iters=5):
#     global profiling_db
#     model.to(device)
#     layer_profiles = register_hooks(model)

#     # Warm-up phase to stabilize the model and GPU (if applicable)
#     with torch.no_grad():
#         for _ in range(warmup_iters):
#             for inputs, targets in dataloader:
#                 inputs, targets = inputs.to(device), targets.to(device)
#                 model(inputs)

#     # If the CSV exists, load it into the global dataframe
#     if os.path.exists(profiler_output_csv):
#         profiling_db = pd.read_csv(profiler_output_csv)

#     # Use PyTorch profiler to capture execution time and memory usage
#     with torch.profiler.profile(
#         activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
#         record_shapes=True,
#         profile_memory=True
#     ) as profiler:
#         # Run one forward pass through the model
#         with torch.no_grad():
#             for step, (inputs, targets) in enumerate(dataloader):
#                 if step >= 1:
#                     break  # Limit to one batch for simplicity
#                 inputs, targets = inputs.to(device), targets.to(device)
#                 # Wrap the model call with a profiling context to profile the complete forward pass
#                 with torch.profiler.record_function('model_forward'):
#                     outputs = model(inputs)
#                 profiler.step()

#     # Print the profiling events for each layer
#     aggregated_stats = print_profiler_events(profiler, model)

#     # Extract profiling data and save to global profile DB
#     profile_and_save(aggregated_stats, model_name, profiler_output_csv)


# # Function to register hooks for capturing profiling data per layer using wrapped forward
# def register_hooks(model):
#     # Dictionary to save the original forward methods, in case you want to restore them later
#     original_forwards = {}

#     for idx, (name, layer) in enumerate(model.named_modules()):
#         # Check if the layer is suitable for profiling (skip nn.Sequential, nn.ModuleList, and the main model)
#         if not isinstance(layer, nn.Sequential) and not isinstance(layer, nn.ModuleList) and layer != model:
#             # Avoid re-wrapping layers
#             if hasattr(layer, 'is_wrapped') and layer.is_wrapped:
#                 continue

#             # Save the original forward function
#             original_forward = layer.forward
#             layer_name = f"{name}_{idx}"
#             original_forwards[name] = original_forward

#             # Define a wrapped forward function with default arguments to capture variables
#             def wrapped_forward(*inputs, original_forward=original_forward, layer_name=layer_name, **kwargs):
#                 with torch.profiler.record_function(layer_name):
#                     return original_forward(*inputs, **kwargs)

#             # Replace the original forward function with the wrapped one
#             layer.forward = wrapped_forward
#             layer.is_wrapped = True  # Mark the layer as wrapped to avoid re-wrapping

#     # Return original forwards in case we need to restore them later
#     return original_forwards


# # Function to extract and save profiler data to CSV
# def profile_and_save(aggregated_stats, model_name, profiler_output_csv):
#     global profiling_db

#     # Convert aggregated stats into a DataFrame
#     profile_data = {
#         'Model': [],
#         'Layer Name': [],
#         'Self CPU (us)': [],
#         'CPU Total (us)': [],
#         'CUDA Total (us)': [],
#         'Self CPU Mem (bytes)': [],
#         'Self CUDA Mem (bytes)': [],
#         'Total Execution Time (us)': [],
#         'Total Memory Used (bytes)': []
#     }

#     for layer_name, stats in aggregated_stats.items():
#         profile_data['Model'].append(model_name)
#         profile_data['Layer Name'].append(layer_name)
#         profile_data['Self CPU (us)'].append(stats['self_cpu_time_total'])
#         profile_data['CPU Total (us)'].append(stats['cpu_time_total'])
#         profile_data['CUDA Total (us)'].append(stats['cuda_time_total'])
#         profile_data['Self CPU Mem (bytes)'].append(stats['self_cpu_memory_usage'])
#         profile_data['Self CUDA Mem (bytes)'].append(stats['self_cuda_memory_usage'])
#         profile_data['Total Execution Time (us)'].append(stats['cpu_time_total'] + stats['cuda_time_total'])
#         profile_data['Total Memory Used (bytes)'].append(stats['self_cpu_memory_usage'] + stats['self_cuda_memory_usage'])

#     new_results_df = pd.DataFrame(profile_data)

#     # Update the profiling DB with new data
#     updated_layers = []
#     for index, row in new_results_df.iterrows():
#         existing_row = profiling_db[
#             (profiling_db['Model'] == row['Model']) &
#             (profiling_db['Layer Name'] == row['Layer Name'])
#         ]

#         if not existing_row.empty:
#             # Compare and update if the new results are greater
#             update_needed = (
#                 row['Total Execution Time (us)'] > existing_row['Total Execution Time (us)'].values[0]
#             )

#             if update_needed:
#                 profiling_db = profiling_db[
#                     ~((profiling_db['Model'] == row['Model']) & (profiling_db['Layer Name'] == row['Layer Name']))
#                 ]
#                 profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
#                 updated_layers.append(row['Layer Name'])
#         else:
#             # If the layer doesn't exist, add it
#             profiling_db = pd.concat([profiling_db, pd.DataFrame([row])], ignore_index=True)
#             updated_layers.append(row['Layer Name'])

#     # Save the updated profiling results to CSV
#     profiling_db.to_csv(profiler_output_csv, index=False)

#     # Print which layers were updated
#     print(f"Profiling for model {model_name} completed. Data saved to {profiler_output_csv}.")
#     if updated_layers:
#         print(f"Updated layers: {', '.join(updated_layers)}")
#     else:
#         print("No layers were updated.")


# # Function to print profiler events in a table format
# def print_profiler_events(profiler, model):
#     # Get all named modules (layers) from the model
#     layer_names = dict(model.named_modules())

#     # Include 'model_forward' explicitly
#     # layer_names['model_forward'] = model

#     # Prepare header
#     print(f"{'Layer Name':<40} {'Self CPU (us)':<15} {'CPU Total (us)':<15} {'CUDA Total (us)':<15} {'Self CPU Mem (bytes)':<20} {'Self CUDA Mem (bytes)':<20}")
#     print("-" * 150)

#     # Dictionary to aggregate profiler events per layer
#     aggregated_stats = {}

#     # Iterate through profiler events to aggregate per layer
#     for event in profiler.events():
#         for name in layer_names.keys():
#             if name in event.name:
#                 if name not in aggregated_stats:
#                     aggregated_stats[name] = {
#                         'self_cpu_time_total': 0,
#                         'cpu_time_total': 0,
#                         'cuda_time_total': 0,
#                         'self_cpu_memory_usage': 0,
#                         'self_cuda_memory_usage': 0
#                     }
#                 aggregated_stats[name]['self_cpu_time_total'] += event.self_cpu_time_total
#                 aggregated_stats[name]['cpu_time_total'] += event.cpu_time_total
#                 aggregated_stats[name]['cuda_time_total'] += event.device_time_total  # Use device_time_total for CUDA time
#                 aggregated_stats[name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
#                 aggregated_stats[name]['self_cuda_memory_usage'] += event.self_device_memory_usage  # Use self_device_memory_usage

#     # Print aggregated stats per layer
#     total_cpu_time = 0
#     total_cuda_time = 0
#     total_cpu_mem = 0
#     total_cuda_mem = 0

#     for name, stats in aggregated_stats.items():
#         total_cpu_time += stats['cpu_time_total']
#         total_cuda_time += stats['cuda_time_total']
#         total_cpu_mem += stats['self_cpu_memory_usage']
#         total_cuda_mem += stats['self_cuda_memory_usage']
#         print(f"{name:<40} {stats['self_cpu_time_total']:<15.2f} {stats['cpu_time_total']:<15.2f} {stats['cuda_time_total']:<15.2f} {stats['self_cpu_memory_usage']:<20} {stats['self_cuda_memory_usage']:<20}")

#     print("-" * 150)
#     print(f"{'Total':<40} {total_cpu_time:<15.2f} {'-':<15} {total_cuda_time:<15.2f} {total_cpu_mem:<20} {total_cuda_mem:<20}")
#     print("-" * 150)
#     print(f"Total Execution Time (CPU + GPU): {total_cpu_time + total_cuda_time:.2f} us")
#     print(f"Total Memory Used (CPU + GPU): {total_cpu_mem + total_cuda_mem} bytes")
#     print("-" * 150)

#     return aggregated_stats


# # Function to select the correct data based on the model type (FFN or CNN)
# def get_data_for_model(model_type):
#     if model_type == 'FFN':
#         # Data for FFN
#         input_size = 100
#         batch_size = 64
#         inputs = torch.randn(batch_size, input_size)
#         targets = torch.randint(0, 10, (batch_size,))
#         return inputs, targets, batch_size
#     elif model_type == 'CNN':
#         # Data for CNN
#         input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
#         inputs = torch.randn(input_shape)
#         targets = torch.randint(0, 10, (64,))
#         return inputs, targets, 64

# # Define an extended Feedforward Neural Network (FFN) with more layers
# class ExtendedFFN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
#         super(ExtendedFFN, self).__init__()

#         self.input_layer = nn.Linear(input_size, hidden_size)
#         self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
#         self.output_layer = nn.Linear(hidden_size, output_size)
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = self.input_layer(x)
#         x = self.relu(x)

#         # Apply hidden layers with ReLU activations
#         for hidden_layer in self.hidden_layers:
#             x = hidden_layer(x)
#             x = self.relu(x)

#         x = self.output_layer(x)
#         return x

# # Define a Convolutional Neural Network (CNN)
# class CNN(nn.Module):
#     def __init__(self):
#         super(CNN, self).__init__()
#         self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
#         self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
#         self.fc1 = nn.Linear(64 * 7 * 7, 128)
#         self.fc2 = nn.Linear(128, 10)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.conv1(x)           # Convolution Layer 1
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 1
#         x = self.conv2(x)           # Convolution Layer 2
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 2
#         x = x.view(-1, 64 * 7 * 7)  # Flatten
#         x = self.fc1(x)             # Fully Connected Layer 1
#         x = self.relu(x)
#         x = self.dropout(x)         # Dropout for regularization
#         x = self.fc2(x)             # Fully Connected Layer 2
#         return x

# # Run profiling on CPU and GPU for both FFN and CNN
# def main():
#     for model_type in ['CNN', 'FFN']:
#         print(f"Profiling {model_type} on CPU...")

#         # Get the correct model and data
#         if model_type == 'FFN':
#             # Using the extended FFN with specified hidden layers
#             model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
#         else:
#             model = CNN()

#         inputs, targets, batch_size = get_data_for_model(model_type)
#         dataset = TensorDataset(inputs, targets)
#         dataloader = DataLoader(dataset, batch_size=batch_size)

#         # Profile on CPU
#         profile_layer_execution(model, device='cpu', dataloader=dataloader, model_name=model_type)

#         # Profile on GPU (if available)
#         if torch.cuda.is_available():
#             print(f"Profiling {model_type} on GPU...")
#             profile_layer_execution(model, device='cuda', dataloader=dataloader, model_name=model_type)
#         else:
#             print(f"CUDA is not available for {model_type}.")

# if __name__ == "__main__":
#     main()


In [50]:
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader, TensorDataset
# import torch.profiler

# def profile_layer_execution(model, device, dataloader, model_name,warmup_iters=5):
#     model.to(device)  # Move model to the selected device
#     model.eval()  # Set the model to evaluation mode

#     # Warm-up iterations (to stabilize the device)
#     with torch.no_grad():
#         for _ in range(warmup_iters):
#             for inputs, targets in dataloader:
#                 inputs, targets = inputs.to(device), targets.to(device)
#                 model(inputs)

#     # Start profiling
#     with torch.profiler.profile(
#         activities=[
#             torch.profiler.ProfilerActivity.CPU,
#             torch.profiler.ProfilerActivity.CUDA if device == 'cuda' else None,
#         ],
#         on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log_{model_name}_{device}'),
#         record_shapes=False,
#         profile_memory=True,
#         with_stack=False
#     ) as profiler:
#         with torch.no_grad():
#             for inputs, _ in dataloader:
#                 inputs = inputs.to(device)
#                 _ = model(inputs)
#                 break  # Profile only one batch for brevity

#     # Aggregate and calculate total times
#     prof_aggregates = profiler.key_averages()
#     total_cpu_time = sum([item.cpu_time_total for item in prof_aggregates])
#     total_cuda_time = sum([item.device_time_total for item in prof_aggregates]) if device == 'cuda' else 0
#     total_time = total_cpu_time + total_cuda_time

#     # Print profiler summary
#     print(f"--- Profiling Summary for {model_name} on {device.upper()} ---")
#     print(profiler.key_averages().table(sort_by="cuda_time_total" if device == 'cuda' else "cpu_time_total", row_limit=10))
#     print(f"Total CPU Time (ms): {total_cpu_time / 1e3:.3f}")
#     if device == 'cuda':
#         print(f"Total CUDA Time (ms): {total_cuda_time / 1e3:.3f}")
#     print(f"Total Execution Time (ms): {total_time / 1e3:.3f}")
#     print("------------------------------------------------------------")


# # Function to select the correct data based on the model type (FFN or CNN)
# def get_data_for_model(model_type):
#     if model_type == 'FFN':
#         # Data for FFN
#         input_size = 100
#         batch_size = 64
#         inputs = torch.randn(batch_size, input_size)
#         targets = torch.randint(0, 10, (batch_size,))
#         return inputs, targets, batch_size
#     elif model_type == 'CNN':
#         # Data for CNN
#         input_shape = (64, 1, 28, 28)  # Example input size (like MNIST)
#         inputs = torch.randn(input_shape)
#         targets = torch.randint(0, 10, (64,))
#         return inputs, targets, 64

# # Define an extended Feedforward Neural Network (FFN) with more layers
# class ExtendedFFN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
#         super(ExtendedFFN, self).__init__()

#         self.input_layer = nn.Linear(input_size, hidden_size)
#         self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)])
#         self.output_layer = nn.Linear(hidden_size, output_size)
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = self.input_layer(x)
#         x = self.relu(x)

#         # Apply hidden layers with ReLU activations
#         for hidden_layer in self.hidden_layers:
#             x = hidden_layer(x)
#             x = self.relu(x)

#         x = self.output_layer(x)
#         return x

# # Define a Convolutional Neural Network (CNN)
# class CNN(nn.Module):
#     def __init__(self):
#         super(CNN, self).__init__()
#         self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
#         self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
#         self.fc1 = nn.Linear(64 * 7 * 7, 128)
#         self.fc2 = nn.Linear(128, 10)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.conv1(x)           # Convolution Layer 1
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 1
#         x = self.conv2(x)           # Convolution Layer 2
#         x = self.relu(x)
#         x = self.pool(x)            # Pooling Layer 2
#         x = x.view(-1, 64 * 7 * 7)  # Flatten
#         x = self.fc1(x)             # Fully Connected Layer 1
#         x = self.relu(x)
#         x = self.dropout(x)         # Dropout for regularization
#         x = self.fc2(x)             # Fully Connected Layer 2
#         return x

# # Main function to run profiling
# def main():
#     for model_type in ['CNN', 'FFN']:
#         print(f"Profiling {model_type} on CPU...")

#         # Get the correct model and data
#         if model_type == 'FFN':
#             # Using the extended FFN with specified hidden layers
#             model = ExtendedFFN(input_size=100, hidden_size=50, output_size=10, num_hidden_layers=4)
#         else:
#             model = CNN()

#         inputs, targets, batch_size = get_data_for_model(model_type)
#         dataset = TensorDataset(inputs, targets)
#         dataloader = DataLoader(dataset, batch_size=batch_size)

#         # Profile on CPU
#         profile_layer_execution(model, device='cpu', dataloader=dataloader, model_name=model_type)

#         # Profile on GPU (if available)
#         if torch.cuda.is_available():
#             print(f"Profiling {model_type} on GPU...")
#             profile_layer_execution(model, device='cuda', dataloader=dataloader, model_name=model_type)
#         else:
#             print(f"CUDA is not available for {model_type}.")

# if __name__ == "__main__":
#     main()
