## Description

This is one of the first notebooks written after our meeting.
In this notebook, I created the node classes and experimented with the Profiler
methods for getting the complete profile of the forward pass.

The Node class and its other functionality are also tested here.


---
**Note:** There were many .ipynb notebooks in which I tested and developed each of the classes.

In [None]:
import os
import torch
from typing import Callable, Any
import threading
import queue


class Node:
    """
    Represents an immutable computational node in the system.
    A node is defined by its CPU cores and GPU device, and tasks are strictly executed on this compute.

    This parallel version assigns a dedicated worker thread and a queue of tasks to each node.
    Tasks submitted to one node run one at a time, in submission order, on that worker thread.
    """

    def __init__(self, node_id: str, cpus=None, gpu=None):
        """
        Initialize a Node and start its worker thread.
        :param node_id: Unique identifier for the node (e.g., CPU-0, GPU-0-CPU-1).
        :param cpus: List of CPU cores assigned to this node.
        :param gpu: GPU device ID assigned to this node (if any).
        """
        self._node_id = node_id
        self._cpus = tuple(cpus or [])  # Immutable tuple of CPU cores
        self._gpu = gpu  # Immutable GPU device ID
        # Affinity mask reported for pid 0 at construction time; restored after every task
        self._original_affinity = os.sched_getaffinity(0)
        # CUDA device that was current before the running task switched to self._gpu
        self._previous_gpu = None

        # Task queue: tuples of (task_function, args, kwargs, result_queue)
        self._task_queue = queue.Queue()
        # Records that stop() was requested; the worker itself only stops on the sentinel
        self._stop_signal = False

        # Worker thread for parallel task execution
        self._worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
        self._worker_thread.start()

    @property
    def node_id(self):
        """Get the node's unique identifier."""
        return self._node_id

    @property
    def cpus(self):
        """Get the CPU cores assigned to the node."""
        return self._cpus

    @property
    def gpu(self):
        """Get the GPU device ID assigned to the node (if any)."""
        return self._gpu

    def assign_task(self, task_function: Callable, *args, **kwargs) -> queue.Queue:
        """
        Assign a task to this node to be executed on its compute resources.
        Returns a queue from which the caller can retrieve the task's result.

        If the task raises an Exception, the exception object itself is placed on the
        result queue instead of a return value; callers should check for that.

        :param task_function: The function representing the task to execute.
        :param args: Positional arguments for the task function.
        :param kwargs: Keyword arguments for the task function.
        :return: A result queue (size 1) to get the task's result.
        """
        result_queue = queue.Queue(maxsize=1)
        self._task_queue.put((task_function, args, kwargs, result_queue))
        return result_queue

    def stop(self):
        """
        Signal the node to stop processing tasks and terminate its worker thread.
        Processes any already queued tasks before stopping, then blocks until the
        worker thread has exited.
        """
        self._stop_signal = True
        # The sentinel is queued behind all pending tasks, so they are drained first
        self._task_queue.put(None)
        self._worker_thread.join()

    def _worker_loop(self):
        """
        Worker thread loop:
        Continuously fetches tasks from the queue and executes them.
        Stops only upon encountering the sentinel (None) so that every task queued
        before stop() still delivers a result to its caller.
        """
        while True:
            item = self._task_queue.get()
            if item is None:
                # Sentinel: everything queued before stop() has been processed
                break
            task_function, args, kwargs, result_queue = item
            try:
                try:
                    self._set_context()
                    result = task_function(*args, **kwargs)
                finally:
                    self._reset_context()
            except Exception as e:
                # Covers failures in the task and in context setup/teardown, so the
                # caller always receives something and the worker stays alive
                result = e

            # Return the result (or the exception) to the caller
            result_queue.put(result)

    def _set_context(self):
        """Set the CPU/GPU context for task execution."""
        if self._gpu is not None:
            # Remember the device we are switching away from so it can be restored
            self._previous_gpu = torch.cuda.current_device()
            torch.cuda.set_device(self._gpu)
        if self._cpus:
            os.sched_setaffinity(0, self._cpus)

    def _reset_context(self):
        """Reset the CPU/GPU context to the state recorded before the task ran."""
        os.sched_setaffinity(0, self._original_affinity)
        if self._previous_gpu is not None:
            torch.cuda.set_device(self._previous_gpu)
            self._previous_gpu = None

    @staticmethod
    def discover_nodes():
        """
        Discover system resources and create a set of nodes.
        Each CPU core and GPU resource combination forms a distinct node.
        Every created node starts its own worker thread.
        :return: List of Node objects.
        """
        nodes = []
        # os.cpu_count() may return None when the count cannot be determined
        cpu_cores = os.cpu_count() or 1
        available_gpus = torch.cuda.device_count()  # Total number of GPUs

        # Create CPU-only nodes: one node per CPU core
        for core_id in range(cpu_cores):
            nodes.append(Node(node_id=f"CPU-{core_id}", cpus=[core_id]))

        # Create GPU-accelerated nodes: pair each GPU with each CPU core
        for gpu_id in range(available_gpus):
            for core_id in range(cpu_cores):
                nodes.append(Node(node_id=f"GPU-{gpu_id}-CPU-{core_id}", cpus=[core_id], gpu=gpu_id))

        return nodes

    def __repr__(self):
        return f"Node(node_id={self._node_id}, cpus={self._cpus}, gpu={self._gpu})"


In [None]:
import time
import random
# from parallel_node import Node  # Adjust this import based on your directory structure


def simulated_work(duration: float, node_info: str) -> str:
    """
    Stand-in workload: block the calling thread for 'duration' seconds.

    :param duration: How long to sleep, in seconds.
    :param node_info: Label of the node running the task; echoed in the summary.
    :return: A one-line summary naming the node and the requested duration.
    """
    time.sleep(duration)
    summary = f"Node {node_info} completed work in {duration:.2f}s"
    return summary


def main_1():
    """
    Demo: discover all nodes, run one simulated task per node in parallel and report timings.

    Each node gets a task of random duration between 0.5 and 3.0 seconds. Because every node
    has its own worker thread, the total elapsed time should be close to the longest single
    task rather than the sum of all tasks. All nodes are stopped before returning, even if
    collecting the results fails.
    """
    # Discover all available nodes
    nodes = Node.discover_nodes()

    if not nodes:
        print("No nodes discovered. Check your system configuration.")
        return

    print(f"Discovered {len(nodes)} nodes:")
    for n in nodes:
        print(n)

    try:
        # Tasks start running as soon as they are queued, so the clock starts before dispatch
        start_time = time.time()

        # Assign one task to every node
        result_queues = []
        for node in nodes:
            duration = random.uniform(0.5, 3.0)
            result_q = node.assign_task(simulated_work, duration=duration, node_info=node.node_id)
            result_queues.append((node, result_q))

        # Retrieve results from all nodes; each get() blocks until that node finishes its task
        results = [(node, q.get()) for node, q in result_queues]

        end_time = time.time()

        # Print all results
        print("\nResults:")
        for node, res in results:
            print(f"{node.node_id}: {res}")

        elapsed = end_time - start_time
        print(f"All tasks completed in {elapsed:.2f}s")
    finally:
        # Stop all nodes cleanly, whether or not the run above succeeded
        for node in nodes:
            node.stop()


# The entry-point guard below is commented out, so main_1() runs whenever this code is executed.
# if __name__ == "__main__":
main_1()


Discovered 4 nodes:
Node(node_id=CPU-0, cpus=(0,), gpu=None)
Node(node_id=CPU-1, cpus=(1,), gpu=None)
Node(node_id=GPU-0-CPU-0, cpus=(0,), gpu=0)
Node(node_id=GPU-0-CPU-1, cpus=(1,), gpu=0)

Results:
CPU-0: Node CPU-0 completed work in 1.09s
CPU-1: Node CPU-1 completed work in 2.86s
GPU-0-CPU-0: Node GPU-0-CPU-0 completed work in 0.92s
GPU-0-CPU-1: Node GPU-0-CPU-1 completed work in 2.29s
All tasks completed in 2.87s


In [None]:
import os
import torch
import torch.nn as nn
import torch.profiler
import pandas as pd
from typing import Optional
from collections import defaultdict
import re
# from collections import defaultdict

class Profiler:
    """
    A profiler that operates in two modes:
    - init: Gathers full per-layer profiling (including memory) and updates ProfileDB.
    - runtime: Gathers lightweight timing data and logs it, without updating ProfileDB.

    Forward hooks + record_function are used to identify and measure each layer's performance.
    """

    def __init__(self, mode: str, profile_db_path: str = 'profiling_results.csv', log_dir: str = 'logs'):
        assert mode in ['init', 'runtime'], "mode must be 'init' or 'runtime'"
        self.mode = mode
        self.profile_db_path = profile_db_path
        self.log_dir = log_dir

        os.makedirs(log_dir, exist_ok=True)

        # Columns for ProfileDB
        db_columns = [
            'Model', 'Layer', 'Compute', 'Self CPU (us)', 'CPU Total (us)',
            'CUDA Total (us)', 'Self CPU Mem (bytes)', 'Self CUDA Mem (bytes)',
            'Total Execution Time (us)', 'Total Memory Used (bytes)'
        ]
        # Load or create ProfileDB
        if os.path.exists(self.profile_db_path):
            self.profile_db = pd.read_csv(self.profile_db_path)
        else:
            self.profile_db = pd.DataFrame(columns=db_columns)

        # Runtime results CSV
        self.runtime_csv = os.path.join(self.log_dir, 'runtime_results.csv')
        if not os.path.exists(self.runtime_csv):
            rt_columns = ['Model', 'Layer', 'Compute', 'Execution Time (us)']
            pd.DataFrame(columns=rt_columns).to_csv(self.runtime_csv, index=False)

    def _register_hooks(self, model: nn.Module):
        """
        Register forward hooks to mark each layer with record_function.
        """
        def hook_wrapper(layer_name):
            def hook(module, input, output):
                with torch.profiler.record_function(layer_name):
                    pass
            return hook

        for idx, (name, layer) in enumerate(model.named_modules()):
            if not isinstance(layer, nn.Sequential) and not isinstance(layer, nn.ModuleList) and layer != model:
                layer.register_forward_hook(hook_wrapper(f"{name}_{idx}"))

    def profile_model(self, model: nn.Module, dataloader, node, model_name: str, warmup_iters: int = 5):
        """
        Profile the model on the given node. Init mode updates the DB, runtime mode logs only.
        """
        def profiling_task():
            device = torch.device(f"cuda:{node.gpu}" if node.gpu is not None and torch.cuda.is_available() else "cpu")
            model.to(device)

            if self.mode == 'init':
                # Warm-up in init mode
                with torch.no_grad():
                    count = 0
                    for inputs, targets in dataloader:
                        inputs, targets = inputs.to(device), targets.to(device)
                        model(inputs)
                        count += 1
                        if count >= warmup_iters:
                            break

                self._profile_init(model, dataloader, node, model_name, device)
            else:
                self._profile_runtime(model, dataloader, node, model_name, device)

        # Run the profiling task on the node's thread
        result_queue = node.assign_task(profiling_task)
        result_queue.get()  # Wait for task completion

    def _profile_init(self, model, dataloader, node, model_name, device):
        self._register_hooks(model)
        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
            profile_memory=True
            # with_modules=True
        ) as profiler:
            with torch.no_grad():
                for step, (inputs, targets) in enumerate(dataloader):
                    if step >= 1:
                        break  # Profile only one batch for simplicity
                    inputs, targets = inputs.to(device), targets.to(device)
                    model(inputs)
                    profiler.step()

        stats = self._process_profiler_events(profiler, model, node, runtime=False)
        # print(stats.keys())
        self._update_profile_db(stats, model_name, node, runtime=False)

    def _profile_runtime(self, model, dataloader, node, model_name, device):
        # Runtime profiling: no DB update, only log to runtime_results.csv
        self._register_hooks(model)

        with torch.no_grad():
            for step, (inputs, targets) in enumerate(dataloader):
                if step >= 1:
                    break  # Profile only one batch for simplicity
                inputs, targets = inputs.to(device), targets.to(device)

                with torch.profiler.profile(
                    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
                    # with_modules=True
                ) as profiler:
                    model(inputs)
                    profiler.step()

                stats = self._process_profiler_events(profiler, model, node, runtime=True)
                print(stats.keys())
                self._append_runtime_csv(stats, model_name, node)

# from collections import defaultdict

    def get_granular_layer_names(self, model):
        """
        Extract only the most granular (leaf) layers from the model.
        """
        granular_layers = {}

        def is_leaf_layer(layer):
            return len(list(layer.children())) == 0  # A leaf layer has no child modules

        for name, module in model.named_modules():
            if is_leaf_layer(module) and name:  # Exclude the root module (empty name)
                granular_layers[name] = module

        return granular_layers


    # def _process_profiler_events(self, profiler, model, node, runtime=False):
    #     """
    #     Processes profiler events, keeping only granular layers and aggregating their data.
    #     """
    #     # Step 1: Extract granular layers
    #     granular_layers = {}

    #     def is_leaf_layer(layer):
    #         return len(list(layer.children())) == 0  # A leaf layer has no child modules

    #     for name, module in model.named_modules():
    #         if is_leaf_layer(module) and name:  # Exclude the root module (empty name)
    #             granular_layers[name] = module

    #     # Step 2: Initialize Aggregated Stats
    #     aggregated_stats = {
    #         'forward_pass': {
    #             'self_cpu_time_total': 0,
    #             'cpu_time_total': 0,
    #             'cuda_time_total': 0,
    #             'self_cpu_memory_usage': 0,
    #             'self_cuda_memory_usage': 0,
    #             'compute': node.node_id
    #         },
    #         'misc': {
    #             'self_cpu_time_total': 0,
    #             'cpu_time_total': 0,
    #             'cuda_time_total': 0,
    #             'self_cpu_memory_usage': 0,
    #             'self_cuda_memory_usage': 0,
    #             'compute': node.node_id
    #         }
    #     }

    #     unmatched_events = []  # For debugging purposes
    #     # profile_events = profiler.profiler.events()
    #     # Step 3: Process Events
    #     profiler_events = [event for event in profiler.events()]
    #     print(profiler_events)
    #     for event in profiler_events:
    #         matched = False
    #         print("***",event.name)

    #         # Handle Root-Level Forward Pass
    #         if event.name == "":
    #             for key in aggregated_stats['forward_pass']:
    #                 aggregated_stats['forward_pass'][key] += getattr(event, key, 0)
    #             matched = True

    #         # Match Granular Layers Only
    #         for name in granular_layers:
    #             if event.name == name or f"{name}_" in event.name:
    #                 if name not in aggregated_stats:
    #                     aggregated_stats[name] = {
    #                         'self_cpu_time_total': 0,
    #                         'cpu_time_total': 0,
    #                         'cuda_time_total': 0,
    #                         'self_cpu_memory_usage': 0,
    #                         'self_cuda_memory_usage': 0,
    #                         'compute': node.node_id
    #                     }

    #                 # Aggregate Stats for This Layer
    #                 aggregated_stats[name]['self_cpu_time_total'] += event.self_cpu_time_total
    #                 aggregated_stats[name]['cpu_time_total'] += event.cpu_time_total
    #                 aggregated_stats[name]['cuda_time_total'] += event.device_time_total

    #                 if not runtime:
    #                     aggregated_stats[name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
    #                     aggregated_stats[name]['self_cuda_memory_usage'] += event.self_device_memory_usage

    #                 # Aggregate into Forward Pass
    #                 aggregated_stats['forward_pass']['self_cpu_time_total'] += event.self_cpu_time_total
    #                 aggregated_stats['forward_pass']['cpu_time_total'] += event.cpu_time_total
    #                 aggregated_stats['forward_pass']['cuda_time_total'] += event.device_time_total

    #                 if not runtime:
    #                     aggregated_stats['forward_pass']['self_cpu_memory_usage'] += event.self_cpu_memory_usage
    #                     aggregated_stats['forward_pass']['self_cuda_memory_usage'] += event.self_device_memory_usage

    #                 matched = True
    #                 break  # Avoid duplicate matches

    #         # Handle Unmatched Events
    #         if not matched:
    #             for key in aggregated_stats['misc']:
    #                 aggregated_stats['misc'][key] += getattr(event, key, 0)
    #             unmatched_events.append(event)

    #     return aggregated_stats




# import re
# from collections import defaultdict
# import torch.nn as nn

    # def _process_profiler_events(self, profiler, model, node, runtime=False):
    #     """
    #     A profiling-aggregation function that:
    #       1) Strips underscore-suffixes from events (e.g., `relu._7` -> `relu`)
    #       2) Puts recognized named modules into separate rows
    #       3) Lumps everything else (including `aten::`, `[memory]`, etc.) into `misc`
    #       4) If an empty-name (root) event `""` is found, it goes to `forward_pass`
    #     """

    #     # ---------------------------------------------
    #     # 1. Build a set of recognized layer names
    #     #    E.g., all named_modules in the model
    #     #    You could refine this to only leaf layers if you prefer.
    #     # ---------------------------------------------
    #     recognized_layers = set()
    #     for name, module in model.named_modules():
    #         if name:  # skip root module with empty name
    #             recognized_layers.add(name)

    #     # Optionally, add top-level "input_layer", "output_layer", etc. if they are not children
    #     # recognized_layers.update(["input_layer", "output_layer", "relu", "hidden_layers.0", ...])
    #     # ^ Only if needed. Usually model.named_modules() is enough.

    #     # ---------------------------------------------
    #     # 2. Prepare final aggregated stats placeholders
    #     # ---------------------------------------------
    #     aggregated_stats = {
    #         'forward_pass': {
    #             'self_cpu_time_total': 0,
    #             'cpu_time_total': 0,
    #             'cuda_time_total': 0,
    #             'self_cpu_memory_usage': 0,
    #             'self_cuda_memory_usage': 0,
    #             'compute': node.node_id
    #         },
    #         'misc': {
    #             'self_cpu_time_total': 0,
    #             'cpu_time_total': 0,
    #             'cuda_time_total': 0,
    #             'self_cpu_memory_usage': 0,
    #             'self_cuda_memory_usage': 0,
    #             'compute': node.node_id
    #         }
    #     }

    #     # ---------------------------------------------
    #     # 3. Helper to strip underscore/dot + digits
    #     #    e.g., "relu._7" -> "relu", "hidden_layers.0_3" -> "hidden_layers.0"
    #     # ---------------------------------------------
    #     def strip_underscore_suffix(layer_name: str) -> str:
    #         return re.sub(r'(\.|_)\d+$', '', layer_name)

    #     # ---------------------------------------------
    #     # 4. Iterate over all profiler events
    #     # ---------------------------------------------
    #     for event in profiler.events():

    #         # (A) Root-level event: name == ""
    #         if event.name == "":
    #             # Accumulate to forward_pass
    #             aggregated_stats['forward_pass']['self_cpu_time_total'] += event.self_cpu_time_total
    #             aggregated_stats['forward_pass']['cpu_time_total'] += event.cpu_time_total
    #             aggregated_stats['forward_pass']['cuda_time_total'] += event.device_time_total
    #             if not runtime:
    #                 aggregated_stats['forward_pass']['self_cpu_memory_usage'] += event.self_cpu_memory_usage
    #                 aggregated_stats['forward_pass']['self_cuda_memory_usage'] += event.self_device_memory_usage
    #             continue

    #         # (B) Otherwise, see if we have a recognized layer
    #         base_name = strip_underscore_suffix(event.name)  # e.g. "hidden_layers.0_3" -> "hidden_layers.0"

    #         if base_name in recognized_layers:
    #             # If we haven't seen this base_name in aggregated_stats, init it
    #             if base_name not in aggregated_stats:
    #                 aggregated_stats[base_name] = {
    #                     'self_cpu_time_total': 0,
    #                     'cpu_time_total': 0,
    #                     'cuda_time_total': 0,
    #                     'self_cpu_memory_usage': 0,
    #                     'self_cuda_memory_usage': 0,
    #                     'compute': node.node_id
    #                 }

    #             # Accumulate stats here
    #             aggregated_stats[base_name]['self_cpu_time_total'] += event.self_cpu_time_total
    #             aggregated_stats[base_name]['cpu_time_total'] += event.cpu_time_total
    #             aggregated_stats[base_name]['cuda_time_total'] += event.device_time_total

    #             if not runtime:
    #                 aggregated_stats[base_name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
    #                 aggregated_stats[base_name]['self_cuda_memory_usage'] += event.self_device_memory_usage

    #         else:
    #             # (C) Not recognized => goes to misc
    #             aggregated_stats['misc']['self_cpu_time_total'] += event.self_cpu_time_total
    #             aggregated_stats['misc']['cpu_time_total'] += event.cpu_time_total
    #             aggregated_stats['misc']['cuda_time_total'] += event.device_time_total
    #             if not runtime:
    #                 aggregated_stats['misc']['self_cpu_memory_usage'] += event.self_cpu_memory_usage
    #                 aggregated_stats['misc']['self_cuda_memory_usage'] += event.self_device_memory_usage

    #     return aggregated_stats

# import re

    def _process_profiler_events(self, profiler, model, node, runtime=False):
        """
        A profiling-aggregation function that:
          1) Strips underscore-suffixes from events (e.g., `relu._7` -> `relu`)
          2) Puts recognized named modules into separate rows
          3) Lumps everything else (including `aten::`, `[memory]`, etc.) into `misc`
          4) If an empty-name (root) event "" is found, that becomes 'forward_pass'.
            Otherwise, we sum all recognized layers + misc into 'forward_pass'.
        """

        recognized_layers = set()
        for name, module in model.named_modules():
            if name:  # skip the empty root name
                recognized_layers.add(name)

        # Prepare final stats with placeholders
        aggregated_stats = {
            'forward_pass': {
                'self_cpu_time_total': 0,
                'cpu_time_total': 0,
                'cuda_time_total': 0,
                'self_cpu_memory_usage': 0,
                'self_cuda_memory_usage': 0,
                'compute': node.node_id
            },
            'misc': {
                'self_cpu_time_total': 0,
                'cpu_time_total': 0,
                'cuda_time_total': 0,
                'self_cpu_memory_usage': 0,
                'self_cuda_memory_usage': 0,
                'compute': node.node_id
            }
        }

        # Helper to remove underscore/dot + digits at the end
        def strip_underscore_suffix(layer_name: str) -> str:
            return re.sub(r'(\.|_)\d+$', '', layer_name)

        found_root_event = False

        # 1. Collect events
        events = list(profiler.events())

        # 2. Process each event
        for event in events:
            # (A) Root-level event name == ""
            if event.name == "":
                found_root_event = True  # Mark that we saw a real root event
                aggregated_stats['forward_pass']['self_cpu_time_total'] += event.self_cpu_time_total
                aggregated_stats['forward_pass']['cpu_time_total'] += event.cpu_time_total
                aggregated_stats['forward_pass']['cuda_time_total'] += event.device_time_total
                if not runtime:
                    aggregated_stats['forward_pass']['self_cpu_memory_usage'] += event.self_cpu_memory_usage
                    aggregated_stats['forward_pass']['self_cuda_memory_usage'] += event.self_device_memory_usage
                continue

            # (B) Otherwise, see if the stripped name is recognized
            base_name = strip_underscore_suffix(event.name)

            if base_name in recognized_layers:
                if base_name not in aggregated_stats:
                    aggregated_stats[base_name] = {
                        'self_cpu_time_total': 0,
                        'cpu_time_total': 0,
                        'cuda_time_total': 0,
                        'self_cpu_memory_usage': 0,
                        'self_cuda_memory_usage': 0,
                        'compute': node.node_id
                    }
                # Accumulate stats
                aggregated_stats[base_name]['self_cpu_time_total'] += event.self_cpu_time_total
                aggregated_stats[base_name]['cpu_time_total'] += event.cpu_time_total
                aggregated_stats[base_name]['cuda_time_total'] += event.device_time_total
                if not runtime:
                    aggregated_stats[base_name]['self_cpu_memory_usage'] += event.self_cpu_memory_usage
                    aggregated_stats[base_name]['self_cuda_memory_usage'] += event.self_device_memory_usage

            else:
                # (C) Lump all else into misc
                aggregated_stats['misc']['self_cpu_time_total'] += event.self_cpu_time_total
                aggregated_stats['misc']['cpu_time_total'] += event.cpu_time_total
                aggregated_stats['misc']['cuda_time_total'] += event.device_time_total
                if not runtime:
                    aggregated_stats['misc']['self_cpu_memory_usage'] += event.self_cpu_memory_usage
                    aggregated_stats['misc']['self_cuda_memory_usage'] += event.self_device_memory_usage

        # 3. If we did NOT find a root event, sum recognized layers + misc => forward_pass
        if not found_root_event:
            # Collect all keys except forward_pass and misc themselves
            all_layer_keys = [k for k in aggregated_stats.keys() if k not in ('forward_pass', 'misc')]

            # Sum recognized layers
            for layer_key in all_layer_keys:
                aggregated_stats['forward_pass']['self_cpu_time_total'] += aggregated_stats[layer_key]['self_cpu_time_total']
                aggregated_stats['forward_pass']['cpu_time_total'] += aggregated_stats[layer_key]['cpu_time_total']
                aggregated_stats['forward_pass']['cuda_time_total'] += aggregated_stats[layer_key]['cuda_time_total']
                if not runtime:
                    aggregated_stats['forward_pass']['self_cpu_memory_usage'] += aggregated_stats[layer_key]['self_cpu_memory_usage']
                    aggregated_stats['forward_pass']['self_cuda_memory_usage'] += aggregated_stats[layer_key]['self_cuda_memory_usage']

            # Sum misc
            aggregated_stats['forward_pass']['self_cpu_time_total'] += aggregated_stats['misc']['self_cpu_time_total']
            aggregated_stats['forward_pass']['cpu_time_total'] += aggregated_stats['misc']['cpu_time_total']
            aggregated_stats['forward_pass']['cuda_time_total'] += aggregated_stats['misc']['cuda_time_total']
            if not runtime:
                aggregated_stats['forward_pass']['self_cpu_memory_usage'] += aggregated_stats['misc']['self_cpu_memory_usage']
                aggregated_stats['forward_pass']['self_cuda_memory_usage'] += aggregated_stats['misc']['self_cuda_memory_usage']

        return aggregated_stats


    def _update_profile_db(self, stats, model_name, node, runtime=False):
        """
        Updates the ProfileDB with new profiling stats.
        """
        if runtime:
            # No DB update in runtime mode
            return

        for layer_name, data in stats.items():
            total_time = data['cpu_time_total'] + data['cuda_time_total']
            total_mem = data['self_cpu_memory_usage'] + data['self_cuda_memory_usage']

            row = {
                'Model': model_name,
                'Layer': layer_name,
                'Compute': data['compute'],
                'Self CPU (us)': data['self_cpu_time_total'],
                'CPU Total (us)': data['cpu_time_total'],
                'CUDA Total (us)': data['cuda_time_total'],
                'Self CPU Mem (bytes)': data['self_cpu_memory_usage'],
                'Self CUDA Mem (bytes)': data['self_cuda_memory_usage'],
                'Total Execution Time (us)': total_time,
                'Total Memory Used (bytes)': total_mem
            }

            # Add or update the entry in the database
            self.profile_db = self._update_or_add_record(self.profile_db, row)

        # Debugging: Print ProfileDB before saving
        # print("Updated ProfileDB:\n", self.profile_db.to_string(index=False))

        # Save the updated ProfileDB to the file
        self.profile_db.to_csv(self.profile_db_path, index=False)
        print(f"Profiling for model {model_name} completed. Data saved to {self.profile_db_path}.")


    def _update_or_add_record(self, df: pd.DataFrame, row: dict):
        mask = (df['Model'] == row['Model']) & \
               (df['Layer'] == row['Layer']) & \
               (df['Compute'] == row['Compute'])

        if not df[mask].empty:
            # Update existing record if new metrics are greater
            existing_time = df.loc[mask, 'Total Execution Time (us)'].max()
            existing_mem = df.loc[mask, 'Total Memory Used (bytes)'].max()

            update_needed = (
                row['Total Execution Time (us)'] > existing_time
            )

            if update_needed:
                for k, v in row.items():
                    df.loc[mask, k] = v
        else:
            # Insert new record
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

        return df

    def _append_runtime_csv(self, stats, model_name, node):
        new_rows = []
        for layer_name, data in stats.items():
            exec_time = data['cpu_time_total'] + data['cuda_time_total']
            new_rows.append({
                'Model': model_name,
                'Layer': layer_name,
                'Compute': data['compute'],
                'Execution Time (us)': exec_time
            })
        if new_rows:
            rt_df = pd.read_csv(self.runtime_csv)
            rt_df = pd.concat([rt_df, pd.DataFrame(new_rows)], ignore_index=True)
            rt_df.to_csv(self.runtime_csv, index=False)
            print(f"Runtime profiling data appended to {self.runtime_csv}.")

    def get_profile_db(self) -> pd.DataFrame:
        """Return the ProfileDB DataFrame held by this profiler (the object itself, not a copy)."""
        return self.profile_db

    def print_profile_db(self):
        if self.profile_db.empty:
            print("ProfileDB is empty.")
        else:
            print("ProfileDB:\n", self.profile_db.to_string(index=False))

    def print_runtime_log(self, model_name: str, node_id: str):
        # Filter runtime CSV by model and node
        if not os.path.exists(self.runtime_csv):
            print("No runtime log CSV found.")
            return
        rt_df = pd.read_csv(self.runtime_csv)
        filtered = rt_df[(rt_df['Model'] == model_name) & (rt_df['Compute'] == node_id)]
        if filtered.empty:
            print(f"No runtime logs found for {model_name} on {node_id}")
        else:
            print(f"Runtime Logs for {model_name} on {node_id}:\n", filtered.to_string(index=False))


In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from typing import Optional
import torch.nn.functional as F
import torchvision.models as models


# Assuming Node and Profiler classes are already imported.


# 🏗️ Extended Feedforward Neural Network (FFN)
class ExtendedFFN(nn.Module):
    """
    Feedforward network: an input projection, a stack of equal-width hidden
    linear layers, and an output projection, with ReLU after every layer
    except the last.
    """

    def __init__(self, input_size, hidden_size, output_size, num_hidden_layers=4):
        """
        :param input_size: Number of input features.
        :param hidden_size: Width of the input projection and of every hidden layer.
        :param output_size: Number of output features.
        :param num_hidden_layers: How many hidden_size -> hidden_size layers to stack.
        """
        super().__init__()

        self.input_layer = nn.Linear(input_size, hidden_size)
        hidden_stack = [nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)]
        self.hidden_layers = nn.ModuleList(hidden_stack)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through input layer, hidden stack and output layer; no activation on the output."""
        x = self.relu(self.input_layer(x))

        # Each hidden layer is followed by the shared ReLU module.
        for layer in self.hidden_layers:
            x = self.relu(layer(x))

        return self.output_layer(x)


# 🏗️ CNN Block for Complex CNN
class CNNBlock(nn.Module):
    """
    Two (3x3 conv -> batch norm -> ReLU) stages followed by a 2x2 max pool.

    The convolutions use stride 1 and padding 1, so only the final pooling
    step changes the spatial size (it halves height and width).
    """

    def __init__(self, in_channels, out_channels):
        """
        :param in_channels: Channels of the incoming feature map.
        :param out_channels: Channels produced by both convolutions.
        """
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        layers = [
            nn.Conv2d(in_channels, out_channels, **conv_kwargs),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, **conv_kwargs),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the sequential block to ``x``."""
        out = self.block(x)
        return out


# 🏗️ Complex CNN
class ComplexCNN(nn.Module):
    """
    Three CNNBlock stages (1 -> 16 -> 32 -> 64 channels) followed by two
    fully connected layers producing 10 outputs.

    The first linear layer expects 64 * 3 * 3 features, which is what three
    halvings of a 28x28 single-channel input yield (28 -> 14 -> 7 -> 3).
    """

    def __init__(self):
        super().__init__()
        self.block1 = CNNBlock(1, 16)
        self.block2 = CNNBlock(16, 32)
        self.block3 = CNNBlock(32, 64)
        flat_features = 64 * 3 * 3
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Run the conv stages, flatten per sample, then apply fc1 -> ReLU -> fc2."""
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        flat = x.view(x.size(0), -1)  # keep the batch dimension, merge the rest
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)


# 🏗️ ResNet18 for Image Classification
class CustomResNet18(nn.Module):
    """
    torchvision ResNet-18 adapted to single-channel input and a configurable
    number of output classes.
    """

    def __init__(self, num_classes=10):
        """
        :param num_classes: Output size of the replacement final linear layer.
        """
        super().__init__()
        # resnet18() already builds an untrained network by default. The
        # explicit `pretrained=False` keyword is deprecated in newer
        # torchvision releases (replaced by `weights`), so it is omitted to
        # stay warning-free on both old and new versions.
        self.resnet = models.resnet18()
        # Replace the stem convolution so the network accepts 1 input channel.
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Replace the classifier head, reusing the backbone's feature width.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        """Delegate to the wrapped ResNet."""
        return self.resnet(x)


# 🛠️ Function to Run Profiling on a Node
def run_profiling_on_node(profiler, model, dataloader, node, model_name, run_type):
    """
    Print a banner for one profiling run, then profile the model on the node.

    :param profiler: Object exposing ``profile_model(model, dataloader, node, model_name)``.
    :param model: Model handed through to the profiler.
    :param dataloader: Data source handed through to the profiler.
    :param node: Target node; its ``node_id`` is shown in the banner.
    :param model_name: Name shown in the banner and handed to the profiler.
    :param run_type: Label for the banner; printed upper-cased.
    """
    rule = '-' * 60
    print("\n" + rule)
    print(f"{run_type.upper()} PROFILING: Model '{model_name}' on Node '{node.node_id}'")
    print(rule)
    profiler.profile_model(model, dataloader, node, model_name)


# 🚀 Main Function
def create_DB(*, init_runs=3, batch_size=8,
              profile_db_path='profiling_results.csv', log_dir='logs'):
    """
    Profile every configured model on every discovered node.

    Builds random datasets, discovers nodes via ``Node.discover_nodes()``,
    runs ``init_runs`` rounds of 'init' profiling followed by one round of
    'runtime' profiling, and prints the resulting ProfileDB and runtime logs.
    Returns early (after printing a message) when no nodes are found.

    :param init_runs: Number of times the init-profiling sweep is repeated.
    :param batch_size: Batch size of every DataLoader.
    :param profile_db_path: CSV path shared by the init and runtime profilers.
    :param log_dir: Log directory passed to both profilers.
    """

    def section(title):
        # Banner separating the major phases in the printed output.
        print(f"\n{'=' * 80}")
        print(title)
        print(f"{'=' * 80}")

    def random_dataset(*sample_shape, num_samples=32, num_classes=10):
        # Random inputs of the given per-sample shape with random integer labels.
        return TensorDataset(
            torch.randn(num_samples, *sample_shape),
            torch.randint(0, num_classes, (num_samples,)),
        )

    # One dataset per model name; 32 samples each, labels in 0-9.
    datasets = {
        "SimpleFFN": random_dataset(28 * 28),        # flattened 28x28 images
        "ExtendedFFN": random_dataset(28 * 28),      # same layout as SimpleFFN
        "ComplexCNN": random_dataset(1, 28, 28),     # 1 channel, 28x28 images
        "ResNet18": random_dataset(1, 224, 224),     # 1 channel, 224x224 images
    }
    dataloaders = {
        name: DataLoader(dataset, batch_size=batch_size)
        for name, dataset in datasets.items()
    }

    # Named `model_registry` rather than `models` so the imported
    # `torchvision.models` module is not shadowed inside this function.
    # NOTE: ComplexCNN has a dataset above but is currently left out of the sweep.
    model_registry = {
        "ExtendedFFN": ExtendedFFN(input_size=28 * 28, hidden_size=256, output_size=10, num_hidden_layers=3),
        "ResNet18": CustomResNet18(num_classes=10),
    }

    nodes = Node.discover_nodes()
    if not nodes:
        print("No nodes found! Exiting.")
        return

    print(f"Discovered {len(nodes)} nodes:")
    for node in nodes:
        print(node)

    # Both profilers are pointed at the same ProfileDB path and log directory.
    init_profiler = Profiler(mode='init', profile_db_path=profile_db_path, log_dir=log_dir)
    runtime_profiler = Profiler(mode='runtime', profile_db_path=profile_db_path, log_dir=log_dir)

    # Repeat the init sweep so every (model, node) pair is profiled several times.
    for _ in range(init_runs):
        for model_name, model in model_registry.items():
            for node in nodes:
                run_profiling_on_node(init_profiler, model, dataloaders[model_name], node, model_name, 'init')

    section("FINAL INIT PROFILEDB:")
    init_profiler.print_profile_db()

    section("RUNTIME PROFILING ON ALL MODELS AND NODES")
    for model_name, model in model_registry.items():
        for node in nodes:
            run_profiling_on_node(runtime_profiler, model, dataloaders[model_name], node, model_name, 'runtime')

    section("FINAL RUNTIME LOGS:")
    for model_name in model_registry:
        for node in nodes:
            runtime_profiler.print_runtime_log(model_name, node.node_id)


# Entry point for this cell. The `if __name__ == "__main__":` guard is
# commented out, so the full profiling sweep runs as soon as this code is executed.
create_DB()




Discovered 4 nodes:
Node(node_id=CPU-0, cpus=(0,), gpu=None)
Node(node_id=CPU-1, cpus=(1,), gpu=None)
Node(node_id=GPU-0-CPU-0, cpus=(0,), gpu=0)
Node(node_id=GPU-0-CPU-1, cpus=(1,), gpu=0)

------------------------------------------------------------
INIT PROFILING: Model 'ExtendedFFN' on Node 'CPU-0'
------------------------------------------------------------


  df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)


Profiling for model ExtendedFFN completed. Data saved to profiling_results.csv.

------------------------------------------------------------
INIT PROFILING: Model 'ExtendedFFN' on Node 'CPU-1'
------------------------------------------------------------
Profiling for model ExtendedFFN completed. Data saved to profiling_results.csv.

------------------------------------------------------------
INIT PROFILING: Model 'ExtendedFFN' on Node 'GPU-0-CPU-0'
------------------------------------------------------------
Profiling for model ExtendedFFN completed. Data saved to profiling_results.csv.

------------------------------------------------------------
INIT PROFILING: Model 'ExtendedFFN' on Node 'GPU-0-CPU-1'
------------------------------------------------------------
Profiling for model ExtendedFFN completed. Data saved to profiling_results.csv.

------------------------------------------------------------
INIT PROFILING: Model 'ResNet18' on Node 'CPU-0'
--------------------------------

  rt_df = pd.concat([rt_df, pd.DataFrame(new_rows)], ignore_index=True)


dict_keys(['forward_pass', 'misc', 'resnet.conv1', 'resnet.bn1', 'resnet.relu', 'resnet.maxpool', 'resnet.layer1.0.conv1', 'resnet.layer1.0.bn1', 'resnet.layer1.0.relu', 'resnet.layer1.0.conv2', 'resnet.layer1.0.bn2', 'resnet.layer1.0', 'resnet.layer1.1.conv1', 'resnet.layer1.1.bn1', 'resnet.layer1.1.relu', 'resnet.layer1.1.conv2', 'resnet.layer1.1.bn2', 'resnet.layer1.1', 'resnet.layer2.0.conv1', 'resnet.layer2.0.bn1', 'resnet.layer2.0.relu', 'resnet.layer2.0.conv2', 'resnet.layer2.0.bn2', 'resnet.layer2.0.downsample.0', 'resnet.layer2.0.downsample.1', 'resnet.layer2.0', 'resnet.layer2.1.conv1', 'resnet.layer2.1.bn1', 'resnet.layer2.1.relu', 'resnet.layer2.1.conv2', 'resnet.layer2.1.bn2', 'resnet.layer2.1', 'resnet.layer3.0.conv1', 'resnet.layer3.0.bn1', 'resnet.layer3.0.relu', 'resnet.layer3.0.conv2', 'resnet.layer3.0.bn2', 'resnet.layer3.0.downsample.0', 'resnet.layer3.0.downsample.1', 'resnet.layer3.0', 'resnet.layer3.1.conv1', 'resnet.layer3.1.bn1', 'resnet.layer3.1.relu', 'resnet