In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import time
from torch.utils.data import DataLoader, TensorDataset
from tabulate import tabulate
from torchinfo import summary

from lib.cnnae_fully_convolutional import createLevel3FullyConvDropoutNet

%config InlineBackend.figure_formats = ['svg']
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# load data
path_to_data = "./data/reference/"

X_test = np.load(f"{path_to_data}X.dat_test.npy")
Y_test = np.load(f"{path_to_data}Y.dat_test.npy")

In [4]:
X_test_tensor = torch.tensor(X_test[:, np.newaxis], dtype=torch.float32, device=device)
Y_test_tensor = torch.tensor(Y_test[:, np.newaxis], dtype=torch.float32, device=device)

In [5]:
# create dataset
test_data = TensorDataset(X_test_tensor, Y_test_tensor)

batchsize = 1024
# batchsize = test_data.tensors[0].shape[0]
test_loader = DataLoader(test_data, batch_size=batchsize, shuffle=True, drop_last=True)

In [6]:
# Create FCNN
class LargeNNv2(nn.Module):
    def __init__(self):
        super(LargeNNv2, self).__init__()
        self.fc1 = nn.Linear(49, 1235)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(1235, 768)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(768, 532)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(532, 149)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(149, 98)
        self.relu5 = nn.ReLU()
        self.fc6 = nn.Linear(98, 49)  # Output layer

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        x = self.relu4(self.fc4(x))
        x = self.relu5(self.fc5(x))
        x = self.fc6(x)
        return x

In [7]:
# Load models
channel_configuration = (15, 24, 33)
fcae, _, _ = createLevel3FullyConvDropoutNet(channel_configuration)

fcae.load_state_dict(
    torch.load(
        "./archive/Level3FullyConvDropoutNet_99.6498/net.pt",
        weights_only=True,
    )
)

fcnn = LargeNNv2()

fcnn.load_state_dict(
    torch.load("./reference/model/collaboratory/m_300.pth", weights_only=True)
)

# fcnn2 = LargeNNv2()

# fcnn2.load_state_dict(
#     torch.load("./reference/model/collaboratory/ms-13-collab.pth", weights_only=True)
# )

<All keys matched successfully>

In [8]:
def cuda_event_measure_per_sample(model, test_loader):
    """
    Misst die durchschnittliche Inferenzzeit pro Datenpunkt eines Modells auf der GPU.

    Args:
        model: Das zu testende Modell (z.B. LargeNNv2 oder Level3FullyConvDropoutNet).
        test_loader: Dataloader mit Testdaten.

    Returns:
        avg_time_per_sample: Durchschnittliche Inferenzzeit pro Datenpunkt in Mikrosekunden.
        runtimes_per_sample: Liste der gemessenen Zeiten pro Datenpunkt.
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    runtimes_per_sample = []

    # Modell vorbereiten
    model.eval()
    model.to(device)
    is_fully_connected = isinstance(model, LargeNNv2)

    with torch.no_grad():
        for _ in range(5):  # Warm-up Runs
            for x_test, _ in test_loader:
                if is_fully_connected:
                    x_test = x_test.view(x_test.size(0), -1).to(device)
                else:
                    x_test = x_test.to(device)
                model(x_test)

        for _ in range(10):  # Messungen
            for x_test, _ in test_loader:
                batch_size = x_test.size(0)
                if is_fully_connected:
                    x_test = x_test.view(batch_size, -1).to(device)
                else:
                    x_test = x_test.to(device)

                start.record()
                model(x_test)
                end.record()
                torch.cuda.synchronize()
                elapsed_time_per_batch = start.elapsed_time(end)  # Zeit in ms
                elapsed_time_per_sample = (
                    elapsed_time_per_batch / batch_size
                )  # Zeit pro Datenpunkt
                runtimes_per_sample.append(elapsed_time_per_sample * 1000) # milliseconds to microseconds

    avg_time_per_sample = sum(runtimes_per_sample) / len(runtimes_per_sample)
    return avg_time_per_sample, runtimes_per_sample

In [9]:
def measure_inference_time(model, test_loader):
    is_fully_connected = isinstance(model, LargeNNv2)
    runtimes = []

    with torch.no_grad():
        for x_test, Y in test_loader:
            if is_fully_connected:
                # Reshape (batch_size, 1, 7, 7) to (batch_size, 49) for LargeNNv2
                x_test = x_test.view(x_test.size(0), 1, -1)

            start_time = time.perf_counter_ns()
            model(x_test)
            total_time = time.perf_counter_ns() - start_time
            runtimes.append(total_time)

    avg_time = sum(runtimes) / len(test_loader.dataset)
    return avg_time

In [10]:
def measure_memory_usage(model, test_loader):
    model.eval()
    model.to(device)
    is_fully_connected = isinstance(model, LargeNNv2)
    
    peak_memory = torch.cuda.max_memory_allocated()  # GPU Peak Memory
    torch.cuda.reset_peak_memory_stats()

    with torch.no_grad():
        for x_test, _ in test_loader:
            if is_fully_connected:
                x_test = x_test.view(x_test.size(0), -1).to(device)
            else:
                x_test = x_test.to(device)
            model(x_test)

    peak_memory = torch.cuda.max_memory_allocated() / (1024**2)  # In MB
    torch.cuda.reset_peak_memory_stats()
    return peak_memory

In [11]:
def evaluate_found_path(answer: torch.Tensor, Y_test: torch.Tensor):
    start = find_gate(answer, 0.6, True)
    end = find_gate(answer, 0.6, False)

    if start is None or end is None:
        return False

    current = (start[0], start[1])
    visited = set()
    visited.add(current)

    max_moves = (
        answer.shape[0] * answer.shape[1]
    )  # Consider the grid size for max moves
    # max_moves = 17

    for i in range(max_moves):
        brightest_neighbour = find_brightest_neighbour(answer, current, visited, Y_test)
        if brightest_neighbour is None:
            return False
        current = (brightest_neighbour[0], brightest_neighbour[1])
        visited.add(current)
        if current == end:
            return True
    return False


def find_gate(answer: torch.Tensor, epsillon: float, start: bool):
    rows, cols = answer.shape  # Get the actual number of rows and columns
    for row in range(rows):
        if start:
            if answer[row][0] > (3 - epsillon):
                return row, 0
        else:
            if answer[row][cols - 1] > (
                3 - epsillon
            ):  # Use the last column dynamically
                return row, cols - 1
    return None


def find_brightest_neighbour(
    answer: torch.Tensor,
    position: tuple[int, int],
    visited: set[tuple[int, int]],
    Y_test: torch.Tensor,
):
    rows, cols = answer.shape  # Get the actual number of rows and columns

    ind_up = (max(position[0] - 1, 0), position[1])
    ind_down = (min(position[0] + 1, rows - 1), position[1])
    ind_left = (position[0], max(position[1] - 1, 0))
    ind_right = (position[0], min(position[1] + 1, cols - 1))

    value_up = answer[ind_up[0]][ind_up[1]] if ind_up not in visited else -1
    value_down = answer[ind_down[0]][ind_down[1]] if ind_down not in visited else -1
    value_left = answer[ind_left[0]][ind_left[1]] if ind_left not in visited else -1
    value_right = answer[ind_right[0]][ind_right[1]] if ind_right not in visited else -1

    # Find the maximum value among the neighbours and return the corresponding index
    max_value = max(value_up, value_down, value_left, value_right)
    if max_value == -1:
        return None

    if max_value == value_up and Y_test[ind_up[0]][ind_up[1]] != 1.0:
        return ind_up
    elif max_value == value_down and Y_test[ind_down[0]][ind_down[1]] != 1.0:
        return ind_down
    elif max_value == value_left and Y_test[ind_left[0]][ind_left[1]] != 1.0:
        return ind_left
    elif max_value == value_right and Y_test[ind_right[0]][ind_right[1]] != 1.0:
        return ind_right

    return None


def evaluate_path_accuracy(model, test_loader: DataLoader):
    preds_ = torch.zeros(len(test_loader.dataset), dtype=bool)

    # Check the model type or input requirements
    is_fully_connected = isinstance(model, LargeNNv2)
    x = next(iter(test_loader))[0]
    rows, cols = (
        x.shape[2],
        x.shape[3],
    )  # Assumes Y_hat has shape (batch_size, channels, rows, cols)

    output = []
    output_truth = []
    # i = 0
    with torch.no_grad():
        for x_test, Y in test_loader:
            if is_fully_connected:
                # Reshape (batch_size, 1, 7, 7) to (batch_size, 1, 49) for LargeNNv2
                x_test = x_test.view(x_test.size(0), 1, -1)

            # Pass the input through the model
            y_hat = model(x_test)
            output.append(y_hat)
            output_truth.append(Y)
            # i += 1
            # print(f"got {i} batches from model")

    output = torch.cat(output, dim=0).cpu()
    output_truth = torch.cat(output_truth, dim=0).cpu()

    for index, (y_hat, y_true) in enumerate(zip(output, output_truth)):
        # Reshape predictions and targets to 2D (7x7) for evaluation if needed
        if is_fully_connected:
            y_hat = y_hat.view(y_hat.size(0), 1, 7, 7)

        # Evaluate the path accuracy
        preds_[index] = evaluate_found_path(
            y_hat.view(rows, cols), y_true.view(rows, cols)
        )
        

    # Calculate accuracy
    total = preds_.shape[0]
    correct = preds_.sum().item()
    correct_percentage = (correct / total) * 100

    return correct_percentage

In [12]:
def get_model_summary(model):
    """
    Liefert Informationen über die Anzahl der Parameter und die Modellgröße.
    """
    param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    model_size = (
        param_count * 4 / (1024**2)
    )  # Float32 ist 4 Bytes groß, MB-Konvertierung
    summary(model)
    return param_count, model_size

In [13]:
def compare_models_extended(test_loader, *models):
    """
    Vergleicht Modelle basierend auf erweiterten Metriken.
    """
    results = []

    for idx, model in enumerate(models):
        print(f"Evaluating Model {model.__class__.__name__} {idx}")
        model_name = f"{model.__class__.__name__} {idx}"
        model.to(device)
        model.eval()

        # Path Accuracy
        accuracy = evaluate_path_accuracy(model, test_loader)
        # Inference Time per Sample
        avg_time_per_sample, _ = cuda_event_measure_per_sample(model, test_loader)
        # Memory Usage
        memory_usage = measure_memory_usage(model, test_loader)
        # Parameter Count & Model Size
        param_count, model_size = get_model_summary(model)

        # Ergebnisse sammeln
        results.append(
            {
                "Model": model_name,
                "Path Accuracy (%)": accuracy,
                "Inference Time (µs)": avg_time_per_sample,
                "Memory Usage (MB)": memory_usage,
                "Parameter Count": param_count,
                "Model Size (MB)": model_size,
            }
        )

    # Ergebnisse als DataFrame
    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df

In [14]:
def generate_comparison_table(metrics_df):
    """
    Generiert eine Tabelle für den Vergleich der Modelle mit zusätzlichen Metriken.
    """
    # Tabelle erstellen
    table = tabulate(metrics_df, headers="keys", tablefmt="grid", showindex=False)
    return table

In [15]:
# print(summary(fcnn.to(device), (64, 1, 1, 49)))

In [16]:
# Beispiel:
metrics_df = compare_models_extended(test_loader, fcae, fcnn)

Evaluating Model Level3FullyConvDropoutNet 0
Evaluating Model LargeNNv2 1
                         Model  Path Accuracy (%)  Inference Time (µs)  \
0  Level3FullyConvDropoutNet 0          97.211890             2.371730   
1                  LargeNNv2 1          97.513909             1.989789   

   Memory Usage (MB)  Parameter Count  Model Size (MB)  
0          47.915039            21340         0.081406  
1          37.049805          1519074         5.794807  


In [None]:
# Generiere und drucke die Tabelle
comparison_table = generate_comparison_table(metrics_df)
print(comparison_table)

In [18]:
# create dataset
# test_data = TensorDataset(X_test_tensor, Y_test_tensor)

# batchsize = 64
# batchsize = test_data.tensors[0].shape[0]
# test_loader = DataLoader(test_data, batch_size=batchsize, shuffle=True, drop_last=True)

In [19]:
# import torch.profiler

In [None]:
device = "cuda"
# device = "cpu"

x_test = X_test_tensor.detach().clone()
x_test = x_test.to(device)

# model = fcnn.eval().to(device)
model = fcae.eval().to(device)
if isinstance(model, LargeNNv2):
    x_test = x_test.view(x_test.size(0), 1, -1)

with torch.no_grad():
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"),
        # record_shapes=True,
        # with_stack=True,
        profile_memory=True,
        with_flops=True,
        # with_modules=True,
    ) as prof:
        model(x_test)
        # with torch.profiler.record_function("model_inference"):

print(
    prof.key_averages(group_by_input_shape=True).table(
        sort_by="cuda_time_total",
        row_limit=-1,  # Erhöhe die Anzahl der angezeigten Zeilen
    )
)