## 1. Setup and Model Loading

Import necessary libraries, load the trained FlowTransformer model and its configuration, and prepare sample input data for profiling.

In [1]:
# Import required libraries for profiling and model loading
import os
import json
import numpy as np
import tensorflow as tf
import time
import pandas as pd
from memory_profiler import memory_usage

# Import custom components (adjust as needed for your repo structure)
from framework.flow_transformer import FlowTransformer
from implementations.transformers.basic_transformers import TransformerEncoderBlock

# List available models
models_dir = "saved_models"
model_files = [f for f in os.listdir(models_dir) if f.endswith(".keras")]
if not model_files:
    raise FileNotFoundError("No saved models found in the 'saved_models' directory.")

print("Available models:")
for i, model_file in enumerate(model_files):
    print(f"  {i}: {model_file}")

# Select the first model for analysis
selected_model_index = 0
selected_model_file = model_files[selected_model_index]
model_name = os.path.splitext(selected_model_file)[0]
model_path = os.path.join(models_dir, selected_model_file)
config_path = os.path.join(models_dir, f"{model_name}_config.json")

# Load model configuration
with open(config_path, 'r') as f:
    config = json.load(f)

print(f"\nSelected model: {model_name}")
print("Model Configuration:")
print(json.dumps(config, indent=2))

# Load the Keras model
print("Loading model...")
model = tf.keras.models.load_model(
    model_path,
    custom_objects={'TransformerEncoderBlock': TransformerEncoderBlock},
    safe_mode=False
)
print("Model loaded successfully.")
model.summary()

# Prepare sample input data for profiling
# Use the same input shape as the model expects (adjust as needed)
input_shape = model.input_shape
batch_size = config['training_config']['batch_size'] if 'training_config' in config else 32
sample_input = [np.random.rand(batch_size, *shape[1:]).astype(np.float32) for shape in input_shape] if isinstance(input_shape, list) else np.random.rand(batch_size, *input_shape[1:]).astype(np.float32)
print(f"Sample input shape: {input_shape}")

2025-07-22 16:42:09.134322: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753202529.146421 1240450 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753202529.149917 1240450 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-22 16:42:09.163746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Available models:
  0: FlowTransformer_BERT_CSE_CIC_IDS_ws8_bs128_20250722_143415.keras

Selected model: FlowTransformer_BERT_CSE_CIC_IDS_ws8_bs128_20250722_143415
Model Configuration:
{
  "model_name": "FlowTransformer_BERT_CSE_CIC_IDS_ws8_bs128_20250722_143415",
  "timestamp": "20250722_143415",
  "model_format": "native_keras",
  "dataset": {
    "name": "CSE_CIC_IDS",
    "path": "/home/joeldan/dvcon_model/FlowTransformer_Pytorch_Imp/datasets.csv",
    "eval_percent": 0.01,
    "eval_method": "LastRows"
  },
  "model_config": {
    "input_encoding": "NoInputEncoder",
    "sequential_model": "BasicTransformer",
    "classification_head": "LastTokenClassificationHead",
    "window_size": 8,
    "mlp_layer_sizes": [
      128
    ],
    "mlp_dropout": 0.1
  },
  "training_config": {
    "batch_size": 128,
    "epochs": 5,
    "steps_per_epoch": 64,
    "early_stopping_patience": 5,
    "final_epoch": 4
  },
  "optimizer": "adam",
  "loss": "binary_crossentropy",
  "metrics": [
    "bi

I0000 00:00:1753202531.217656 1240450 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Model loaded successfully.


Sample input shape: [(None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 1), (None, 8, 32), (None, 8, 32), (None, 8, 32), (None, 8, 32), (None, 8, 32), (None, 8, 5), (None, 8, 32), (None, 8, 32), (None, 8, 32)]


## 2. Define Profiling Utilities

Create helper functions to measure execution time accurately, including warm-up iterations and GPU synchronization if applicable.

In [2]:
# Profiling utilities
import gc

def time_function(func, *args, warmup=5, repeat=10, **kwargs):
    """
    Times the execution of a function, with warm-up iterations and GPU synchronization.
    Returns average time in seconds.
    """
    # Warm-up
    for _ in range(warmup):
        func(*args, **kwargs)
    gc.collect()
    if tf.config.list_physical_devices('GPU'):
        tf.keras.backend.clear_session()
        tf.config.experimental.reset_memory_stats('GPU:0')
    times = []
    for _ in range(repeat):
        start = time.time()
        func(*args, **kwargs)
        # GPU synchronization is handled automatically in TensorFlow 2.x
        end = time.time()
        times.append(end - start)
    avg_time = sum(times) / len(times)
    print(f"Average time over {repeat} runs: {avg_time:.6f} seconds")
    return avg_time

# Example usage:
# avg = time_function(model.predict, sample_input)

## 3. Profile End-to-End MHA Latency

Isolate a single TransformerEncoderBlock from the loaded model and measure the average forward pass time for the Multi-Head Attention block using sample input data.

In [3]:
# Profile MHA and all layers using TensorFlow Profiler during a real inference pass
import tensorflow as tf
import datetime

logdir = "./tf_profiler_logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tf.profiler.experimental.start(logdir)

print(f"Profiling model inference. TensorBoard logs will be saved to: {logdir}")

# Use a real batch for profiling (from previous cell, or fallback to sample_input)
profile_input = sample_input
try:
    if 'eval_X' in locals():
        profile_input = eval_X
except Exception:
    pass

model.predict(profile_input)
tf.profiler.experimental.stop()
print(f"Profiling complete. To view per-layer timings (including MHA), run: tensorboard --logdir {logdir}")

Profiling model inference. TensorBoard logs will be saved to: ./tf_profiler_logs/20250722-164212


2025-07-22 16:42:12.398681: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:103] Profiler session initializing.
2025-07-22 16:42:12.398739: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:118] Profiler session started.
2025-07-22 16:42:12.398821: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1006] Profiler found 1 GPUs
I0000 00:00:1753202532.770027 1240539 service.cc:148] XLA service 0x2d54ba20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753202532.770080 1240539 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2025-07-22 16:42:12.787308: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1753202532.834329 1240539 cuda_dnn.cc:529] Loaded cuDNN version 90101
I0000 00:00:1753202532.770027 1240539 service.cc:148] XLA service 0x

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step
Profiling complete. To view per-layer timings (including MHA), run: tensorboard --logdir ./tf_profiler_logs/20250722-164212
Profiling complete. To view per-layer timings (including MHA), run: tensorboard --logdir ./tf_profiler_logs/20250722-164212


I0000 00:00:1753202536.323617 1240539 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-07-22 16:42:16.374208: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:68] Profiler session collecting data.
2025-07-22 16:42:16.378249: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1213] CUPTI activity buffer flushed
2025-07-22 16:42:16.393290: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:635]  GpuTracer has collected 1970 callback api events and 1460 activity events. 
2025-07-22 16:42:16.393359: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:638]  GpuTracer max callback_events: 2097152, max activity events: 2097152
2025-07-22 16:42:16.400000: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:130] Profiler session tear down.
2025-07-22 16:42:16.402211: I external/local_xla/xla/tsl/profiler/rpc/client/save_profile.cc:147] Collecting XSpace to repository

## 4. Profile Total Inference Time and Throughput

Run inference on the full model using a batch of sample data. Measure the average time per batch to find the total inference time. Calculate throughput as (batch_size / inference_time).

In [4]:
# Profile total inference time for the full model
inference_time = time_function(model.predict, sample_input)
print(f"Average total inference time per batch: {inference_time:.6f} seconds")

# Calculate throughput (samples per second)
throughput = batch_size / inference_time
print(f"Throughput: {throughput:.2f} samples/second")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [5]:
# --- Extract a real batch from the dataset and profile the MHA block ---
# Assumes you have loaded or can load the dataset as in your analysis notebook
try:
    # Load dataset using FlowTransformer (reuse logic from your analysis notebook)
    from framework.dataset_specification import NamedDatasetSpecifications
    from framework.enumerations import EvaluationDatasetSampling
    from framework.flow_transformer_parameters import FlowTransformerParameters
    from implementations.classification_heads import *
    from implementations.input_encodings import *
    from implementations.pre_processings import StandardPreProcessing
    from implementations.transformers.basic_transformers import *
    from implementations.transformers.named_transformers import *

    # Map component names from config to actual classes
    all_components = {
        "pre_processing": {
            "StandardPreProcessing": StandardPreProcessing(n_categorical_levels=32)
        },
        "input_encoding": {
            "NoInputEncoder": NoInputEncoder(),
            "RecordLevelEmbed": RecordLevelEmbed(64),
            "CategoricalFeatureEmbed": CategoricalFeatureEmbed(EmbedLayerType.Dense, 16),
        },
        "sequential_model": {
            "BasicTransformer": BasicTransformer(2, 128, n_heads=2),
            "GPTSmallTransformer": GPTSmallTransformer(),
            "BERTSmallTransformer": BERTSmallTransformer()
        },
        "classification_head": {
            "LastTokenClassificationHead": LastTokenClassificationHead(),
            "FlattenClassificationHead": FlattenClassificationHead(),
            "GlobalAveragePoolingClassificationHead": GlobalAveragePoolingClassificationHead(),
            "CLSTokenClassificationHead": CLSTokenClassificationHead(),
            "FeaturewiseEmbedding": FeaturewiseEmbedding(project=False),
        }
    }
    model_config = config['model_config']
    ft = FlowTransformer(
        pre_processing=all_components["pre_processing"]["StandardPreProcessing"],
        input_encoding=all_components["input_encoding"][model_config['input_encoding']],
        sequential_model=all_components["sequential_model"][model_config['sequential_model']],
        classification_head=all_components["classification_head"][model_config['classification_head']],
        params=FlowTransformerParameters(
            window_size=model_config['window_size'],
            mlp_layer_sizes=model_config['mlp_layer_sizes'],
            mlp_dropout=model_config['mlp_dropout']
        )
    )
    dataset_config = config['dataset']
    dataset_spec_map = {
        "unified_flow_format": NamedDatasetSpecifications.unified_flow_format,
        "nsl_kdd": NamedDatasetSpecifications.nsl_kdd,
        "CSE_CIC_IDS": NamedDatasetSpecifications.unified_flow_format
    }
    ft.load_dataset(
        dataset_config['name'],
        "datasets.csv",
        dataset_spec_map[dataset_config['name']],
        evaluation_dataset_sampling=EvaluationDatasetSampling[dataset_config['eval_method']],
        evaluation_percent=dataset_config['eval_percent']
    )
    print("Dataset loaded.")

    # Prepare a real batch for profiling
    from framework.enumerations import CategoricalFormat
    selectable_mask = np.zeros(len(ft.X), dtype=bool)
    selectable_mask[ft.parameters.window_size:-ft.parameters.window_size] = True
    train_mask = ft.training_mask
    indices_test = np.argwhere(~train_mask & selectable_mask).reshape(-1)
    def get_windows_for_indices(indices):
        X_windows = []
        for i1 in indices:
            X_windows.append(ft.X.iloc[(i1 - ft.parameters.window_size) + 1:i1 + 1])
        return X_windows
    feature_columns_map = {}
    def samplewise_to_featurewise(X_windows):
        sequence_length = len(X_windows[0])
        combined_df = pd.concat(X_windows)
        featurewise_X = []
        if len(feature_columns_map) == 0:
            for feature in ft.model_input_spec.feature_names:
                if feature in ft.model_input_spec.numeric_feature_names or ft.model_input_spec.categorical_format == CategoricalFormat.Integers:
                    feature_columns_map[feature] = feature
                else:
                    feature_columns_map[feature] = [c for c in X_windows[0].columns if str(c).startswith(feature)]
        for feature in ft.model_input_spec.feature_names:
            feature_columns = feature_columns_map[feature]
            combined_values = combined_df[feature_columns].values
            reshaped_values = np.array([combined_values[i:i+sequence_length] for i in range(0, len(combined_values), sequence_length)])
            featurewise_X.append(reshaped_values)
        return featurewise_X
    eval_X_windows = get_windows_for_indices(indices_test[:batch_size])
    eval_X = samplewise_to_featurewise(eval_X_windows)
    print(f"Prepared real batch for profiling: {len(eval_X)} features, batch size {batch_size}")

    # Pass through the model up to the MHA block
    # Find the index of the MHA block
    mha_index = None
    for idx, layer in enumerate(model.layers):
        if isinstance(layer, TransformerEncoderBlock):
            mha_index = idx
            break
    if mha_index is None:
        raise ValueError("No TransformerEncoderBlock found in the model.")
    # Create a sub-model up to the MHA block
    from tensorflow.keras import Model
    sub_model = Model(inputs=model.inputs, outputs=model.layers[mha_index].output)
    mha_input = sub_model.predict(eval_X)
    print(f"Input shape for MHA block: {mha_input.shape}")

    # Profile the MHA block with correct input
    mha_latency = time_function(mha_block, mha_input)
    print(f"Average latency per matmul (MHA block, real data): {mha_latency:.6f} seconds")
except Exception as e:
    print(f"Error during MHA profiling with real data: {e}")

Using cache file path: cache/CSE_CIC_IDS_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_VHNk9ujbqtTXGSrgVayeqG486IQ0.feather
Reading directly from cache cache/CSE_CIC_IDS_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_VHNk9ujbqtTXGSrgVayeqG486IQ0.feather...
Dataset loaded.
Dataset loaded.
Prepared real batch for profiling: 37 features, batch size 128
Error during MHA profiling with real data: No module named 'tensorflow.keras'
Prepared real batch for profiling: 37 features, batch size 128
Error during MHA profiling with real data: No module named 'tensorflow.keras'


In [None]:
# --- Improved TensorFlow Profiler usage: warm-up and multiple steps ---
import tensorflow as tf
import datetime

logdir = "./tf_profiler_logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
print(f"Profiling model inference. TensorBoard logs will be saved to: {logdir}")

# Use a real batch for profiling (from previous cell, or fallback to sample_input)
profile_input = sample_input
try:
    if 'eval_X' in locals():
        profile_input = eval_X
except Exception:
    pass

# Debug: print input type and shape
print("profile_input type:", type(profile_input))
if isinstance(profile_input, list):
    for i, arr in enumerate(profile_input):
        print(f"Input {i} shape: {arr.shape}, dtype: {arr.dtype}")
else:
    print("Input shape:", profile_input.shape, "dtype:", profile_input.dtype)

# Warm-up (not profiled)
for _ in range(5):
    _ = model(profile_input)

tf.profiler.experimental.start(logdir)
for _ in range(10):
    _ = model(profile_input)
tf.profiler.experimental.stop()
print(f"Profiling complete. To view per-layer timings, run: tensorboard --logdir {logdir}")

Profiling model inference. TensorBoard logs will be saved to: ./tf_profiler_logs/20250722-164219
profile_input type: <class 'list'>
Input 0 shape: (128, 8), dtype: float32
Input 1 shape: (128, 8), dtype: float32
Input 2 shape: (128, 8), dtype: float32
Input 3 shape: (128, 8), dtype: float32
Input 4 shape: (128, 8), dtype: float32
Input 5 shape: (128, 8), dtype: float32
Input 6 shape: (128, 8), dtype: float32
Input 7 shape: (128, 8), dtype: float32
Input 8 shape: (128, 8), dtype: float32
Input 9 shape: (128, 8), dtype: float32
Input 10 shape: (128, 8), dtype: float32
Input 11 shape: (128, 8), dtype: float32
Input 12 shape: (128, 8), dtype: float32
Input 13 shape: (128, 8), dtype: float32
Input 14 shape: (128, 8), dtype: float32
Input 15 shape: (128, 8), dtype: float32
Input 16 shape: (128, 8), dtype: float32
Input 17 shape: (128, 8), dtype: float32
Input 18 shape: (128, 8), dtype: float32
Input 19 shape: (128, 8), dtype: float32
Input 20 shape: (128, 8), dtype: float32
Input 21 shape: (

2025-07-22 16:42:19.603539: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:103] Profiler session initializing.
2025-07-22 16:42:19.603606: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:118] Profiler session started.


Profiling complete. To view per-layer timings, run: tensorboard --logdir ./tf_profiler_logs/20250722-164219


2025-07-22 16:42:20.242797: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:68] Profiler session collecting data.
2025-07-22 16:42:20.246383: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1213] CUPTI activity buffer flushed
2025-07-22 16:42:20.255341: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:635]  GpuTracer has collected 148 callback api events and 170 activity events. 
2025-07-22 16:42:20.255403: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:638]  GpuTracer max callback_events: 2097152, max activity events: 2097152
2025-07-22 16:42:20.257606: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:130] Profiler session tear down.
2025-07-22 16:42:20.259070: I external/local_xla/xla/tsl/profiler/rpc/client/save_profile.cc:147] Collecting XSpace to repository: ./tf_profiler_logs/20250722-164219/plugins/profile/2025_07_22_16_42_20/joels-loq.xplane.pb
