# <font color="#418FDE" size="6.5" uppercase>**Profiling and Tuning**</font>

>Last update: 20260130.
    
By the end of this lecture, you will be able to:
- Profile PyTorch models using torch.profiler to identify time‑consuming operations. 
- Apply basic performance optimizations such as adjusting batch size, using pin_memory, and enabling mixed precision. 
- Evaluate the trade‑offs between speed, memory usage, and numerical stability when tuning models. 


## **1. PyTorch Profiling Tools**

### **1.1. torch profiler essentials**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_01.jpg?v=1769763494" width="250">



>* Profiler logs detailed timing for every operation
>* Helps pinpoint hidden bottlenecks and slow layers

>* Profile a representative loop with warmup and steady-state
>* Inspect timelines to spot idle GPUs and bottlenecks

>* Profiler unifies CPU and GPU activity timelines
>* Helps diagnose bottlenecks and meet performance constraints



In [None]:
#@title Python Code - torch profiler essentials

# This script shows basic PyTorch profiling essentials.
# It compares unoptimized and optimized training steps.
# Focus on simple timing and profiler table output.

# !pip install torch torchvision

# Import required standard libraries.
import os
import random
import time

# Import torch and torchvision modules.
import torch
import torch.nn as nn
import torch.optim as optim

# Import torchvision datasets and transforms.
import torchvision
import torchvision.transforms as transforms

# Import the PyTorch profiler utilities.
from torch.profiler import profile, record_function, ProfilerActivity

# Seed Python's built-in random module so runs are repeatable.
random.seed(0)

# Seed PyTorch's own random number generator as well.
torch.manual_seed(0)

# Prefer the GPU when CUDA is usable, otherwise run on the CPU.
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ready else "cpu")

# Report the library version and the device that was picked.
print("PyTorch version:", torch.__version__, "Device:", device)

# Define a simple convolutional neural network model.
class SmallCNN(nn.Module):
    """Minimal CNN: 3x3 convolution, ReLU, global average pooling, linear head."""

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order as before (conv, relu, pool, fc)
        # so parameter names and initialisation order are unchanged.
        self.conv = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(in_features=8, out_features=10)

    def forward(self, x):
        """Return 10 logits per sample for a single-channel image batch."""
        # Convolve, activate, then collapse each feature map to one value.
        pooled = self.pool(self.relu(self.conv(x)))
        # Flatten everything but the batch dimension before the classifier.
        return self.fc(torch.flatten(pooled, 1))

# The only preprocessing is converting each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# MNIST training split, downloaded into ./data when it is not already there.
train_dataset = torchvision.datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)

# Restrict the demo to the first `subset_size` samples to keep it quick.
subset_size = 256
indices = list(range(subset_size))
train_subset = torch.utils.data.Subset(train_dataset, indices)

# Settings shared by both loaders, so pin_memory is the only difference.
_shared_loader_args = {"batch_size": 64, "shuffle": True, "num_workers": 0}

# Baseline loader with default settings.
loader_basic = torch.utils.data.DataLoader(train_subset, **_shared_loader_args)

# Same loader, but with pinned host memory whenever CUDA is available.
loader_optimized = torch.utils.data.DataLoader(
    train_subset,
    pin_memory=torch.cuda.is_available(),
    **_shared_loader_args,
)

# Helper function to run one training step.
def train_step(model, data_iter, optimizer, use_amp=False, scaler=None):
    """Run one optimisation step on the next batch from ``data_iter``.

    Args:
        model: Module to train; its parameters should live on the global
            ``device`` because the batch is moved there.
        data_iter: Iterator yielding ``(images, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        use_amp: Request mixed precision. Only honoured when ``device`` is
            CUDA; on any other device the step runs in full precision.
        scaler: Optional ``torch.amp.GradScaler`` to reuse across calls so
            its loss-scale state persists. A new one is created when omitted.

    Returns:
        True if a batch was processed, False if ``data_iter`` was exhausted.

    Raises:
        ValueError: If images are not 4-D or labels are not 1-D.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    # float16 autocast and loss scaling only make sense on CUDA; gating here
    # avoids the "CUDA is not available" warnings on CPU-only machines.
    amp_enabled = bool(use_amp) and device.type == "cuda"
    if scaler is None:
        scaler = torch.amp.GradScaler("cuda", enabled=amp_enabled)

    try:
        images, labels = next(data_iter)
    except StopIteration:
        return False
    if images.ndim != 4 or labels.ndim != 1:
        raise ValueError("Unexpected batch shapes in train_step.")

    # non_blocking copies are asynchronous only when the source is pinned.
    images = images.to(device, non_blocking=True)
    labels = labels.to(device, non_blocking=True)

    optimizer.zero_grad(set_to_none=True)
    with torch.autocast(device_type="cuda", enabled=amp_enabled):
        outputs = model(images)
        loss = criterion(outputs, labels)

    # With a disabled scaler these three calls reduce to a plain
    # backward pass followed by optimizer.step().
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return True

# Function to profile a single training step.
def profile_one_step(data_loader, use_amp=False, label="basic"):
    """Profile a single training step of a fresh ``SmallCNN``.

    Two unprofiled warm-up steps run first. Returns the profiler summary
    table as a string, or None when the loader runs out during warm-up.
    """
    net = SmallCNN().to(device)
    sgd = optim.SGD(net.parameters(), lr=0.01)
    batches = iter(data_loader)

    # Warm-up: keep one-off start-up costs out of the profiled step.
    for _ in range(2):
        if not train_step(net, batches, sgd, use_amp):
            return None

    # Always trace the CPU; add CUDA tracing when a GPU is present.
    traced = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        traced += [ProfilerActivity.CUDA]

    # record_function labels the step so it is easy to find in the output.
    with profile(activities=traced, record_shapes=True) as prof, \
            record_function(f"train_step_{label}"):
        train_step(net, batches, sgd, use_amp)

    # Summarise the eight operators with the largest self CPU time.
    return prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=8)

# Baseline run: plain loader, full precision.
# NOTE: the timed region wraps the whole profile_one_step call, so it includes
# model construction, the two warm-up steps and profiler overhead, not just
# the single profiled step.
start_basic = time.time()
prof_basic = profile_one_step(loader_basic, use_amp=False, label="basic")
end_basic = time.time()

# Optimized run: pinned-memory loader with mixed precision requested.
# The same timing caveat applies.
start_opt = time.time()
prof_opt = profile_one_step(loader_optimized, use_amp=True, label="optimized")
end_opt = time.time()

# Wall-clock time of the whole baseline call, rounded to 4 decimals.
print("Basic step wall time seconds:", round(end_basic - start_basic, 4))

# Wall-clock time of the whole optimized call, rounded to 4 decimals.
print("Optimized step wall time seconds:", round(end_opt - start_opt, 4))

# Header for the baseline profiler summary.
print("\nTop operations from basic configuration:")

# Profiler table for the baseline run (None if the loader ran out in warm-up).
print(prof_basic)

# Header for the optimized profiler summary.
print("\nTop operations from optimized configuration:")

# Profiler table for the optimized run (None if the loader ran out in warm-up).
print(prof_opt)




### **1.2. Chrome Trace Viewer**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_02.jpg?v=1769763655" width="250">



>* Visual timeline shows operations across CPU and GPU
>* Helps spot bottlenecks and overlaps in training

>* Tracks show threads, GPU streams, and events
>* Helps spot runtime bottlenecks like slow data

>* Compare traces to see optimization effects clearly
>* Build intuition, locate bottlenecks, validate performance changes



In [None]:
#@title Python Code - Chrome Trace Viewer

# This script shows a tiny profiling example.
# We simulate a trace style workflow for beginners.
# Focus is on Chrome Trace style profiler output.

# Install PyTorch if not already available in Colab.
# !pip install torch torchvision torchaudio --quiet

# Import standard libraries for system checks.
import os
import sys
import time

# Try importing torch and handle missing installation.
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.profiler import profile
    from torch.profiler import ProfilerActivity
except Exception as e:
    print("PyTorch import failed, please install first.")
    raise e

# Fix PyTorch's RNG so the random batch and weights are repeatable.
torch.manual_seed(0)

# Pick CUDA when it is usable; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One-line report of the library version and the chosen device.
print("Torch version:", torch.__version__, "Device:", device)

# Define a tiny convolutional network for demonstration.
class TinyConvNet(nn.Module):
    """Two unpadded 3x3 convolutions followed by a two-layer classifier."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3)
        # Each unpadded 3x3 convolution trims 2 pixels: 28 -> 26 -> 24, so
        # the flattened feature vector has 16 * 24 * 24 entries.
        self.fc1 = nn.Linear(in_features=16 * 24 * 24, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=10)

    def forward(self, x):
        """Return 10 logits per sample for a batch of 1x28x28 images."""
        feats = F.relu(self.conv1(x))
        feats = F.relu(self.conv2(feats))
        # Keep the batch dimension, flatten the rest for the linear layers.
        hidden = F.relu(self.fc1(feats.flatten(start_dim=1)))
        return self.fc2(hidden)

# Synthetic MNIST-shaped batch: `batch_size` single-channel 28x28 images.
batch_size = 8
input_tensor = torch.randn(batch_size, 1, 28, 28)

# Sanity-check the batch layout before it leaves the host.
assert tuple(input_tensor.shape) == (batch_size, 1, 28, 28)

# Place both the network and the batch on the selected device.
model = TinyConvNet().to(device)
input_tensor = input_tensor.to(device)

# One gradient-free forward pass so first-call overhead is not profiled.
with torch.no_grad():
    _ = model(input_tensor)

# Loss and optimizer used by the profiled training step.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Random class labels in [0, 10), created directly on the device.
target = torch.randint(low=0, high=10, size=(batch_size,), device=device)

# There must be exactly one label per input sample.
assert target.shape[0] == input_tensor.shape[0]

# Trace the CPU always, and CUDA as well when running on a GPU.
activities = (
    [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    if device.type == "cuda"
    else [ProfilerActivity.CPU]
)

# Define a small helper function to run one step.
def train_step():
    """Run one SGD update on the fixed synthetic batch.

    Operates on the cell-level ``model``, ``optimizer``, ``criterion``,
    ``input_tensor`` and ``target``; returns nothing.
    """
    model.train()
    optimizer.zero_grad()
    # Forward pass and loss in one expression, then backpropagate and update.
    loss = criterion(model(input_tensor), target)
    loss.backward()
    optimizer.step()

# Record three consecutive training steps under the profiler.
with profile(activities=activities, record_shapes=True) as prof:
    for _ in range(3):
        train_step()

# Output location for the Chrome trace.
trace_dir = "./profiler_traces"
trace_path = os.path.join(trace_dir, "tiny_trace.json")

# Make sure the folder exists, then write the trace as JSON.
os.makedirs(trace_dir, exist_ok=True)
prof.export_chrome_trace(trace_path)

# Tell the reader where the file is and how to inspect it.
print("Trace saved to:", trace_path)
for hint in (
    "To view, open chrome://tracing in Chrome.",
    "Then load tiny_trace.json to explore the timeline.",
    "Look for CPU and CUDA tracks and kernel durations.",
    "Use zoom and pan to inspect slow operations visually.",
):
    print(hint)



### **1.3. Reading Operator Metrics**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_03.jpg?v=1769763977" width="250">



>* Profiler rows show each operation and timings
>* Find operators dominating runtime to target optimization

>* Compare self time and total time meanings
>* Use them to locate true performance bottlenecks

>* Watch CPU, GPU time and data transfers
>* Use patterns to choose effective performance optimizations



In [None]:
#@title Python Code - Reading Operator Metrics

# This script shows basic operator profiling.
# It uses TensorFlow to mimic profiling ideas.
# Focus on reading timing metrics for operations.

# !pip install tensorflow==2.20.0

# Import required standard libraries.
import os
import time
import random

# Import TensorFlow and check version.
import tensorflow as tf

# One seed shared by Python's random module and TensorFlow.
seed_value = 42
random.seed(seed_value)
tf.random.set_seed(seed_value)

# Report the TensorFlow version on a single line.
print("TensorFlow version:", tf.__version__)

# Ask TensorFlow which physical GPUs it can see.
physical_gpus = tf.config.list_physical_devices("GPU")

# Use the first GPU when at least one is visible, else the CPU.
device_name = "/GPU:0" if physical_gpus else "/CPU:0"

# Report the device string that was chosen.
print("Using device:", device_name)

# Define a helper to time a TensorFlow function.
def time_tf_function(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` once and time it.

    Returns:
        A ``(result, elapsed_ms)`` tuple, where ``elapsed_ms`` is the
        wall-clock duration of the call in milliseconds.
    """
    t0 = time.perf_counter()
    result = fn(*args, **kwargs)

    # Call tf.experimental.sync_devices when this TensorFlow build provides
    # it, so the clock is read only after that call returns.
    sync = getattr(tf.experimental, "sync_devices", None)
    if sync is not None:
        sync()

    # Convert seconds to milliseconds.
    return result, (time.perf_counter() - t0) * 1000.0

# Functional-API model: 512 input features feed the dense stack below.
inputs = tf.keras.Input(shape=(512,), name="features")

# Two identical hidden layers (256 units, ReLU) give the model some work.
x = inputs
for _hidden_units in (256, 256):
    x = tf.keras.layers.Dense(_hidden_units, activation="relu")(x)

# Ten-way softmax output layer.
outputs = tf.keras.layers.Dense(10, activation="softmax")(x)

# Tie the input and output tensors into a Keras model.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Random batch of 64 samples matching the model's input width.
batch_size = 64
input_data = tf.random.normal(shape=(batch_size, 512))

# Guard against an accidental shape mismatch.
assert input_data.shape == (batch_size, 512)

# One untimed inference call before any measurement.
_ = model(input_data, training=False)

# Full forward pass through every layer of the model.
@tf.function
def run_full_model(x):
    """Run the whole model on raw features ``x`` in inference mode."""
    return model(x, training=False)

# First dense layer on its own (index 1 in model.layers).
@tf.function
def run_first_layer(x):
    """Apply only the first dense layer to raw features ``x``."""
    return model.layers[1](x)

# First and second dense layers chained. Kept unchanged for callers that pass
# raw features, but NOT used for the per-layer timing below because it would
# charge the first layer's cost to the second layer.
@tf.function
def run_second_layer(x):
    """Apply the first and then the second dense layer to raw features ``x``."""
    first = model.layers[1](x)
    return model.layers[2](first)

# Second dense layer in isolation, fed with first-layer activations.
@tf.function
def run_second_layer_only(hidden):
    """Apply only the second dense layer to precomputed activations."""
    return model.layers[2](hidden)

# Call each tf.function once before timing: the first call traces and builds
# the graph, which would otherwise dominate a single-shot measurement.
first_activations = run_first_layer(input_data)
_ = run_full_model(input_data)
_ = run_second_layer_only(first_activations)

# Time the full model forward pass once.
_, full_time_ms = time_tf_function(run_full_model, input_data)

# Time the first dense layer alone.
_, first_time_ms = time_tf_function(run_first_layer, input_data)

# Time the second dense layer alone, on the precomputed activations.
_, second_time_ms = time_tf_function(run_second_layer_only, first_activations)

# Print a short header for timing results.
print("\nOperator like timing results (milliseconds):")

# Print timing for the full model call.
print("Full model total time:", round(full_time_ms, 3))

# Print timing for first dense layer only.
print("First dense self like time:", round(first_time_ms, 3))

# Print timing for second dense layer only.
print("Second dense self like time:", round(second_time_ms, 3))

# Share of the full-model time spent in each layer (single-shot, so noisy).
first_fraction = first_time_ms / full_time_ms
second_fraction = second_time_ms / full_time_ms

# Print fractions to mimic operator contribution.
print("First layer fraction of total:", round(first_fraction, 3))

# Print second layer fraction of total time.
print("Second layer fraction of total:", round(second_fraction, 3))

# Show which layer appears more time consuming.
print("Heavier layer index:", 1 if first_time_ms > second_time_ms else 2)



## **2. Boosting Data Throughput**

### **2.1. Optimal Batch Sizing**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_01.jpg?v=1769764222" width="250">



>* Balance GPU usage without overloading its memory
>* Tune batch size to maximize speed safely

>* Increase batch size slowly while tracking throughput
>* Stop increasing when gains plateau or memory fails

>* Batch size interacts with optimization and stability
>* Retune batch size whenever settings or hardware change



In [None]:
#@title Python Code - Optimal Batch Sizing

# This script explores optimal batch sizing simply.
# We use TensorFlow to simulate training speed.
# Focus on throughput changes with different batch sizes.

# !pip install tensorflow==2.20.0

# Import required standard libraries.
import os
import time
import random

# Import TensorFlow and NumPy.
import tensorflow as tf
import numpy as np

# One seed for Python, NumPy and TensorFlow.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Report the TensorFlow version.
print("TensorFlow version:", tf.__version__)

# Use the first GPU when TensorFlow can see one, else the CPU.
physical_gpus = tf.config.list_physical_devices("GPU")
device_name = "/GPU:0" if physical_gpus else "/CPU:0"

# Report the device that will be used.
print("Using device:", device_name)

# Size of the synthetic dataset: samples x features.
num_samples, input_dim = 4096, 128

# Gaussian features, drawn first so the NumPy RNG order is unchanged.
features = np.random.randn(num_samples, input_dim).astype(np.float32)

# Random 0/1 labels stored as a float column vector.
labels = np.random.randint(0, 2, size=(num_samples, 1)).astype(np.float32)

# Features and labels must describe the same number of samples.
assert len(features) == len(labels)

# Small binary classifier: one hidden layer and a sigmoid output.
_layer_stack = [
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
]
model = tf.keras.Sequential(_layer_stack)

# Adam with an explicit learning rate, binary cross-entropy, accuracy metric.
_adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=_adam, loss="binary_crossentropy", metrics=["accuracy"])

# Candidate batch sizes, smallest to largest.
batch_sizes = [16, 64, 256, 512]

# Maps batch size -> measured samples per second.
throughput_results = {}

# Place the work on the selected device.
with tf.device(device_name):

    # The same model instance is reused for every batch size, so its weights
    # keep training across iterations; only throughput is compared here.
    for batch_size in batch_sizes:

        # Shuffle, batch and prefetch the in-memory data.
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        dataset = dataset.shuffle(buffer_size=num_samples, seed=seed_value)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        # One untimed batch so start-up work for this batch size is excluded.
        _ = model.fit(
            dataset.take(1),
            epochs=1,
            verbose=0,
        )

        # Time one full epoch. perf_counter is a monotonic, high-resolution
        # clock, so the interval is not affected by system clock updates.
        start_time = time.perf_counter()
        history = model.fit(
            dataset,
            epochs=1,
            verbose=0,
        )
        end_time = time.perf_counter()

        # Convert elapsed time to throughput; the floor avoids division by zero.
        elapsed = end_time - start_time
        samples_per_second = num_samples / max(elapsed, 1e-6)

        # Store throughput for this batch size.
        throughput_results[batch_size] = samples_per_second

# Print concise summary header.
print("\nBatch size vs samples per second:")

# Print results for each tested batch size.
for bs in batch_sizes:
    value = throughput_results[bs]
    print(f"Batch {bs:4d}: {value:8.1f} samples/sec")

# Report the batch size with the highest measured throughput.
best_bs = max(throughput_results, key=throughput_results.get)
print("\nBest throughput batch size in this demo:", best_bs)



### **2.2. Efficient DataLoader Configuration**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_02.jpg?v=1769764492" width="250">



>* Slow data loaders starve GPUs and hurt throughput
>* Treat data loader configuration as key optimization lever

>* More DataLoader workers prepare batches in parallel
>* Tune worker count to balance speed and resources

>* Shuffle and batch data to balance randomness, speed
>* Use smart prefetching to keep accelerators busy



In [None]:
#@title Python Code - Efficient DataLoader Configuration

# This script shows efficient DataLoader configuration.
# We compare slow and fast data input pipelines.
# Focus is on batch size and prefetch settings.

# !pip install tensorflow==2.20.0

# Import required standard libraries.
import os
import random
import numpy as np

# Import tensorflow and check version.
import tensorflow as tf

# One seed for Python, NumPy and TensorFlow.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Ask TensorFlow which physical GPUs it can see (informational only).
physical_gpus = tf.config.list_physical_devices("GPU")

# Report the framework version and whether a GPU is present.
print("TensorFlow version:", tf.__version__)
print("GPU available:", bool(physical_gpus))

# Shape of the synthetic in-memory dataset.
num_samples = 2000
feature_dim = 32

# Gaussian features stored as float32.
features = np.random.randn(num_samples, feature_dim).astype("float32")

# A sample is labelled 1 when its features sum to a positive number.
labels = (features.sum(axis=1) > 0).astype("int32")

# Features and labels must describe the same number of samples.
assert features.shape[0] == labels.shape[0]

# Base pipeline: slice the arrays per sample, then shuffle with a 512 buffer.
base_ds = tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(
    buffer_size=512, seed=seed_value
)

# Define a simple function to build a dataset.
def make_dataset(batch_size, prefetch_size, num_parallel_calls):
    """Build an input pipeline on top of the cell-level ``base_ds``.

    Args:
        batch_size: Number of samples per batch.
        prefetch_size: Buffer size passed to ``Dataset.prefetch``.
        num_parallel_calls: Parallelism passed to ``Dataset.map``.

    Returns:
        The batched, L2-normalised and prefetched dataset.
    """
    def _normalize(batch_x, batch_y):
        # Light preprocessing: scale each feature row to unit L2 norm.
        return tf.math.l2_normalize(batch_x, axis=1), batch_y

    # Batch first, then map over whole batches, then prefetch.
    return (
        base_ds.batch(batch_size)
        .map(_normalize, num_parallel_calls=num_parallel_calls)
        .prefetch(prefetch_size)
    )

# Timer for the throughput comparison (imported here because this cell's
# import section does not bring in `time`).
import time

# Deliberately slow pipeline: small batches, sequential map, minimal prefetch.
slow_batch_size = 16
slow_ds = make_dataset(slow_batch_size, prefetch_size=1,
                       num_parallel_calls=None)

# Efficient pipeline: larger batches, autotuned map parallelism and prefetch.
fast_batch_size = 64
fast_ds = make_dataset(fast_batch_size,
                       prefetch_size=tf.data.AUTOTUNE,
                       num_parallel_calls=tf.data.AUTOTUNE)


def _build_model():
    """Return a freshly initialised, compiled copy of the demo classifier."""
    net = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(feature_dim,)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])
    net.compile(optimizer="adam",
                loss="binary_crossentropy",
                metrics=["accuracy"])
    return net


def _timed_fit(net, dataset):
    """Train ``net`` for two silent epochs; return (history, seconds)."""
    started = time.perf_counter()
    history = net.fit(dataset, epochs=2, verbose=0)
    return history, time.perf_counter() - started


# Train with the slow pipeline, remembering the starting weights.
print("Training with slow DataLoader configuration...")
model = _build_model()
initial_weights = model.get_weights()
history_slow, slow_seconds = _timed_fit(model, slow_ds)

# Train with the fast pipeline from the SAME starting weights and a fresh
# optimizer, so the second run does not inherit the first run's training.
print("Training with efficient DataLoader configuration...")
model = _build_model()
model.set_weights(initial_weights)
history_fast, fast_seconds = _timed_fit(model, fast_ds)

# Final-epoch metrics for each configuration.
slow_loss = history_slow.history["loss"][-1]
slow_acc = history_slow.history["accuracy"][-1]
fast_loss = history_fast.history["loss"][-1]
fast_acc = history_fast.history["accuracy"][-1]

# Print a short comparison summary for learners.
print("Slow config - batch:", slow_batch_size,
      "loss:", round(float(slow_loss), 4),
      "acc:", round(float(slow_acc), 4))

# Print metrics for the efficient configuration.
print("Fast config - batch:", fast_batch_size,
      "loss:", round(float(fast_loss), 4),
      "acc:", round(float(fast_acc), 4))

# Report the measured wall-clock time of each two-epoch run.
print("Slow config seconds:", round(slow_seconds, 3))
print("Fast config seconds:", round(fast_seconds, 3))

# Only claim a speed-up when the measurement actually shows one.
if fast_seconds < slow_seconds:
    print("Both runs learn similarly, but fast config feeds faster.")
else:
    print("Fast config was not faster here; on tiny in-memory data "
          "pipeline overhead can dominate.")




### **2.3. Pinned Memory Prefetching**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_03.jpg?v=1769764641" width="250">



>* Pinned memory cuts CPU‑to‑GPU transfer delays
>* Prefetching keeps GPU busy and speeds training

>* Pinned memory overlaps data transfer with computation
>* This prevents GPU idle time and improves throughput

>* Pinned memory improves speed but risks host pressure
>* Tune prefetch, batch size, workers using monitoring



In [None]:
#@title Python Code - Pinned Memory Prefetching

# This script shows pinned memory prefetching basics.
# We compare DataLoader settings for GPU data throughput.
# Focus on batch size pin_memory and non_blocking transfers.

# !pip install torch torchvision

# Import required standard libraries.
import os
import random
import time

# Import torch and torchvision utilities.
import torch
import torchvision
import torchvision.transforms as T

# Seed Python's random module and PyTorch for repeatable runs.
random.seed(0)
torch.manual_seed(0)

# Run on CUDA when it is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report the library version and the chosen device.
print("torch", torch.__version__, "device", device)

# The only preprocessing is converting each image to a tensor.
transform = T.Compose([T.ToTensor()])

# MNIST training split, downloaded into ./data when it is not already there.
full_dataset = torchvision.datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)

# Keep only the first `subset_size` samples so an epoch stays short.
subset_size = 2048
indices = list(range(subset_size))
small_dataset = torch.utils.data.Subset(full_dataset, indices)

# Define a simple convolutional network.
class SmallCNN(torch.nn.Module):
    """Minimal CNN: 3x3 convolution with ReLU, global average pool, linear head."""

    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1)
        self.pool = torch.nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = torch.nn.Linear(in_features=8, out_features=10)

    def forward(self, x):
        """Return 10 logits per sample for a single-channel image batch."""
        # Convolve, apply ReLU, then collapse each feature map to one value.
        pooled = self.pool(self.conv(x).relu())
        # Flatten to (batch, features) before the classifier.
        return self.fc(pooled.view(pooled.size(0), -1))


# Build the network and place its parameters on the selected device.
# This single instance is shared by every run_epoch call in this cell.
model = SmallCNN().to(device)

# Cross-entropy loss on the logits, optimised with plain SGD (lr=0.1).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Helper function to run one short training epoch.
def run_epoch(dataloader, use_non_blocking, max_batches=20):
    """Train the cell-level ``model`` on up to ``max_batches`` batches.

    Args:
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        use_non_blocking: Passed as ``non_blocking`` to the device copies.
        max_batches: Maximum number of batches to process (default 20).

    Returns:
        ``(seconds, avg_loss)``: elapsed wall-clock time and the mean loss
        over the processed batches (0.0 when no batch was processed).
    """
    model.train()
    # Keep the running loss on the device: calling .item() every step would
    # force a host/device sync per batch and hide any benefit of
    # non-blocking transfers.
    running_loss = torch.zeros((), device=device)
    total_batches = 0
    start = time.perf_counter()
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        if batch_idx >= max_batches:
            break
        assert inputs.ndim == 4 and targets.ndim == 1
        inputs = inputs.to(device, non_blocking=use_non_blocking)
        targets = targets.to(device, non_blocking=use_non_blocking)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.detach()
        total_batches += 1
    # CUDA work is asynchronous; wait for it before reading the clock.
    if device.type == "cuda":
        torch.cuda.synchronize()
    end = time.perf_counter()
    avg_loss = running_loss.item() / max(total_batches, 1)
    return end - start, avg_loss

# Baseline loader: no pinned memory, batches loaded in the main process.
batch_size = 128
loader_baseline = torch.utils.data.DataLoader(
    small_dataset, batch_size=batch_size, shuffle=True,
    num_workers=0, pin_memory=False
)

# Pinned memory only helps copies to a CUDA device; requesting it on a
# CPU-only host just triggers a warning, so enable it only when on CUDA.
# NOTE: this loader also uses two worker processes, so the timing difference
# below reflects both changes, not pinning alone.
use_pinned_memory = device.type == "cuda"
loader_pinned = torch.utils.data.DataLoader(
    small_dataset, batch_size=batch_size, shuffle=True,
    num_workers=2, pin_memory=use_pinned_memory
)

# Pull one batch and, on CUDA, run one forward pass so one-off start-up
# costs are not charged to the first timed epoch.
_ = next(iter(loader_baseline))
if device.type == "cuda":
    dummy = torch.randn(1, 1, 28, 28, device=device)
    _ = model(dummy)

# Baseline epoch with blocking transfers.
baseline_time, baseline_loss = run_epoch(loader_baseline, False)

# Pinned-loader epoch with non_blocking transfers. The same model keeps
# training, so the two losses are not a like-for-like comparison.
pinned_time, pinned_loss = run_epoch(loader_pinned, True)

# Print concise comparison of timings and losses.
print("Baseline loader seconds", round(baseline_time, 3))
print("Pinned loader seconds", round(pinned_time, 3))
print("Baseline loss", round(baseline_loss, 4))
print("Pinned loss", round(pinned_loss, 4))




## **3. Precision Speed Tradeoffs**

### **3.1. Autocast for Mixed Precision**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_01.jpg?v=1769764767" width="250">



>* Autocast automates mixed precision for speed, memory
>* Chooses low or full precision per operation safely

>* Autocast speeds training and lowers memory use
>* Must test for rounding errors and stability risks

>* Autocast often keeps accuracy while boosting speed
>* Monitor metrics and tune usage for sensitive tasks



In [None]:
#@title Python Code - Autocast for Mixed Precision

# This script demonstrates mixed precision autocast tradeoffs.
# It compares speed and loss with and without autocast.
# Designed for small quick runs in Colab.

# !pip install torch torchvision

# Import required standard libraries.
import time
import random
import numpy as np

# Import torch and check availability.
import torch
import torch.nn as nn

# One seed shared by Python, NumPy and PyTorch.
seed_value = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed_value)

# Run on CUDA when it is usable, otherwise on the CPU.
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)

# Report the library version and the chosen device.
print("Torch version:", torch.__version__, "Device:", device)

# Define a tiny feedforward model.
class TinyNet(nn.Module):
    """Two-layer feedforward network with a ReLU between the layers."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Registration order (fc1, act, fc2) is kept so parameter names and
        # initialisation order stay the same.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.act = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map ``x`` through fc1, the activation and fc2."""
        return self.fc2(self.act(self.fc1(x)))

# Synthetic regression problem: 512 samples with 64 features each.
num_samples, input_dim = 512, 64

# Draw inputs first and targets second (keeps the RNG order), then move
# each tensor to the selected device.
X = torch.randn(num_samples, input_dim).to(device)
y = torch.randn(num_samples, 1).to(device)

# Confirm both tensors have the expected layout before training.
assert tuple(X.shape) == (num_samples, input_dim)
assert tuple(y.shape) == (num_samples, 1)

# Helper function to run one training pass.
def run_epoch(model, optimizer, use_autocast, scaler=None):
    """Train ``model`` for one pass over the cell-level ``X`` / ``y`` tensors.

    Args:
        model: Module to train.
        optimizer: Optimizer that updates ``model``'s parameters.
        use_autocast: Request float16 autocast. Only honoured when the
            global ``device`` is CUDA; otherwise full precision is used.
        scaler: Optional ``torch.amp.GradScaler`` to reuse across epochs.
            A new one is created for this epoch when omitted.

    Returns:
        The mean MSE loss over the mini-batches (0.0 if there were none).
    """
    # Set model to training mode.
    model.train()
    criterion = nn.MSELoss()

    # Fixed mini-batch size; a trailing partial batch is dropped.
    batch_size = 64
    num_batches = num_samples // batch_size

    # float16 gradients can underflow to zero, so autocast is paired with a
    # gradient scaler. Both are disabled together outside CUDA.
    amp_enabled = bool(use_autocast) and device.type == "cuda"
    if scaler is None:
        scaler = torch.amp.GradScaler("cuda", enabled=amp_enabled)

    # Track cumulative loss value.
    total_loss = 0.0

    # Loop over mini batches.
    for i in range(num_batches):
        # Slice batch from tensors.
        lo = i * batch_size
        xb = X[lo:lo + batch_size]
        yb = y[lo:lo + batch_size]

        # Zero gradients before backward.
        optimizer.zero_grad(set_to_none=True)

        # Forward pass; a disabled autocast context is a no-op.
        with torch.autocast(device_type="cuda", dtype=torch.float16,
                            enabled=amp_enabled):
            preds = model(xb)
            loss = criterion(preds, yb)

        # Scale the loss before backward, unscale and step, then adapt the
        # scale. With a disabled scaler this is a plain backward + step.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate detached loss value.
        total_loss += loss.detach().item()

    # Return average loss for epoch.
    return total_loss / max(num_batches, 1)

# Function to benchmark one configuration.
def benchmark_run(use_autocast):
    """Time one training epoch of a fresh TinyNet and report its peak memory.

    An untimed warmup epoch runs first, then a second epoch is timed with
    time.perf_counter. On CUDA the peak-memory counter is reset after the
    warmup so the reported figure belongs to the timed epoch of this
    configuration only, rather than to everything that ran earlier in the
    process.

    Args:
        use_autocast: Forwarded to run_epoch to request float16 autocast.

    Returns:
        A tuple (avg_loss, elapsed, mem_mb): the average loss of the timed
        epoch, its duration in seconds, and the peak allocated CUDA memory in
        MiB during that epoch (0.0 when the device is not CUDA).
    """
    on_cuda = device.type == "cuda"

    # Fresh model and optimizer so configurations do not share state.
    model = TinyNet(input_dim, 128, 1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Warmup epoch: excluded from both the timing and the memory figure.
    _ = run_epoch(model, optimizer, use_autocast)

    if on_cuda:
        # Make sure no warmup work is still queued when the clock starts.
        torch.cuda.synchronize(device)
        # max_memory_allocated is cumulative since the last reset; without
        # this call the value would include the warmup and earlier runs.
        torch.cuda.reset_peak_memory_stats(device)

    start_time = time.perf_counter()
    avg_loss = run_epoch(model, optimizer, use_autocast)
    if on_cuda:
        # Wait for all queued GPU work before stopping the clock.
        torch.cuda.synchronize(device)
    elapsed = time.perf_counter() - start_time

    # Convert the peak byte count to MiB; no CUDA statistics exist otherwise.
    if on_cuda:
        mem_mb = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    else:
        mem_mb = 0.0

    return avg_loss, elapsed, mem_mb

# Benchmark plain FP32 first, then the autocast configuration.
loss_fp32, time_fp32, mem_fp32 = benchmark_run(use_autocast=False)
loss_amp, time_amp, mem_amp = benchmark_run(use_autocast=True)

# Header for the side-by-side report.
print("\nMixed precision autocast comparison:")

# One line per configuration: rounded loss and elapsed time.
print("FP32 -> loss:", round(loss_fp32, 4), "time:", round(time_fp32, 4))
print("AMP  -> loss:", round(loss_amp, 4), "time:", round(time_amp, 4))

# FP32 time divided by AMP time; falls back to 1.0 if the AMP time is not positive.
if time_amp > 0:
    speedup = time_fp32 / time_amp
else:
    speedup = 1.0
print("Speedup factor (FP32/AMP):", round(speedup, 3))

# Memory figures are only reported when running on CUDA.
if device.type == "cuda":
    print("FP32 max memory MB:", round(mem_fp32, 2))
    print("AMP  max memory MB:", round(mem_amp, 2))

# Closing one-line summary of the tradeoff.
print("Autocast trades tiny loss changes for speed and memory gains.")



### **3.2. Memory Usage Monitoring**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_02.jpg?v=1769764883" width="250">



>* Always track GPU memory, not just latency
>* Watch peak usage as settings change for stability

>* Faster settings often sharply increase GPU memory use
>* Test memory on real data to avoid crashes

>* Memory-saving tricks affect speed and accuracy
>* Monitor memory to keep models stable and reliable



In [None]:
#@title Python Code - Memory Usage Monitoring

# This script shows basic memory usage monitoring.
# It uses TensorFlow to simulate model memory behavior.
# Focus on speed and memory tradeoffs with batches.

# !pip install tensorflow==2.20.0

# Import required standard libraries.
import os
import random
import numpy as np

# Import TensorFlow and check version.
import tensorflow as tf

# Seed Python, NumPy, and TensorFlow with one shared value.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Report the TensorFlow version in use.
print("TensorFlow version:", tf.__version__)

# GPUs visible to TensorFlow; an empty list means CPU only.
physical_gpus = tf.config.list_physical_devices("GPU")

# Use the first GPU when one is present, otherwise the CPU.
device_name = "/GPU:0" if physical_gpus else "/CPU:0"

# Report the selected device string.
print("Using device:", device_name)

# Define a small utility to get memory info safely.
def get_memory_info():
    """Return TensorFlow's memory statistics for "GPU:0", or None.

    None is returned when no GPU was detected (module-level `physical_gpus`
    is empty) or when the statistics query raises.
    """
    # Nothing to query on a CPU-only machine.
    if not physical_gpus:
        return None
    try:
        return tf.config.experimental.get_memory_info("GPU:0")
    except Exception:
        # Best effort: a failing query is reported as "no data".
        return None

# Build a tiny dense model for demonstration.
def build_model():
    """Return a Sequential dense classifier: 128 inputs, two 256-unit ReLU layers, 10-way softmax."""
    dense = tf.keras.layers.Dense
    # Layers are created in the order they appear in the network.
    stack = [
        tf.keras.layers.Input(shape=(128,)),
        dense(256, activation="relu"),
        dense(256, activation="relu"),
        dense(10, activation="softmax"),
    ]
    return tf.keras.Sequential(stack)

# Create synthetic data with configurable batch size.
def make_data(batch_size):
    """Return a random (features, labels) pair for one batch.

    Features have shape (batch_size, 128) drawn from a normal distribution;
    labels are int32 values drawn uniformly with minval 0 and maxval 10.
    """
    features = tf.random.normal(shape=(batch_size, 128))
    labels = tf.random.uniform(
        shape=(batch_size,), minval=0, maxval=10, dtype=tf.int32
    )
    return features, labels

# Run one training step and measure memory usage.
def run_step(model, optimizer, loss_fn, batch_size, mixed):
    """Run one gradient step on a random batch and sample memory around it.

    Args:
        model: Model called on the batch with training=True.
        optimizer: Optimizer used to apply the computed gradients.
        loss_fn: Callable invoked as loss_fn(labels, model_output).
        batch_size: Number of samples generated for the step.
        mixed: When true the global policy is set to "mixed_float16",
            otherwise to "float32".

    Returns:
        A tuple (before, after, loss_value): the memory snapshots taken
        before and after the step, and the loss as a Python float.
    """
    x, y = make_data(batch_size)

    # Install the global dtype policy requested by the caller.
    # NOTE(review): `model` is already constructed when it reaches this
    # function; confirm the policy switch affects its layers as intended.
    policy_name = "mixed_float16" if mixed else "float32"
    tf.keras.mixed_precision.set_global_policy(
        tf.keras.mixed_precision.Policy(policy_name)
    )

    # Snapshot taken before the step (None when unavailable).
    before = get_memory_info()

    # Forward pass and loss recorded on the tape.
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_fn(y, predictions)

    # Differentiate with respect to the trainable variables and update them.
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Snapshot taken after the step.
    after = get_memory_info()

    return before, after, float(loss.numpy())

# Nicely format memory information in megabytes.
def format_memory(info):
    """Render a memory snapshot as text with values in megabytes.

    Args:
        info: Mapping with "current" and "peak" byte counts, or None.

    Returns:
        A "current=..., peak=..." string with one decimal place, or a fixed
        message when `info` is None.
    """
    if info is None:
        return "Memory info not available on this device."
    bytes_per_mb = 1024 * 1024
    current_mb, peak_mb = (info[key] / bytes_per_mb for key in ("current", "peak"))
    return f"current={current_mb:.1f}MB, peak={peak_mb:.1f}MB"

# Main demonstration comparing two configurations.
def main():
    """Compare one training step under two batch-size/precision configurations.

    Keras layers take their dtype policy at construction time, so a model
    built before the policy is set keeps running in float32 no matter what
    is selected later. Each configuration therefore installs its policy
    first and only then builds its own model and optimizer. The policy that
    was active on entry is restored before returning so the demo does not
    leak global state into later code.

    Prints, per configuration, the batch size, precision flag, loss value and
    the memory snapshots taken before and after the step.
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

    # Define two configurations to compare.
    configs = [
        {"name": "small_batch_fp32", "batch": 32, "mixed": False},
        {"name": "large_batch_mixed", "batch": 128, "mixed": True},
    ]

    # Remember the policy in effect so it can be put back afterwards.
    starting_policy = tf.keras.mixed_precision.global_policy()

    try:
        for cfg in configs:
            # The policy must be in place before any layer is created.
            policy_name = "mixed_float16" if cfg["mixed"] else "float32"
            tf.keras.mixed_precision.set_global_policy(
                tf.keras.mixed_precision.Policy(policy_name)
            )

            # Fresh model and optimizer: the optimizer's state is tied to the
            # variables of the model it updates.
            model = build_model()
            optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

            before, after, loss_value = run_step(
                model, optimizer, loss_fn, cfg["batch"], cfg["mixed"]
            )
            print("\nConfiguration:", cfg["name"])
            print("Batch size:", cfg["batch"], "Mixed precision:", cfg["mixed"])
            print("Loss value:", round(loss_value, 4))
            print("Before step memory:", format_memory(before))
            print("After step memory:", format_memory(after))
    finally:
        # Undo the global policy change even if a step failed.
        tf.keras.mixed_precision.set_global_policy(starting_policy)

    # Provide short guidance on interpreting results.
    print("\nObserve how batch size and precision affect memory.")

# Execute main demonstration function.
main()




### **3.3. Numerical Stability Tradeoffs**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_03.jpg?v=1769765017" width="250">



>* Lower precision boosts speed but reduces accuracy
>* Can cause instability, especially in sensitive tasks

>* Watch for NaNs, exploding gradients, unstable metrics
>* Balance speed gains against reliability and reproducibility

>* Iterate tuning with validation, not speed alone
>* Choose precision per component using evidence-based tradeoffs



In [None]:
#@title Python Code - Numerical Stability Tradeoffs

# This script shows precision and stability tradeoffs.
# We compare float32 and float16 on a tiny example.
# Focus on speed, memory, and numerical behavior.

# Optional install for TensorFlow if missing.
# !pip install tensorflow==2.20.0 --quiet

# Import required standard libraries.
import os
import time
import random

# Import numpy for numeric helpers.
import numpy as np

# Import tensorflow and check version.
import tensorflow as tf

# Seed Python, NumPy, and TensorFlow with one shared value.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Report the TensorFlow version once.
print("TensorFlow version:", tf.__version__)

# Use the first GPU when TensorFlow sees one, otherwise the CPU.
physical_gpus = tf.config.list_physical_devices("GPU")
device_name = "/GPU:0" if physical_gpus else "/CPU:0"

# Create a simple numeric stability test function.
def build_tiny_model(dtype):
    """Return a compiled regression model whose input and layers use `dtype`.

    The network takes 10 features, applies a 32-unit ReLU layer and a single
    linear output unit, and is compiled with SGD (0.01) and MSE loss.
    """
    dense = tf.keras.layers.Dense

    # Functional graph: input -> hidden ReLU layer -> linear output.
    inputs = tf.keras.Input(shape=(10,), dtype=dtype)
    hidden = dense(32, activation="relu", dtype=dtype)(inputs)
    outputs = dense(1, activation="linear", dtype=dtype)(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    sgd = tf.keras.optimizers.SGD(0.01)
    model.compile(optimizer=sgd, loss="mse", run_eagerly=False)
    return model

# Synthetic linear-regression data: targets are x_data @ true_w + 0.5.
num_samples = 512
x_data = np.random.randn(num_samples, 10).astype(np.float32)
true_w = np.linspace(0.1, 1.0, 10).astype(np.float32)
y_data = np.matmul(x_data, true_w) + 0.5

# Sanity-check array shapes before building the pipelines.
assert x_data.shape == (num_samples, 10)
assert y_data.shape == (num_samples,)

# One batched pipeline per precision, each cast from the same arrays.
train_ds_f32 = tf.data.Dataset.from_tensor_slices(
    (x_data.astype(np.float32), y_data.astype(np.float32))
).batch(64)
train_ds_f16 = tf.data.Dataset.from_tensor_slices(
    (x_data.astype(np.float16), y_data.astype(np.float16))
).batch(64)

# One model per precision; float32 is created first.
model_f32 = build_tiny_model("float32")
model_f16 = build_tiny_model("float16")

# Intervals are measured with time.perf_counter, a monotonic high-resolution
# clock; time.time follows the wall clock and can jump if it is adjusted.
# NOTE(review): each fit call presumably includes one-time graph tracing, and
# the float32 run goes first, so treat these timings as rough - confirm with
# a warmup run if precise numbers matter.

# Train float32 model and measure elapsed time.
with tf.device(device_name):
    start_f32 = time.perf_counter()
    history_f32 = model_f32.fit(
        train_ds_f32,
        epochs=5,
        verbose=0,
    )
    end_f32 = time.perf_counter()

# Train float16 model and measure elapsed time.
with tf.device(device_name):
    start_f16 = time.perf_counter()
    history_f16 = model_f16.fit(
        train_ds_f16,
        epochs=5,
        verbose=0,
    )
    end_f16 = time.perf_counter()

# Last-epoch loss of each run, as plain Python floats.
final_loss_f32, final_loss_f16 = (
    float(run.history["loss"][-1]) for run in (history_f32, history_f16)
)

# Flag a NaN final loss for either precision.
has_nan_f32, has_nan_f16 = np.isnan(final_loss_f32), np.isnan(final_loss_f16)

# Side-by-side summary: device, losses, timings, NaN flags, and loss gap.
print("Device used:", device_name)
print("float32 final loss:", round(final_loss_f32, 6))
print("float16 final loss:", round(final_loss_f16, 6))
print("float32 time seconds:", round(end_f32 - start_f32, 4))
print("float16 time seconds:", round(end_f16 - start_f16, 4))
print("float32 loss is NaN:", has_nan_f32)
print("float16 loss is NaN:", has_nan_f16)
print("Loss difference:", round(final_loss_f16 - final_loss_f32, 6))
print("Remember: faster precision may change loss behavior.")



# <font color="#418FDE" size="6.5" uppercase>**Profiling and Tuning**</font>


In this lecture, you learned to:
- Profile PyTorch models using torch.profiler to identify time‑consuming operations. 
- Apply basic performance optimizations such as adjusting batch size, using pin_memory, and enabling mixed precision. 
- Evaluate the trade‑offs between speed, memory usage, and numerical stability when tuning models. 

In the next Module (Module 8), we will go over 'Distributed Training'.