# <font color="#418FDE" size="6.5" uppercase>**Profiling and Tuning**</font>

>Last update: 20260130.
    
By the end of this lecture, you will be able to:
- Profile PyTorch models using torch.profiler to identify time‑consuming operations. 
- Apply basic performance optimizations such as adjusting batch size, using pin_memory, and enabling mixed precision. 
- Evaluate the trade‑offs between speed, memory usage, and numerical stability when tuning models. 


## **1. PyTorch Profiling Tools**

### **1.1. torch profiler essentials**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_01.jpg?v=1769830953" width="250">



>* PyTorch profiler records detailed model runtime behavior
>* Turns vague slowness into measurable, comparable data

>* Profiler wraps training steps to record events
>* Generates a timeline revealing bottlenecks and utilization

>* Profile only a few typical training iterations
>* Reuse focused profiles to compare optimization changes



In [None]:
#@title Python Code - torch profiler essentials

# This script shows basic torch profiler usage.
# It profiles a tiny model training step.
# Focus on essentials without overwhelming output.

# !pip install torch torchvision

# Import required standard libraries.
import os
import random
import time

# Import torch and related utilities.
import torch
import torch.nn as nn
import torch.optim as optim

# Seed Python's built-in RNG so any `random`-based choices repeat across runs.
seed_value = 42
random.seed(seed_value)

# Seed torch with the same value; tensor creation and layer initialisation
# later in this cell draw from this generator.
torch.manual_seed(seed_value)

# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Record the environment so profiler numbers can be interpreted later.
print("Torch version and device:", torch.__version__, device)

# Two-layer perceptron used as the workload being profiled.
class TinyNet(nn.Module):
    """Minimal feedforward network: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the two linear layers.

        Args:
            input_dim: Number of features per input row.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of output logits.
        """
        super().__init__()

        # fc1 is built before fc2 so parameter initialisation consumes the
        # torch RNG in the same order on every run.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the logits produced for the batch ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# Layer sizes shared by the synthetic data and the model below.
input_dim, hidden_dim, output_dim = 32, 64, 10

# Number of rows in the single synthetic batch.
batch_size = 64

# Random float features of shape (batch_size, input_dim).
inputs = torch.randn(batch_size, input_dim)

# Random integer class labels drawn from [0, output_dim).
labels = torch.randint(0, output_dim, (batch_size,))

# Sanity-check the feature shape before moving to the device
# (only `inputs` is checked here; `labels` is not).
assert inputs.shape == (batch_size, input_dim)

# Move both tensors to the selected device once, outside any timed region.
inputs = inputs.to(device)
labels = labels.to(device)

# Build the model and place its parameters on the same device.
model = TinyNet(input_dim, hidden_dim, output_dim).to(device)

# Cross-entropy loss on raw logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Warm-up: run one full forward/backward/update before profiling so that
# one-time start-up work does not show up in the profiled steps.
model.train()

# Warm-up forward pass.
outputs = model(inputs)

# Warm-up loss.
loss = criterion(outputs, labels)

# Warm-up backward pass.
loss.backward()

# Apply the warm-up gradients.
optimizer.step()

# Clear gradients so the first profiled step starts clean.
optimizer.zero_grad()

# Import torch profiler utilities.
from torch.profiler import profile, record_function, ProfilerActivity

# One optimisation step, factored out so the profiled loop stays short.
def train_step(data_inputs, data_labels):
    """Run forward, backward and one optimizer update on a single batch.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        data_inputs: Batch of input features passed to ``model``.
        data_labels: Target labels passed to ``criterion``.

    Returns:
        The batch loss as a Python scalar (via ``.item()``).
    """
    model.train()

    # Forward pass and loss in one expression.
    batch_loss = criterion(model(data_inputs), data_labels)

    # Backward pass, parameter update, then clear gradients for the next call.
    batch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return batch_loss.item()

# Always record CPU activity; CUDA activity is added only when running on GPU.
activities = [ProfilerActivity.CPU]

if device.type == "cuda":
    activities.append(ProfilerActivity.CUDA)

# Profiler schedule: 1 wait step, 1 warmup step, 2 active steps, one cycle.
prof_schedule = torch.profiler.schedule(
    wait=1,
    warmup=1,
    active=2,
    repeat=1,
)

# Build the profiler. The trace handler writes TensorBoard-readable traces
# into ./profiler_logs; shapes and memory are recorded, Python stacks are not.
profiler = profile(
    activities=activities,
    schedule=prof_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(
        "./profiler_logs"
    ),
    record_shapes=True,
    profile_memory=True,
    with_stack=False,
)

# Four iterations cover exactly one schedule cycle (wait + warmup + active).
num_steps = 4

with profiler as prof:
    for step in range(num_steps):
        # Label the whole step so it appears as one named region in the trace.
        with record_function("train_step_region"):
            loss_value = train_step(inputs, labels)

        # Tell the profiler a step finished so the schedule can advance.
        prof.step()

print("Profiling finished, showing top operations by time.")

# Keep the table short: the five operators with the largest self CPU time.
# (The previous unused `key` variable had no effect on this table and was
# removed.)
summary = profiler.key_averages().table(
    sort_by="self_cpu_time_total",
    row_limit=5,
)

print(summary)

# Loss of the last profiled step, for reference.
print("Final training step loss value:", float(loss_value))




### **1.2. Chrome Trace Viewer**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_02.jpg?v=1769831014" width="250">



>* Shows timeline of threads and GPU streams
>* Helps visually spot bottlenecks and idle gaps

>* Shows how CPU, GPU, data transfers interact
>* Helps distinguish slow layers from data bottlenecks

>* Find subtle issues like launch overhead, synchronization
>* Iteratively profile traces to refine performance intuition



In [None]:
#@title Python Code - Chrome Trace Viewer

# This script demonstrates Chrome Trace Viewer usage simply.
# It creates a tiny TensorFlow model and profile trace.
# Then it explains how to open the trace visually.

# !pip install tensorflow.

# Import required standard and TensorFlow modules.
import os
import json
import random

# Import numpy and tensorflow with clear aliases.
import numpy as np
import tensorflow as tf

# Seed Python, NumPy and TensorFlow with the same value for repeatable runs.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Record the TensorFlow version used to produce the trace.
print("TensorFlow version:", tf.__version__)

# Report whether TensorFlow lists at least one physical GPU.
physical_gpus = tf.config.list_physical_devices("GPU")
print("GPU available:", bool(physical_gpus))

# Sizes of the synthetic dataset.
num_samples = 256
input_dim = 32
num_classes = 10

# Random float32 features and int32 labels in [0, num_classes).
features = np.random.randn(num_samples, input_dim).astype(np.float32)
labels = np.random.randint(num_classes, size=(num_samples,)).astype(np.int32)

# Sanity-check the array shapes before building the model.
assert features.shape == (num_samples, input_dim)
assert labels.shape == (num_samples,)

# Small dense classifier: 64 ReLU units followed by a softmax output layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])

# Compile so the model carries an optimizer and a loss for the custom step.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Batch the arrays in groups of 32 and let tf.data choose the prefetch depth.
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# Define a short training step function for profiling.
@tf.function
def train_step(batch_features, batch_labels):
    """Run one gradient update of the module-level ``model`` on a batch.

    Args:
        batch_features: Input features for the batch.
        batch_labels: Integer class labels for the batch.

    Returns:
        The loss tensor computed for this batch.
    """
    with tf.GradientTape() as tape:
        predictions = model(batch_features, training=True)
        # `compute_loss` applies the loss configured in `model.compile`; it
        # replaces the deprecated `model.compiled_loss` call.
        loss_value = model.compute_loss(y=batch_labels, y_pred=predictions)

    gradients = tape.gradient(loss_value, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss_value


# Prepare a directory for TensorFlow profiler trace files.
logdir = "tf_profile_logs"
os.makedirs(logdir, exist_ok=True)

# Choose a small number of steps to keep runtime short.
max_steps = 5
step_counter = 0

# Tracer levels passed to the TensorFlow profiler session.
options = tf.profiler.experimental.ProfilerOptions(
    host_tracer_level=2,
    python_tracer_level=1,
    device_tracer_level=1,
)

# Start the profiler, then guarantee it is stopped even if a training step
# raises; otherwise the session stays active and a later start() would fail.
tf.profiler.experimental.start(logdir, options=options)
try:
    for batch_features, batch_labels in dataset:
        loss_value = train_step(batch_features, batch_labels)
        step_counter += 1

        if step_counter >= max_steps:
            break
finally:
    # Stopping the profiler ends the session started above.
    tf.profiler.experimental.stop()

# Print a short summary explaining where traces were saved.
print("Profiler trace directory:", os.path.abspath(logdir))

# Explain how to open the trace using TensorBoard profile plugin.
print("To view Chrome trace, run: tensorboard --logdir", logdir)

# Provide final hint about Chrome Trace Viewer timeline interpretation.
print("Then open the profile tab and inspect CPU and GPU timelines.")



### **1.3. Reading Operator Metrics**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_01_03.jpg?v=1769831064" width="250">



>* Operator view shows each operation and timings
>* Find operators dominating runtime to target optimization

>* Compare self time and total time meanings
>* Sort by each to reveal true bottleneck operators

>* Use counts and device breakdowns to spot bottlenecks
>* Differentiate compute versus input limits and optimize



In [None]:
#@title Python Code - Reading Operator Metrics

# This script demonstrates reading simple operator metrics.
# We simulate profiling style metrics using plain Python structures.
# Focus on understanding which operations dominate total runtime.

# No extra installations are required for this simple demonstration.
# Uncomment and adapt pip commands here if additional packages needed.
# This script is designed for quick execution in Google Colab.

# Define a small list of fake operator metric dictionaries.
operator_metrics = [
    {"name": "conv2d", "calls": 120, "self_ms": 2.5, "total_ms": 320.0},
    {"name": "relu", "calls": 120, "self_ms": 0.3, "total_ms": 36.0},
    {"name": "matmul", "calls": 40, "self_ms": 4.0, "total_ms": 160.0},
    {"name": "host_to_device", "calls": 80, "self_ms": 1.0, "total_ms": 90.0},
]

# Sum of total_ms across operators; the denominator for the percent column.
total_runtime_ms = sum(op["total_ms"] for op in operator_metrics)

# Operators ordered from largest to smallest total time.
sorted_by_total = sorted(
    operator_metrics,
    key=lambda op: op["total_ms"],
    reverse=True,
)

# Size the name column to the longest operator name so long names such as
# "host_to_device" no longer push their row out of alignment.
name_width = max(
    (len(op["name"]) for op in operator_metrics),
    default=len("Name"),
)
name_width = max(name_width, len("Name"))

# Header uses the same column widths as the rows below.
print(
    f"{'Name':{name_width}s} {'Calls':>5s} {'Self_ms':>7s} "
    f"{'Total_ms':>8s} {'Total_percent':>13s}"
)

# One row per operator, with its share of the summed total time.
for op in sorted_by_total:
    # Guard the division so an all-zero metrics list cannot raise.
    percent = (
        (op["total_ms"] / total_runtime_ms) * 100.0 if total_runtime_ms else 0.0
    )
    print(
        f"{op['name']:{name_width}s} {op['calls']:5d} {op['self_ms']:7.2f} "
        f"{op['total_ms']:8.2f} {percent:13.1f}"
    )

# Operators whose self_ms / total_ms ratio exceeds 0.7.
# NOTE(review): in the sample data self_ms looks like a per-call figure while
# total_ms looks aggregated over all calls, so no operator passes this filter.
# Confirm the intended units before changing the ratio.
heavy_kernels = [
    op for op in operator_metrics
    if op["total_ms"] > 0 and (op["self_ms"] / op["total_ms"]) > 0.7
]

# Report the matches, or say explicitly that there are none instead of
# leaving an empty section under the heading.
print("\nOperators where self time dominates total time:")
if heavy_kernels:
    for op in heavy_kernels:
        print(f"Kernel {op['name']} is doing most of its own work.")
else:
    print("No operator exceeds the 0.7 self/total ratio.")




## **2. Boosting Data Throughput**

### **2.1. Optimal Batch Sizing**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_01.jpg?v=1769831100" width="250">



>* Batch size must keep the GPU busy
>* Too big risks memory limits and slowdowns

>* Batch size sweet spot depends on setup
>* Increase batch gradually, watch speed and memory

>* Batch size affects speed, memory, and convergence
>* Use mixed precision and stay below the maximum batch size for stability



In [None]:
#@title Python Code - Optimal Batch Sizing

# This script explores optimal batch sizing.
# We compare throughput for different batch sizes.
# Use this to understand speed memory tradeoffs.
# !pip install torch torchvision.

# Import required standard and torch modules.
import torch
import torch.nn as nn
import torch.utils.data as data

# Seed torch so the random dataset and the model initialisation repeat.
torch.manual_seed(0)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single-convolution classifier used as the timed workload.
class TinyCNN(nn.Module):
    """Conv2d(1 -> 8, 3x3) -> ReLU -> Flatten -> Linear(5408 -> 10)."""

    def __init__(self):
        """Register the layers in a fixed order."""
        super().__init__()
        self.conv = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        # 5408 = 8 channels * 26 * 26: the flattened conv output for the
        # 1x28x28 images this script generates.
        self.fc = nn.Linear(in_features=5408, out_features=10)

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        activated = self.relu(self.conv(x))
        return self.fc(self.flatten(activated))


# In-memory dataset of random images, used only to drive the timing loop.
class RandomImages(data.Dataset):
    """Fixed-size dataset of random 1x28x28 images with labels in [0, 10)."""

    def __init__(self, length):
        """Generate ``length`` images and labels up front.

        Images are drawn before labels so the RNG is consumed in a fixed order.
        """
        self.length = length
        self.images = torch.randn((length, 1, 28, 28))
        self.labels = torch.randint(low=0, high=10, size=(length,))

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at ``idx``."""
        image = self.images[idx]
        label = self.labels[idx]
        return image, label


# 512 samples: large enough for several batches at every tested batch size.
dataset = RandomImages(length=512)

# Check the shape and label dtype of the first sample.
sample_x, sample_y = dataset[0]
assert sample_x.shape == (1, 28, 28)
assert sample_y.dtype == torch.int64


# One model instance is shared by every throughput measurement below.
model = TinyCNN().to(device)

# Cross-entropy loss on logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


# Function to measure throughput for one batch size.
def measure_throughput(batch_size):
    """Return training throughput in samples per second for ``batch_size``.

    Runs one warm-up batch, then times one full pass over the module-level
    ``dataset`` using the module-level ``model``, ``criterion``, ``optimizer``
    and ``device``.

    Args:
        batch_size: Number of samples per DataLoader batch.

    Returns:
        Samples processed per second during the timed pass.
    """
    # Local import: this cell does not import `time` at the top level.
    import time

    on_cuda = device.type == "cuda"

    # Pinned host memory is only requested when the target device is CUDA,
    # the case where the copies below can be non-blocking.
    loader = data.DataLoader(dataset, batch_size=batch_size,
                             shuffle=False, pin_memory=on_cuda)

    def _run_batch(images, labels):
        """Run one full training step and return the batch's sample count."""
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        loss = criterion(model(images), labels)
        loss.backward()
        # Apply the update so a complete training step is measured, not just
        # forward and backward with the gradients thrown away.
        optimizer.step()
        optimizer.zero_grad()
        return images.size(0)

    # Warm up with a single batch to stabilize timings.
    model.train()
    for images, labels in loader:
        _run_batch(images, labels)
        break

    # Start the clock BEFORE the timed loop. The previous CPU fallback read
    # the clock only after the loop, so elapsed time was always about zero.
    if on_cuda:
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        start_time = time.perf_counter()

    # Timed pass over the whole loader.
    total_samples = 0
    for images, labels in loader:
        total_samples += _run_batch(images, labels)

    if on_cuda:
        end.record()
        torch.cuda.synchronize()
        # elapsed_time reports milliseconds.
        elapsed_seconds = start.elapsed_time(end) / 1000.0
    else:
        elapsed_seconds = time.perf_counter() - start_time

    # Avoid division by zero in throughput.
    elapsed_seconds = max(elapsed_seconds, 1e-6)

    return total_samples / elapsed_seconds


# Candidate batch sizes, listed in ascending order.
batch_sizes = [8, 32, 64, 128]

# Record the environment the numbers were measured on.
print("PyTorch version:", torch.__version__, "Device:", device)

# Measure each candidate and collect (batch_size, throughput) pairs.
results = []
for bs in batch_sizes:
    throughput = measure_throughput(bs)
    results.append((bs, throughput))

# Keep the table ordered by batch size (a no-op while `batch_sizes` is
# already ascending, but robust if the list above is edited).
results.sort(key=lambda x: x[0])

# Print one line per batch size.
print("Batch size and approximate samples per second:")
for bs, thr in results:
    print("Batch:", bs, "Throughput:", round(thr, 2))

# Report the batch size with the highest measured throughput.
best_bs, best_thr = max(results, key=lambda x: x[1])
print("Best batch size here is:", best_bs, "with throughput:", round(best_thr, 2))



### **2.2. Efficient DataLoader Configuration**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_02.jpg?v=1769831178" width="250">



>* Configure DataLoader so GPU never waits
>* Tune workers, prefetching, and I/O for hardware

>* Choose worker count to avoid GPU stalling
>* Tune workers empirically; depends on hardware, dataset

>* Shuffle and batch data to reduce bottlenecks
>* Tune collation and prefetching to balance memory



In [None]:
#@title Python Code - Efficient DataLoader Configuration

# This script compares DataLoader configurations throughput.
# It shows workers and pin memory effects.
# It uses a tiny synthetic dataset.

# !pip install torch torchvision.

# Import required standard and torch modules.
import os
import time
import random

# Import torch core and utilities.
import torch
from torch import nn

# Import DataLoader and TensorDataset helpers.
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

# Seed Python and torch RNGs so the synthetic data repeats across runs.
random.seed(0)
torch.manual_seed(0)

# Pick the device in priority order: CUDA, then Apple MPS, then CPU.
# The getattr guard only calls into torch.backends.mps when that attribute
# exists.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Record the environment the timings were measured on.
print("torch version:", torch.__version__, "device:", device.type)

# Sizes of the synthetic dataset.
num_samples = 4096
num_features = 128

# Random float features of shape (num_samples, num_features).
features = torch.randn(num_samples, num_features)

# Random integer labels in [0, 10).
labels = torch.randint(low=0, high=10, size=(num_samples,))

# Pair features with labels so a DataLoader can batch them together.
dataset = TensorDataset(features, labels)

# A single linear layer: just enough compute to consume each batch.
model = nn.Linear(num_features, 10).to(device)

# Evaluation mode: the timing helper below only runs forward passes.
model.eval()

# Define a helper function to measure one epoch time.
def measure_epoch_time(loader, description):
    """Time one inference pass over ``loader`` and print seconds per batch.

    Uses the module-level ``model`` and ``device``.

    Args:
        loader: Iterable yielding ``(features, labels)`` batches.
        description: Label printed at the start of the summary line.
    """

    def _sync_device():
        """Block until queued accelerator work has finished."""
        # GPU work is queued asynchronously, so reading the wall clock without
        # synchronizing would time kernel launches rather than execution.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()

    # Drain earlier work so it is not billed to this measurement.
    _sync_device()

    # perf_counter is monotonic and better suited to short intervals.
    start_time = time.perf_counter()
    total_batches = 0

    # Disable gradients for pure inference timing.
    with torch.no_grad():
        for batch_features, _ in loader:
            total_batches += 1

            # Move the batch to the device; non_blocking lets the copy be
            # asynchronous when the loader provides pinned memory.
            batch_features = batch_features.to(device, non_blocking=True)

            # The forward pass result is not needed, only its cost.
            model(batch_features)

    # Wait for the last forward pass before stopping the clock.
    _sync_device()

    elapsed = time.perf_counter() - start_time
    # max(..., 1) keeps an empty loader from dividing by zero.
    avg_per_batch = elapsed / max(total_batches, 1)

    print(description, "batches:", total_batches, "sec_per_batch:", round(avg_per_batch, 5))

# Baseline: num_workers=0 and no pinned memory.
batch_size = 64
baseline_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
    persistent_workers=False,
)

# Same settings but with two worker processes and still no pinned memory.
workers_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=False,
    persistent_workers=False,
)

# Two workers plus pinned memory; pinning is only requested on CUDA.
pin_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=(device.type == "cuda"),
    persistent_workers=False,
)

# Warm up the model and device with two single-batch forward passes so
# one-time start-up cost is not charged to the first measured configuration.
for _ in range(2):
    for batch_features, batch_labels in baseline_loader:
        batch_features = batch_features.to(device, non_blocking=True)
        _ = model(batch_features)
        break

# Baseline measurement.
measure_epoch_time(baseline_loader, "baseline_workers_0_pin_False")

# Two workers, no pinned memory.
measure_epoch_time(workers_loader, "workers_2_pin_False")

# Two workers, pinned memory when on CUDA.
measure_epoch_time(pin_loader, "workers_2_pin_auto")




### **2.3. Pinned Memory Prefetching**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_02_03.jpg?v=1769831231" width="250">



>* Pinned memory speeds CPU‑to‑GPU data transfers
>* Overlapping transfer and compute keeps GPU busy

>* Slow CPU‑GPU transfers can bottleneck fast models
>* Pinned prefetching smooths GPU usage and shortens epochs

>* Pinned memory helps, but over-allocating it can hurt overall system performance
>* Start moderate, monitor usage, tune batches carefully



In [None]:
#@title Python Code - Pinned Memory Prefetching

# This script shows pinned memory prefetching basics.
# We compare DataLoader settings for GPU data throughput.
# Run on CPU safely when no GPU exists.

# !pip install torch torchvision.

# Import required standard and torch modules.
import os
import random
import time

# Import torch and torchvision utilities.
import torch
import torchvision
import torchvision.transforms as T

# Seed Python and torch RNGs so shuffling and initialisation repeat.
random.seed(0)
torch.manual_seed(0)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Record the environment the timings were measured on.
print("Torch version and device:", torch.__version__, device)

# The only transform applied is ToTensor.
transform = T.Compose([T.ToTensor()])

# Load the MNIST training split into ./data with download=True.
# The dataset is cut down to a small subset further below, not here.
train_dataset = torchvision.datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)

# Keep only the first 512 samples so each epoch is short.
subset_size = 512
indices = list(range(subset_size))
train_subset = torch.utils.data.Subset(train_dataset, indices)


# Factory for shuffled DataLoaders over the module-level `train_subset`.
def make_loader(pin_memory, num_workers, batch_size):
    """Build a shuffled DataLoader over ``train_subset``.

    Args:
        pin_memory: Forwarded to ``DataLoader(pin_memory=...)``.
        num_workers: Forwarded to ``DataLoader(num_workers=...)``.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        train_subset,
        shuffle=True,
        batch_size=batch_size,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )


# Flatten-plus-linear classifier: just enough compute to give the device work.
class TinyNet(torch.nn.Module):
    """Linear classifier over flattened 28x28 images."""

    def __init__(self):
        """Register the flatten and linear layers."""
        super().__init__()
        self.flatten = torch.nn.Flatten()
        self.fc = torch.nn.Linear(in_features=28 * 28, out_features=10)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        return self.fc(self.flatten(x))


# One model instance is shared by every timed epoch below.
model = TinyNet().to(device)

# Cross-entropy loss on logits, optimised with plain SGD.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


# Training loop helper measuring one epoch time.
def run_epoch(loader, description):
    """Train for one pass over ``loader`` and print the elapsed seconds.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches.
        description: Label printed at the start of the summary line.
    """
    on_cuda = device.type == "cuda"
    model.train()

    # CUDA kernels run asynchronously, so drain pending work before starting
    # the clock; otherwise earlier work is billed to this epoch.
    if on_cuda:
        torch.cuda.synchronize()

    # perf_counter is monotonic and better suited to short intervals.
    start = time.perf_counter()
    total_batches = 0

    for images, labels in loader:
        # Non-blocking copies are only requested on CUDA, as before.
        images = images.to(device, non_blocking=on_cuda)
        labels = labels.to(device, non_blocking=on_cuda)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        total_batches += 1

    # Wait for the final step to finish on the GPU before stopping the clock.
    if on_cuda:
        torch.cuda.synchronize()

    elapsed = time.perf_counter() - start
    print(description, "batches:", total_batches, "seconds:", round(elapsed, 3))


# Baseline loader: two workers, batches of 64, no pinned memory.
batch_size = 64
num_workers = 2
loader_no_pin = make_loader(pin_memory=False, num_workers=num_workers, batch_size=batch_size)

# Same settings with pinned memory requested.
# NOTE(review): pin_memory=True is passed even when `device` is the CPU;
# confirm whether it should be gated on device.type == "cuda".
loader_pin = make_loader(pin_memory=True, num_workers=num_workers, batch_size=batch_size)

# Untimed warm-up epoch using the non-pinned loader only, so cold-start cost
# is not charged to the first measured epoch.
run_epoch(loader_no_pin, "Warmup no pin")

# Measured epoch without pinned memory.
run_epoch(loader_no_pin, "No pinned memory")

# Measured epoch with pinned memory requested.
run_epoch(loader_pin, "With pinned memory")

# Closing hint for reading the two timings above.
print("Compare times to see data transfer impact.")



## **3. Precision Speed Tradeoffs**

### **3.1. Autocast for Mixed Precision**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_01.jpg?v=1769831288" width="250">



>* Autocast automatically chooses safe lower precision operations
>* Gives speed and memory gains without manual precision management

>* Autocast speeds models by using low precision
>* Actual gains vary, so always profile performance

>* Mixed precision can introduce numerical errors, instability
>* Balance speed, memory savings, and required numerical reliability



In [None]:
#@title Python Code - Autocast for Mixed Precision

# This script demonstrates PyTorch autocast mixed precision tradeoffs.
# It compares speed and loss for float32 versus mixed precision.
# Designed for quick safe execution inside Google Colab.

# !pip install torch torchvision.

# Import required standard and torch modules.
import time
import random
import torch

# Seed Python and torch RNGs so inputs and initial weights repeat.
random.seed(0)
torch.manual_seed(0)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Record the environment the comparison was run on.
print("PyTorch version:", torch.__version__, "Device:", device)

# Small conv + linear classifier shared by the float32 and autocast runs.
class TinyNet(torch.nn.Module):
    """Conv2d(1 -> 8, 3x3, padding 1) -> ReLU -> Linear(8*28*28 -> 10)."""

    def __init__(self):
        """Register the layers in a fixed order."""
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels=1, out_channels=8, kernel_size=3, padding=1
        )
        self.relu = torch.nn.ReLU()
        # padding=1 keeps the 28x28 spatial size, hence 8 * 28 * 28 features.
        self.fc = torch.nn.Linear(in_features=8 * 28 * 28, out_features=10)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        feature_maps = self.relu(self.conv(x))
        # Collapse every non-batch dimension into one feature vector per row.
        flat = feature_maps.view(feature_maps.size(0), -1)
        return self.fc(flat)

# One fixed batch, created directly on the target device, is reused for
# every training step in both runs.
batch_size = 64
input_shape = (batch_size, 1, 28, 28)
inputs = torch.randn(input_shape, device=device)
labels = torch.randint(0, 10, (batch_size,), device=device)

# Sanity-check the batch before entering the timed loops.
assert inputs.shape == input_shape
assert labels.shape[0] == batch_size

# Float32 baseline: model, SGD optimizer, and the loss shared by both runs.
model_fp32 = TinyNet().to(device)
optimizer_fp32 = torch.optim.SGD(model_fp32.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# CUDA kernels are launched asynchronously, so the wall clock must only be
# read after the device has finished its queued work.
def _sync_device():
    """Block until pending CUDA work is done; no-op on CPU."""
    if device.type == "cuda":
        torch.cuda.synchronize()


# Run a few float32 training steps and measure time.
steps = 20
_sync_device()
start_time = time.perf_counter()
for _ in range(steps):
    optimizer_fp32.zero_grad()
    outputs = model_fp32(inputs)
    loss = criterion(outputs, labels)
    loss.backward()

    optimizer_fp32.step()

_sync_device()
fp32_time = time.perf_counter() - start_time
fp32_loss = float(loss.detach().cpu())

# Instantiate model and optimizer for mixed precision run.
model_mp = TinyNet().to(device)
optimizer_mp = torch.optim.SGD(model_mp.parameters(), lr=0.01)

# Mixed precision is only enabled on CUDA; on CPU both autocast and the
# scaler are disabled, so this loop is then a second float32 run.
use_amp = device.type == "cuda"

# torch.amp.GradScaler / torch.autocast replace the deprecated
# torch.cuda.amp.GradScaler / torch.cuda.amp.autocast entry points.
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

# Run the same steps using autocast mixed precision.
_sync_device()
start_time = time.perf_counter()
for _ in range(steps):
    optimizer_mp.zero_grad()
    with torch.autocast(device_type=device.type, enabled=use_amp):
        outputs = model_mp(inputs)
        loss = criterion(outputs, labels)

    # Scale the loss before backward, then let the scaler drive the step.
    scaler.scale(loss).backward()
    scaler.step(optimizer_mp)
    scaler.update()

_sync_device()
mp_time = time.perf_counter() - start_time
mp_loss = float(loss.detach().cpu())

# Estimate parameter memory as element count times 4 bytes for BOTH models.
# NOTE(review): this assumes parameters stay float32 under autocast; confirm
# against the torch.amp documentation.
fp32_params = sum(p.numel() for p in model_fp32.parameters())
mp_params = sum(p.numel() for p in model_mp.parameters())
bytes_fp32 = fp32_params * 4
bytes_mp = mp_params * 4

# Side-by-side summary: wall time, final loss, parameter bytes, loss gap.
print("Float32 time seconds:", round(fp32_time, 4))
print("Mixed precision time seconds:", round(mp_time, 4))
print("Float32 final loss value:", round(fp32_loss, 4))
print("Mixed precision final loss value:", round(mp_loss, 4))
print("Parameter memory bytes both modes:", bytes_fp32, bytes_mp)
print("Loss difference absolute value:", round(abs(fp32_loss - mp_loss), 6))
print("Note mixed precision may trade tiny accuracy for speed.")




### **3.2. Memory footprint checks**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_02.jpg?v=1769831338" width="250">



>* Track GPU memory while chasing more speed
>* Treat memory as a budget, justify every increase

>* Transient activations often dominate GPU memory usage
>* Monitor peak memory to choose stable configurations

>* Mixed precision lowers memory, enabling larger models
>* Extra states add cost, so verify peak memory



In [None]:
#@title Python Code - Memory footprint checks

# This script shows simple memory footprint checks.
# We compare batch sizes and mixed precision memory usage.
# Focus on clear prints and tiny synthetic data.

# !pip install tensorflow==2.20.0

# Import required modules for TensorFlow and system inspection.
import os, random, psutil, numpy as np, tensorflow as tf

# Seed Python, NumPy and TensorFlow with the same value for repeatable runs.
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Record the TensorFlow version and the devices it can see.
print("TensorFlow version:", tf.__version__)
print("Physical devices:", tf.config.list_physical_devices())

# Handle to the current process, queried by report_memory below.
process = psutil.Process(os.getpid())

# Helper that prints the resident memory of this process.
def report_memory(label):
    """Print ``label`` and the process's current resident set size in MiB.

    Reads the module-level ``process`` handle. Only the value at call time
    is reported; no peak is tracked.
    """
    rss_mb = process.memory_info().rss / (1024 * 1024)
    print(label, "memory_mb:", round(rss_mb, 2))


# Synthetic dataset: 256 rows, 32 float32 features, 3 classes.
num_samples, num_features, num_classes = 256, 32, 3
x_data = np.random.randn(num_samples, num_features).astype("float32")
y_data = np.random.randint(0, num_classes, size=(num_samples,))

# Sanity-check shapes, then take the first memory reading.
assert x_data.shape == (num_samples, num_features)
assert y_data.shape == (num_samples,)
report_memory("After data creation")

# Reference model: 64 ReLU units followed by a softmax output layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(num_features,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation="softmax"),
])

# Compile with Adam and sparse categorical cross-entropy, then read memory.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
report_memory("After model creation")

# Function to run one short training and report memory usage.
def run_experiment(batch_size, use_mixed):
    # Train a fresh copy of `model` for one epoch under the requested
    # precision policy and print process memory before and after training.
    #
    # batch_size: number of samples per training step passed to fit().
    # use_mixed:  True selects the "mixed_float16" policy, False "float32".
    # Returns nothing; results are reported through report_memory() prints.
    policy_name = "mixed_float16" if use_mixed else "float32"

    # Remember the policy currently in force so it can be restored below;
    # otherwise the last experiment's policy leaks into all later code.
    previous_policy = tf.keras.mixed_precision.global_policy()
    tf.keras.mixed_precision.set_global_policy(policy_name)

    def _rebuild_layer(layer):
        # A layer's config records the dtype policy it was created with, so
        # a plain clone would keep the original float32 policy. Dropping the
        # "dtype" entry lets the new layer adopt the global policy set above.
        config = layer.get_config()
        config.pop("dtype", None)
        return layer.__class__.from_config(config)

    try:
        temp_model = tf.keras.models.clone_model(
            model, clone_function=_rebuild_layer
        )

        temp_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

        report_memory(
            f"Before training batch={batch_size} policy={policy_name}"
        )
        temp_model.fit(
            x_data,
            y_data,
            epochs=1,
            batch_size=batch_size,
            verbose=0,
        )

        report_memory(
            f"After training batch={batch_size} policy={policy_name}"
        )
    finally:
        # Put the global policy back even if training raised.
        tf.keras.mixed_precision.set_global_policy(previous_policy)


# Sweep both precision policies; within each, try a small and a larger batch.
for mixed_flag in (False, True):
    for batch in (16, 64):
        run_experiment(batch_size=batch, use_mixed=mixed_flag)




### **3.3. Stability considerations**

<img src="https://cdn.jsdelivr.net/gh/mhrafiei/contents@main/LFF/Master PyTorch 2.10.0/Module_07/Lecture_B/image_03_03.jpg?v=1769831392" width="250">



>* Speed optimizations and lower precision can harm stability
>* Watch for NaNs, exploding losses, and unreliable predictions

>* Watch precision-sensitive layers like softmax and normalization
>* Profile losses and metrics; keep fragile ops high-precision

>* Use experiments and stress tests to compare precisions
>* Prioritize stability alongside speed, especially in safety-critical tasks



In [None]:
#@title Python Code - Stability considerations

# This script shows precision stability considerations simply.
# We compare float32 and mixed_float16 training stability.
# Watch losses and NaN checks for both precisions.

# !pip install tensorflow==2.20.0

# Import required modules and set deterministic seeds.
import os, random, numpy as np, tensorflow as tf

# Report the TensorFlow build so the run can be reproduced later.
print("TensorFlow version:", tf.__version__)

# Seed the Python, NumPy and TensorFlow generators identically.
seed_value = 7
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Prefer the first GPU when one is visible; otherwise fall back to the CPU.
physical_gpus = tf.config.list_physical_devices("GPU")
device_name = "/GPU:0" if physical_gpus else "/CPU:0"

print("Using device:", device_name)

# Fetch the MNIST training split; the test split is discarded.
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()

# Scale pixels to [0, 1] and append a trailing channel axis.
x_train = x_train.astype("float32") / 255.0
x_train = x_train[..., np.newaxis]

# Keep only the first few hundred samples so the demo runs quickly.
small_size = 512
x_small, y_small = x_train[:small_size], y_train[:small_size]

# Print the subset shapes as a quick sanity check.
print("Subset shape:", x_small.shape, y_small.shape)

# Build a simple CNN model factory with configurable dtype.
def build_model(dtype):
    # Build a small CNN classifier for 28x28x1 images with 10 classes.
    #
    # dtype: dtype declared for the model's input tensor, e.g. "float32" or
    #        "float16". Layer compute dtypes follow the Keras dtype policy
    #        that is active when this function is called.
    # Returns an uncompiled tf.keras.Model that outputs class probabilities.
    inputs = tf.keras.Input(shape=(28, 28, 1), dtype=dtype)
    x = tf.keras.layers.Conv2D(8, (3, 3), activation="relu")(inputs)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    logits = tf.keras.layers.Dense(10)(x)
    # Softmax is precision sensitive, so it is pinned to float32 even when
    # the rest of the network computes in float16. Under a float32 policy
    # this is identical to a Dense layer with activation="softmax".
    outputs = tf.keras.layers.Activation("softmax", dtype="float32")(logits)
    return tf.keras.Model(inputs, outputs)

# Prepare a shuffled, batched dataset from the small subset.
batch_size = 64
train_ds = tf.data.Dataset.from_tensor_slices((x_small, y_small))
train_ds = train_ds.shuffle(buffer_size=small_size, seed=seed_value)
train_ds = train_ds.batch(batch_size)

# Pin the global policy to float32 before building the baseline, so the
# baseline does not silently inherit a mixed policy set earlier in the session.
tf.keras.mixed_precision.set_global_policy("float32")
model_fp32 = build_model("float32")
optimizer_fp32 = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

# Build the mixed precision model while the mixed_float16 policy is active.
policy = tf.keras.mixed_precision.Policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy(policy)
model_fp16 = build_model("float16")
optimizer_fp16 = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=1e-3)
)

# Layers keep the policy they were created with, so the global policy can be
# reset now; this stops mixed precision leaking into code that runs later.
tf.keras.mixed_precision.set_global_policy("float32")

# Function to run one short training epoch and track stability.
def run_epoch(model, optimizer, dataset, use_mixed):
    # Run one pass over `dataset` with a manual training loop and collect
    # simple stability statistics.
    #
    # model:     Keras model called as model(images, training=True).
    # optimizer: optimizer whose apply_gradients() updates the model; for
    #            mixed precision this is expected to be a loss-scaling
    #            optimizer.
    # dataset:   iterable yielding (images, labels) batches.
    # use_mixed: True casts inputs to float16 and applies loss scaling,
    #            False trains in plain float32.
    # Returns (losses, nan_count): the unscaled loss of every step as Python
    # floats, and the number of steps whose gradients contained a NaN.
    losses = []
    nan_count = 0
    input_dtype = tf.float16 if use_mixed else tf.float32

    # The loss-scaling API differs between Keras generations. Newer
    # optimizers expose scale_loss() and unscale inside apply_gradients();
    # older LossScaleOptimizer uses get_scaled_loss() and
    # get_unscaled_gradients(). Applying an unscaled loss to an optimizer
    # that unscales on its own would shrink every update by the loss scale.
    new_style_scaling = use_mixed and hasattr(optimizer, "scale_loss")
    legacy_scaling = (
        use_mixed
        and not new_style_scaling
        and hasattr(optimizer, "get_scaled_loss")
    )

    for images, labels in dataset:
        with tf.device(device_name):
            with tf.GradientTape() as tape:
                images_cast = tf.cast(images, input_dtype)
                predictions = model(images_cast, training=True)
                loss = loss_fn(labels, predictions)

                # Scaling must happen under the tape so it is differentiated.
                if new_style_scaling:
                    scaled_loss = optimizer.scale_loss(loss)
                elif legacy_scaling:
                    scaled_loss = optimizer.get_scaled_loss(loss)
                else:
                    scaled_loss = loss

        grads = tape.gradient(scaled_loss, model.trainable_variables)
        if legacy_scaling:
            # The legacy optimizer expects gradients that are already unscaled.
            grads = optimizer.get_unscaled_gradients(grads)

        # Count the step once if any gradient tensor contains a NaN.
        if any(
            bool(tf.reduce_any(tf.math.is_nan(g)))
            for g in grads
            if g is not None
        ):
            nan_count += 1

        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Record the unscaled loss so both precisions are comparable.
        losses.append(float(loss.numpy()))

    return losses, nan_count

# Train the float32 baseline for a single epoch.
losses32, nans32 = run_epoch(
    model=model_fp32,
    optimizer=optimizer_fp32,
    dataset=train_ds,
    use_mixed=False,
)

# Train the mixed precision model for a single epoch.
losses16, nans16 = run_epoch(
    model=model_fp16,
    optimizer=optimizer_fp16,
    dataset=train_ds,
    use_mixed=True,
)

# Summarize mean loss and NaN counts first, then the loss spread.
print("float32 loss mean:", np.mean(losses32), "NaN gradients:", nans32)
print("float16 loss mean:", np.mean(losses16), "NaN gradients:", nans16)
print("float32 loss std:", np.std(losses32))
print("float16 loss std:", np.std(losses16))

# Closing remark for the reader.
print("Notice how mixed precision can change loss behavior and gradient stability.")



# <font color="#418FDE" size="6.5" uppercase>**Profiling and Tuning**</font>


In this lecture, you learned to:
- Profile PyTorch models using torch.profiler to identify time‑consuming operations. 
- Apply basic performance optimizations such as adjusting batch size, using pin_memory, and enabling mixed precision. 
- Evaluate the trade‑offs between speed, memory usage, and numerical stability when tuning models. 

In the next Module (Module 8), we will go over 'Distributed Training'.