### Section 8.7 Batching

In this code, we demonstrate how to utilize the NeighborLoader class for efficient
batch processing of graph data. Additionally, we will track the system's performance
including memory usage and time efficiency using libraries like pynvml, thop, and time.

This code is organized into four major parts:


1.   Install and import the required packages and datasets
2.   Data preparation and loading
3.   Model setup
4.   Model training with performance tracking

-------------------------------------------------------------------

# Part I. Install Packages, Load Data and Import Packages


In [None]:
# Print the CUDA version PyTorch was built with; it determines the "+cuXXX"
# suffix of the PyG wheel index URL used in the install cell below.
!python -c "import torch; print(torch.version.cuda)"

In [None]:
# Print the installed PyTorch version; it determines the "torch-X.Y.Z" part
# of the PyG wheel index URL used in the install cell below.
!python -c "import torch; print(torch.__version__)"

In [None]:
# Install OGB and the PyG companion wheels, then PyG itself.
# The wheel index URL after -f must match the PyTorch and CUDA versions printed
# by the two cells above (as written it targets torch 2.0.1 with CUDA 11.8);
# edit the "torch-2.0.1+cu118" part if your environment differs.
# %%capture
!pip install ogb pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.0.1+cu118.html
!pip install torch-geometric

In [None]:
# Monitoring / profiling utilities for the performance tracking in Part IV:
#   - nvidia-ml-py3 provides the `pynvml` module imported below
#   - thop provides `profile` and `clever_format`, imported below
#   - gputil is installed but not imported in the cells shown here
!pip install gputil
!pip install nvidia-ml-py3
!pip install thop

In [None]:
# Standard library imports
import collections
import gc
import logging
import os
import time

# Third-party library imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import psutil
import pynvml
from thop import clever_format, profile

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from scipy.special import softmax
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.loader import NeighborLoader, NeighborSampler
from torch_geometric.nn import GCNConv, SAGEConv
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os.path as osp



# Part II. Data Preparation and Loading

This section prepares and processes the dataset. It loads the OGB Amazon products dataset (`ogbn-products`), creates training, validation, and test masks, and sets up NeighborLoaders that load batches of graph data during training and evaluation.


In [None]:
# Build the path "<parent of the current working directory>/data/products".
# NOTE(review): `root` is not passed to PygNodePropPredDataset in the cells
# shown here, so the OGB dataset is presumably stored in the library's default
# location instead -- confirm whether `root=root` was intended.
root = osp.join(osp.dirname(osp.realpath('./')), 'data', 'products')
# Alternative (disabled): load the dataset with a sparse-tensor adjacency.
# dataset_sparse = PygNodePropPredDataset( name='ogbn-products', transform=T.ToSparseTensor())

In [None]:
# Load the ogbn-products node-property-prediction dataset
# (no transform is applied, so edges stay in `edge_index` form).
dataset_dense = PygNodePropPredDataset( name='ogbn-products')
# Last expression of the cell: displays the number of target classes.
dataset_dense.num_classes

In this code snippet:

1.  The dataset is split into training, testing, and validation subsets.
2.  Boolean masks are created for each subset, which allows for easy and efficient access to these subsets during the model training and evaluation stages.
3.  These masks are then assigned to the data_dense object, enhancing the ease of accessing and manipulating the various data subsets.

In [None]:
# The dataset holds a single graph; take it as the working data object
data_dense = dataset_dense[0]

# Official train/valid/test split, given as tensors of node indices
split_idx = dataset_dense.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]


def _mask_from_index(index):
    """Return a boolean mask over all nodes that is True at the given node indices."""
    mask = torch.zeros(data_dense.num_nodes, dtype=torch.bool)
    mask[index] = True
    return mask


# One boolean mask per split
train_mask = _mask_from_index(train_idx)
test_mask = _mask_from_index(test_idx)
valid_mask = _mask_from_index(valid_idx)

# Attach the masks to the data object so each subset can be selected directly
data_dense.train_mask = train_mask
data_dense.test_mask = test_mask
data_dense.valid_mask = valid_mask


In [None]:
# Mini-batch loaders built on neighbor sampling.
# `num_neighbors` lists the per-hop sampling fan-out (-1 = keep every neighbor);
# its length is the number of hops sampled around each seed node.

# Training: 3 hops (20/15/10 neighbors), shuffled, 6400 seed nodes per batch.
loader = NeighborLoader(
    data_dense,
    num_neighbors=[20, 15, 10],
    batch_size=1280 * 5,
    input_nodes=train_idx,
    shuffle=True,
    num_workers=6,
)

# Validation: 1 hop with all neighbors, 256 seed nodes per batch.
# NOTE(review): the model defined below has three conv layers but only one hop
# is sampled here -- confirm this is intended.
valid_loader = NeighborLoader(
    data_dense,
    num_neighbors=[-1],
    batch_size=256,
    input_nodes=valid_idx,
    num_workers=6,
)

# Test: 2 hops with up to 50 neighbors each, 256 seed nodes per batch.
test_loader = NeighborLoader(
    data_dense,
    num_neighbors=[50] * 2,
    batch_size=256,
    input_nodes=test_idx,
    num_workers=6,
)



In [None]:
# OGB evaluator matching the ogbn-products dataset
evaluator = Evaluator(name='ogbn-products')

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Report the selected device ('cuda' is printed when a GPU is used)
print(f'Device: {device}')

# Part III. Model Setup

In this part, the GraphSAGE model is defined. It's a PyTorch module consisting of several GraphSAGE convolutional layers followed by activation and dropout operations. The model is meant to be trained on the node property prediction task.

In [None]:


# Define the GraphSAGE model
class GraphSAGE_dense(torch.nn.Module):
    """
    Three-layer GraphSAGE network for node classification.

    Layout: SAGEConv -> ELU -> dropout -> SAGEConv -> ELU -> dropout -> SAGEConv.
    The last layer returns raw (un-normalized) class scores.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, dropout=0.2):
        """
        Initialize the GraphSAGE_dense model.

        Parameters:
        - in_dim (int): The size of the input feature dimension.
        - hidden_dim (int): The size of the hidden layer dimension.
        - out_dim (int): The size of the output layer dimension.
        - dropout (float): The dropout rate for regularization, default is 0.2.
        """
        super().__init__()

        # Dropout probability applied after each hidden layer (training only)
        self.dropout = dropout

        # Three stacked GraphSAGE convolutions: in -> hidden -> hidden -> out
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.conv3 = SAGEConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        """
        Forward propagation of the model.

        Parameters:
        - x (torch.Tensor): The input features of nodes.
        - edge_index (torch.Tensor): The edge indices.

        Returns:
        - x (torch.Tensor): The output features (class scores) of nodes.
        """
        # F.dropout defaults to training=True, so it must be told the module's
        # mode explicitly; otherwise dropout stays active after model.eval()
        # and randomly zeroes activations during validation/testing.
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # Output layer: no activation, the loss function expects raw scores
        x = self.conv3(x, edge_index)

        return x



# Part IV. Model Training

In this part, we will:


1.   Train GraphSAGE
2.   Train GCN


In [None]:
%%time
# Apply Learning Rate scheduling

import torch.optim.lr_scheduler as lr_scheduler ## Section 8.5.2
from torch.optim.lr_scheduler import ReduceLROnPlateau


This block focuses on the training of the GraphSAGE model. It includes the implementation of the training function, which also tracks GPU memory usage at different stages of the training process to monitor the model's performance and resource utilization. The section also involves the testing function to evaluate the model on the validation set.


In [None]:
# Hyperparameters: learning rate, number of epochs, hidden layer width
lr = 0.001
epochs = 2
hidden_dim = 90

# OGB evaluator for the 'ogbn-products' dataset
evaluator = Evaluator(name='ogbn-products')

# Build the GraphSAGE model sized to the dataset, then move it to the target device
model_base_dense = GraphSAGE_dense(
    in_dim=data_dense.num_node_features,
    hidden_dim=hidden_dim,
    out_dim=dataset_dense.num_classes,
)
model_base_dense = model_base_dense.to(device)

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(model_base_dense.parameters(), lr=lr)

# Scheduler that multiplies the learning rate by `factor` once the monitored
# quantity (mode='min': lower is better) has not improved for `patience` steps
scheduler = lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
)


In [None]:
import pynvml
import torch
import torch.nn.functional as F
from thop import profile
from sklearn.metrics import confusion_matrix, classification_report

# Initialize the NVML library once, before the NVML device queries made in
# get_gpu_memory_usage() below.
# NOTE(review): no matching pynvml.nvmlShutdown() appears in the cells shown here.
pynvml.nvmlInit()

# Function to get the current GPU memory usage
def get_gpu_memory_usage(device_index=0):
    """
    Get the current GPU memory usage.

    This function queries NVML for the memory currently in use on one GPU.
    The figure is reported by the driver for the device as a whole, so it is
    not limited to tensors allocated by this notebook.

    Parameters:
    - device_index (int): NVML index of the GPU to query, default is 0
      (the first GPU). NOTE(review): NVML numbering may differ from the CUDA
      device numbering when CUDA_VISIBLE_DEVICES is set -- confirm on
      multi-GPU machines.

    Returns:
    float: The current GPU memory usage in MB.
    """
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return info.used / (1024 * 1024)  # Convert bytes to MB

# Training function with memory tracking and per-batch timing
def train_dense(model, batch, optimizer):
    """
    Run one optimization step on a single mini-batch while tracking GPU memory and time.

    The loss and the accuracy count are computed only on the seed nodes of the
    mini-batch (the first `batch.batch_size` rows of a NeighborLoader batch).
    The remaining rows are sampled neighbors that only provide context for
    message passing; they may belong to the validation/test split, so their
    labels must not contribute to the training loss. If the batch carries no
    `batch_size` attribute, all nodes are used.

    Parameters:
    - model (torch.nn.Module): The model to be trained.
    - batch: The mini-batch graph object (needs `x`, `edge_index`, `y`).
    - optimizer (torch.optim.Optimizer): The optimizer for updating model parameters.

    Returns:
    tuple: (loss value, number of correctly classified seed nodes, dict of GPU
           memory usage in MB after each step, max memory usage, name of the
           step with the max memory usage, wall-clock seconds spent on this batch).
    """
    model.train()

    # Run on whatever device the model lives on instead of hard-coding 'cuda'
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    on_cuda = device.type == 'cuda'
    batch = batch.to(device)

    memory_tracking = {}  # Memory usage (MB) recorded after each step
    max_memory = 0        # Largest recorded usage
    max_memory_step = ""  # Step at which the largest usage was first recorded

    def _record(step):
        """Store the current GPU memory usage under `step` and update the running max."""
        nonlocal max_memory, max_memory_step
        used = get_gpu_memory_usage()
        memory_tracking[step] = used
        if used > max_memory:
            max_memory, max_memory_step = used, step

    # CUDA kernels run asynchronously; synchronize so the timer brackets the
    # actual GPU work rather than just the kernel launches.
    if on_cuda:
        torch.cuda.synchronize(device)
    start_time = time.time()

    optimizer.zero_grad()
    _record('after optimizer.zero_grad()')

    out = model(batch.x, batch.edge_index)
    _record('after model forward pass')

    # Restrict supervision to the seed nodes of this mini-batch
    num_seed = getattr(batch, 'batch_size', None)
    if num_seed is None:
        num_seed = out.size(0)
    seed_out = out[:num_seed]
    seed_y = batch.y[:num_seed].reshape(-1).long()

    loss = torch.nn.functional.cross_entropy(seed_out, seed_y)
    _record('after loss calculation')

    loss.backward()
    _record('after loss.backward()')

    optimizer.step()
    _record('after optimizer.step()')

    if on_cuda:
        torch.cuda.synchronize(device)
    epoch_time = time.time() - start_time

    correct = (seed_out.argmax(dim=1) == seed_y).sum().item()

    # Return the collected data, including memory usage
    return loss.item(), correct, memory_tracking, max_memory, max_memory_step, epoch_time


# Testing function with memory tracking
@torch.no_grad()
def test_dense(model, batch):
    """
    Evaluate a model on a single mini-batch while tracking GPU memory usage.

    Predictions and labels are returned only for the seed nodes of the
    mini-batch (the first `batch.batch_size` rows of a NeighborLoader batch).
    The remaining rows are sampled neighbors, which can belong to other splits
    and can appear in several batches, so including them would distort the
    evaluation. If the batch carries no `batch_size` attribute, all nodes are used.

    Parameters:
    - model (torch.nn.Module): The model to be tested.
    - batch: The mini-batch graph object (needs `x`, `edge_index`, `y`).

    Returns:
    tuple: (predicted classes on CPU, true labels on CPU with the same layout
           as `batch.y`, dict of GPU memory usage in MB after each step, max
           memory usage, name of the step with the max memory usage).
    """
    model.eval()

    # Run on whatever device the model lives on instead of hard-coding 'cuda'
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    on_cuda = device.type == 'cuda'
    batch = batch.to(device)

    memory_tracking = {}
    max_memory = 0
    max_memory_step = ""

    def _record(step):
        """Wait for pending GPU work, then store the memory usage under `step`."""
        nonlocal max_memory, max_memory_step
        if on_cuda:
            torch.cuda.synchronize(device)  # Ensure all CUDA operations are completed
        used = get_gpu_memory_usage()
        memory_tracking[step] = used
        if used > max_memory:
            max_memory, max_memory_step = used, step

    if on_cuda:
        torch.cuda.synchronize(device)  # Ensure all CUDA operations are completed
    out = model(batch.x, batch.edge_index)
    _record('after model forward pass')

    # Keep only the seed nodes of this mini-batch
    num_seed = getattr(batch, 'batch_size', None)
    if num_seed is None:
        num_seed = out.size(0)

    pred = out[:num_seed].argmax(dim=1)
    _record('after prediction')

    return pred.cpu(), batch.y[:num_seed].cpu(), memory_tracking, max_memory, max_memory_step



1. The code below is a training loop that trains the GraphSAGE model for a total of 100 epochs.

2.  For each epoch:

-    It trains the model on batches of data obtained from a data loader (loader).
-    Tracks GPU memory usage and epoch time during training.
-    Computes the average and maximum training memory usage.
-    Computes the total time taken for each epoch.
-    Collects loss values for each batch.
-    Computes the average loss over all batches.
3.  After training in each epoch:

-    It evaluates the model on a validation set (valid_loader), tracking the GPU memory usage.
-    Computes the average and maximum memory usage during testing.
-    Collects all predictions and true labels.

4.  It then computes and prints the confusion matrix and classification report for the validation set, offering insights into the model's performance.

5. Finally, it prints a summary including:

-    The total time taken for the epoch.
-    The average and maximum memory usage during training and testing.
-    The average loss during training.

6.  This loop repeats for each epoch, providing detailed insights into the training process, memory consumption, and model performance on the validation set.

In [None]:

# Epoch loop: train on every training batch, then evaluate on the validation set
epochs = 100
for epoch in range(1, epochs + 1):
    total_epoch_time = 0
    train_memory = []       # Per-batch average memory usage (MB) during training
    max_train_memory = 0
    loss_values = []

    # ---- Training ----
    for batch in loader:
        loss, _, memory_tracking, max_memory, _, epoch_time = train_dense(model_base_dense, batch, optimizer)
        train_memory.append(sum(memory_tracking.values()) / len(memory_tracking))
        # train_dense already reports the largest reading of the batch
        max_train_memory = max(max_train_memory, max_memory)
        total_epoch_time += epoch_time
        loss_values.append(loss)

    avg_train_memory = sum(train_memory) / len(train_memory) if train_memory else 0
    avg_loss = sum(loss_values) / len(loss_values) if loss_values else 0

    # ReduceLROnPlateau only lowers the learning rate when it is stepped with
    # the monitored metric; the scheduler was created with mode='min', so feed
    # it the average training loss of this epoch.
    if loss_values:
        scheduler.step(avg_loss)

    # ---- Validation ----
    test_memory = []        # Per-batch average memory usage (MB) during evaluation
    max_test_memory = 0

    all_preds = []
    all_labels = []

    for batch in valid_loader:
        preds, labels, memory_tracking, max_memory, _ = test_dense(model_base_dense, batch)
        test_memory.append(sum(memory_tracking.values()) / len(memory_tracking))
        max_test_memory = max(max_test_memory, max_memory)
        all_preds.append(preds)
        all_labels.append(labels)

    avg_test_memory = sum(test_memory) / len(test_memory) if test_memory else 0

    # torch.cat raises on an empty list, so only build the report when at
    # least one validation batch was processed.
    if all_preds:
        all_preds = torch.cat(all_preds)
        # Labels may come back as a column of shape [N, 1]; flatten to [N]
        all_labels = torch.cat(all_labels).reshape(-1)

        # Compute the confusion matrix and classification report
        cm = confusion_matrix(all_labels, all_preds)
        cr = classification_report(all_labels, all_preds)

        print('Confusion Matrix:\n', cm)
        print('Classification Report:\n', cr)

    # ---- Epoch summary ----
    print(f"Epoch {epoch} Summary:")
    print(f"Total Epoch Time: {total_epoch_time:.2f} seconds")
    print(f"Avg Train Memory Usage: {avg_train_memory:.2f} MB")
    print(f"Max Train Memory Usage: {max_train_memory:.2f} MB")
    print(f"Avg Test Memory Usage: {avg_test_memory:.2f} MB")
    print(f"Max Test Memory Usage: {max_test_memory:.2f} MB")
    print(f"Avg Loss: {avg_loss:.4f}")
    print(f"{'='*50}")
