In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import psutil

# Create a simple CNN architecture
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Generate a random dataset
num_samples = 6400
num_features = 1
height = 28
width = 28
images = torch.randn(num_samples, num_features, height, width)
labels = torch.randint(0, 2, (num_samples,))

def train_without_gradient_accumulation():
    # Initialize the model, loss function, and optimizer
    model = SimpleCNN()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Training parameters
    num_epochs = 10
    batch_size = 64

    # Training loop without gradient accumulation
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i in range(0, num_samples, batch_size):
            inputs = images[i:i+batch_size]
            target = labels[i:i+batch_size]

            # Forward pass
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, target)

            # Backward pass and update
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Print the average loss for this epoch
        print(f'Epoch {epoch + 1}, Loss: {running_loss / (num_samples // batch_size)}')

def train_with_gradient_accumulation():
    # Initialize the model, loss function, and optimizer
    model = SimpleCNN()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Training parameters
    num_epochs = 10
    desired_batch_size = 256
    sub_batch_size = 64
    accumulation_steps = desired_batch_size // sub_batch_size

    # Training loop with gradient accumulation
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i in range(0, num_samples, desired_batch_size):
            optimizer.zero_grad()

            for j in range(i, i + desired_batch_size, sub_batch_size):
                inputs = images[j:j+sub_batch_size]
                target = labels[j:j+sub_batch_size]

                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, target)

                # Backward pass with gradient accumulation
                loss.backward()
                running_loss += loss.item()

            # Update the model parameters after accumulating gradients
            optimizer.step()

        # Print the average loss for this epoch
        print(f'Epoch {epoch + 1}, Loss: {running_loss / (num_samples // desired_batch_size)}')


# Function to calculate the average CPU usage during training
def measure_cpu_usage(func):
    def wrapper(*args, **kwargs):
        cpu_percentages = []
        start_time = time.time()
        for _ in range(10):  # Measure CPU usage 10 times per second
            cpu_percentages.append(psutil.cpu_percent())
            time.sleep(0.1)
        avg_cpu_usage = sum(cpu_percentages) / len(cpu_percentages)
        print(f'Average CPU usage during training: {avg_cpu_usage}%')
        func(*args, **kwargs)
        end_time = time.time()
        print(f'Training time: {end_time - start_time} seconds')
    return wrapper

In [2]:
# Wrap the training functions with the measure_cpu_usage decorator
train_without_gradient_accumulation = measure_cpu_usage(train_without_gradient_accumulation)
train_with_gradient_accumulation = measure_cpu_usage(train_with_gradient_accumulation)

# Run the training loops
print('Training without gradient accumulation:')
train_without_gradient_accumulation()
print('\nTraining with gradient accumulation:')
train_with_gradient_accumulation()

Training without gradient accumulation:
Average CPU usage during training: 25.82%
Epoch 1, Loss: 0.6937218672037124
Epoch 2, Loss: 0.6932226049900055
Epoch 3, Loss: 0.693155956864357
Epoch 4, Loss: 0.6931175756454467
Epoch 5, Loss: 0.693082891702652
Epoch 6, Loss: 0.6930537974834442
Epoch 7, Loss: 0.6930223977565766
Epoch 8, Loss: 0.6929922187328339
Epoch 9, Loss: 0.6929661071300507
Epoch 10, Loss: 0.6929388135671616
Training time: 8.093447923660278 seconds

Training with gradient accumulation:
Average CPU usage during training: 38.9%
Epoch 1, Loss: 2.7724686956405638
Epoch 2, Loss: 2.7722557759284974
Epoch 3, Loss: 2.772132396697998
Epoch 4, Loss: 2.7720070576667784
Epoch 5, Loss: 2.7718866276741028
Epoch 6, Loss: 2.7717677211761473
Epoch 7, Loss: 2.771643841266632
Epoch 8, Loss: 2.7715102553367617
Epoch 9, Loss: 2.7713914942741393
Epoch 10, Loss: 2.771271359920502
Training time: 8.04003095626831 seconds
