## Toy model to evaluate efficiencies of multi-GPU methods
23 Jan. 2025

---

#### Toy Model (ReLU)...

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp
import torch.distributed as dist
import time

from core.utils import set_randomseed
set_randomseed()


# Define a simple toy model
class ToyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ToyModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Check if GPUs are available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


#### 1. Train vanilla...


In [17]:
def train_vanilla(dataloader, main_device, epochs, input_size, output_size, hidden_size, lr):
    print("\n=== Single-GPU Training ===")
    model = ToyModel(input_size, hidden_size, output_size).to(main_device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    t0 = time.time()
    t_epoch = 0.0  # avg time per epoch
    t_back = 0.0   # avg time per backward

    for epoch in range(epochs):
        total_loss = 0.0
        t_ep0 = time.time()

        for inputs, labels in dataloader:
            # Move data to main device
            inputs, labels = inputs.to(main_device), labels.to(main_device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            t_back0 = time.time()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t_back1 = time.time()

            total_loss += loss.item()
            t_back += t_back1 - t_back0
        
        t_ep1 = time.time()
        t_epoch += t_ep1 - t_ep0
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")
    
    # Total runtime
    t1 = time.time()
    t_total = t1 - t0

    # Average time per epoch
    t_epoch /= epochs

    # Average time per backward
    t_back /= epochs * len(dataloader)

    print(f"------ \n Training: {t_total:.4f} seconds.")
    print(f"------ \n Average time per epoch: {t_epoch:.4f} seconds.")
    print(f"------ \n Average time per backward: {t_back:.4f} seconds.")
    print("====================================\n")

#### 2. Train with DataParallel...

- Splits the input (*batch*) across multiple GPUs
    - Module is replicated on each device to handle a piece of the input
- Executes forward/backward pass on each GPU independently
- Aggregate grads on primary GPU before updating params

In [18]:
def train_dp(dataloader, main_device, device_ids, epochs, input_size, output_size, hidden_size, lr):
    print("\n=== DataParallel Training ===")
    model = ToyModel(input_size, hidden_size, output_size).to(main_device)

    # Wrap the model with DataParallel
    if torch.cuda.device_count() > 1:
        print(f"Using {len(device_ids)} GPUs with DataParallel.")
        model = nn.DataParallel(
            model,
            output_device=main_device,
            device_ids=device_ids
        )
    else:
        raise ValueError("DataParallel requires more than 1 GPU.")
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    t0 = time.time()
    t_epoch = 0.0  # avg time per epoch
    t_back = 0.0   # avg time per backward

    for epoch in range(epochs):
        total_loss = 0.0
        t_ep0 = time.time()

        for inputs, labels in dataloader:
            # Move data to main device
            inputs, labels = inputs.to(main_device), labels.to(main_device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            t_back0 = time.time()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t_back1 = time.time()

            total_loss += loss.item()
            t_back += t_back1 - t_back0
        
        t_ep1 = time.time()
        t_epoch += t_ep1 - t_ep0
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")
    
    # Total runtime
    t1 = time.time()
    t_total = t1 - t0

    # Average time per epoch
    t_epoch /= epochs

    # Average time per backward
    t_back /= epochs * len(dataloader)

    print(f"------ \n Training: {t_total:.4f} seconds.")
    print(f"------ \n Average time per epoch: {t_epoch:.4f} seconds.")
    print(f"------ \n Average time per backward: {t_back:.4f} seconds.")
    print("====================================\n")

### 3. Train with DistributedDataParallel...

In [35]:
! python train_ddp.py


=== DistributedDataParallel Training ===
W0125 13:25:25.826000 3572048 torch/multiprocessing/spawn.py:160] Terminating process 3572223 via signal SIGTERM
Traceback (most recent call last):
  File "/projects/mhpi/lglonz/project_silmaril/generic_deltaModel/deltaModel/train_ddp.py", line 172, in <module>
    mp.spawn(
  File "/projects/mhpi/lglonz/project_silmaril/generic_deltaModel/venv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 328, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method="spawn")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/projects/mhpi/lglonz/project_silmaril/generic_deltaModel/venv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 284, in start_processes
    while not context.join():
              ^^^^^^^^^^^^^^
  File "/projects/mhpi/lglonz/project_silmaril/generic_deltaModel/venv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 203, in j

### Compare training variations

---

#### Prepare synthetic data and hyperparams

In [45]:
torch.cuda.empty_cache()


# Dataset parameters
input_size = 1000
output_size = 2
num_samples = 10000
batch_size = 10000


# Hyperparameters
device_ids = [6,7]
main_device = 6
epochs = 2
hidden_size = 256
lr = 0.001


### Synthetic dataset
# Random input features and labels
x = torch.rand(num_samples, input_size)
y = torch.randint(0, output_size, (num_samples,))

# Create DataLoader
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


#### Run training variants...

In [46]:
train_vanilla(dataloader, main_device, epochs, input_size, output_size, hidden_size, lr)

train_dp(dataloader, main_device, device_ids, epochs, input_size, output_size, hidden_size, lr)


=== Single-GPU Training ===
Epoch [1/2], Loss: 0.6999
Epoch [2/2], Loss: 1.3098
------ 
 Training: 0.2042 seconds.
------ 
 Average time per epoch: 0.1020 seconds.
------ 
 Average time per backward: 0.0014 seconds.


=== DataParallel Training ===
Using 2 GPUs with DataParallel.
Epoch [1/2], Loss: 0.6996
Epoch [2/2], Loss: 1.3484
------ 
 Training: 0.2063 seconds.
------ 
 Average time per epoch: 0.1031 seconds.
------ 
 Average time per backward: 0.0031 seconds.

