# DDP with Model Parallelism 

In [5]:
%%writefile main.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import numpy as np


import copy
import random
import time
import os
import json

from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

# Global experiment configuration.
SEED = 1234  # seed applied to every random number generator seeded below
ROOT = "."  # base directory for the dataset, the checkpoints and the results file
MODEL_NAME = "MLP"  # model tag embedded in the results file name and scenario keys
SENARIO = "2GPU"  # scenario tag embedded in the results file name
EPOCHS = 10  # default number of training epochs
BATCH_SIZE = 512  # default batch size used by create_dataloader

outdir = "./my_datasets"  # NOTE(review): MNIST is loaded through torchvision below, so this Hugging Face cache looks unused -- confirm
os.makedirs(outdir, exist_ok=True)
os.environ['HF_DATASETS_CACHE'] = outdir


# Seed the Python, NumPy and PyTorch (CPU and current CUDA device) generators
# and ask cuDNN for deterministic kernels.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

"""# 2. Initialize the DDP Environment"""

def setup(rank, world_size):
    """Join the NCCL process group as process `rank` out of `world_size`.

    The rendezvous endpoint is published through the MASTER_ADDR and
    MASTER_PORT environment variables before the group is initialised.
    """
    # Use the master node's IP address instead of 'localhost' when running on
    # multiple machines, and pick a port that is free on the master node.
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12345'})
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def cleanup():
    """Destroy the default process group if one has been initialised.

    Calling this without an initialised group (for example on an error path
    that runs before `setup` completed) is a harmless no-op.
    """
    if dist.is_initialized():
        dist.destroy_process_group()

"""# 3. Define a Model."""

class MLP(nn.Module):
    """Three-layer perceptron whose layers are split across two devices.

    The first linear layer is placed on `dev0`; the hidden and output layers
    are placed on `dev1`.
    """

    def __init__(self, input_dim, output_dim, dev0, dev1):
        super().__init__()
        self.dev0, self.dev1 = dev0, dev1

        # input_dim -> 250 on dev0, then 250 -> 100 -> output_dim on dev1
        self.input_fc = nn.Linear(input_dim, 250).to(dev0)
        self.hidden_fc = nn.Linear(250, 100).to(dev1)
        self.output_fc = nn.Linear(100, output_dim).to(dev1)

    def forward(self, x):
        """Return `(y_pred, h_2)`: the output-layer result and the last hidden activation."""
        # Flatten every sample to a vector: [batch size, height * width],
        # then place it on the device of the first layer.
        flat = x.view(x.shape[0], -1).to(self.dev0)

        # [batch size, 250], computed on dev0 and handed over to dev1
        first_hidden = F.relu(self.input_fc(flat)).to(self.dev1)

        # [batch size, 100]
        second_hidden = F.relu(self.hidden_fc(first_hidden))

        # [batch size, output dim]
        logits = self.output_fc(second_hidden)

        return logits, second_hidden



def count_parameters(model):
    """Return the total number of elements held by the trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

def create_model(dev0, dev1):
    """Build the MLP (28*28 inputs, 10 outputs) on `dev0`/`dev1` and print its trainable size."""
    n_inputs, n_outputs = 28 * 28, 10

    net = MLP(n_inputs, n_outputs, dev0, dev1)
    n_trainable = count_parameters(net)
    print(f'The model has {n_trainable:,} trainable parameters')

    return net

"""# 4. Create the MNIST Dataset and DataLoaders"""

def create_dataloader(rank, world_size, batch_size=BATCH_SIZE, root = ROOT, max_length = 256):
    """Build the MNIST train / validation / test DataLoaders for one process.

    Args:
        rank: index of the calling process; rank 0 downloads the dataset.
        world_size: number of processes the train/validation data is sharded over.
        batch_size: batch size of every returned loader.
        root: base directory; the data is stored under ``{root}/data``.
        max_length: unused; kept so existing callers keep working.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. The train and
        validation loaders use a DistributedSampler; the test loader iterates
        the full test set on every process.

    Must be called after the process group is initialised, because it
    synchronises the processes with ``dist.barrier()``.
    """
    # presumably the pixel mean / std of the MNIST training set -- TODO confirm
    mean = 0.13066047430038452
    std = 0.30810779333114624

    train_transforms = transforms.Compose([
        transforms.RandomRotation(5, fill=(0,)),
        transforms.RandomCrop(28, padding=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ])

    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ])

    ## load the data
    outdir = f"{root}/data"
    if rank == 0:
        # Only rank 0 may download. download=True does nothing when the files
        # are already present, so no directory-existence check is needed (an
        # existing but incomplete directory is repaired instead of skipped).
        datasets.MNIST(root=outdir, train=True, download=True)
        datasets.MNIST(root=outdir, train=False, download=True)

    dist.barrier()  # Ensure all processes wait for the dataset to be downloaded

    # The files are on disk now; never download concurrently from several ranks.
    train_data = datasets.MNIST(root=outdir,
                                train=True,
                                download=False,
                                transform=train_transforms)

    test_data = datasets.MNIST(root=outdir,
                               train=False,
                               download=False,
                               transform=test_transforms)

    ## create the validation split
    VALID_RATIO = 0.9

    n_train_examples = int(len(train_data) * VALID_RATIO)
    n_valid_examples = len(train_data) - n_train_examples
    # A dedicated, seeded generator guarantees that every rank computes the
    # same split regardless of how much the global RNG was used beforehand;
    # DistributedSampler needs identical datasets on all ranks.
    split_generator = torch.Generator().manual_seed(SEED)
    train_data, valid_data = data.random_split(train_data,
                                               [n_train_examples, n_valid_examples],
                                               generator=split_generator)

    # Both subsets share one underlying dataset object. Copy the validation
    # subset so it can use the deterministic test transforms without changing
    # the augmentation applied to the training subset.
    valid_data = copy.deepcopy(valid_data)
    valid_data.dataset.transform = test_transforms

    if rank == 0:
        print(f'Number of training examples: {len(train_data)}')
        print(f'Number of validation examples: {len(valid_data)}')
        print(f'Number of testing examples: {len(test_data)}')

    ## Creating Data Loaders
    train_sampler = DistributedSampler(train_data, num_replicas=world_size, rank=rank, shuffle=True)
    # Validation order does not need to be shuffled.
    val_sampler = DistributedSampler(valid_data, num_replicas=world_size, rank=rank, shuffle=False)

    # use num_workers > 0 for better performance
    train_dataloader = data.DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, pin_memory=True)
    val_dataloader = data.DataLoader(valid_data, batch_size=batch_size, sampler=val_sampler, pin_memory=True)
    # no sampler for the test dataset: every process evaluates all of it
    test_dataloader = data.DataLoader(test_data, batch_size=batch_size, shuffle=False, pin_memory=True)
    return train_dataloader, val_dataloader, test_dataloader

"""# 5. Implement the Training Loop

## a. Helper functions
"""

# JSON file that log_results() reads and rewrites; its name encodes the model
# tag, the epoch count and the scenario tag.
RESULTS_FILE = f"{ROOT}/{MODEL_NAME}_{EPOCHS}epochs_{SENARIO}.json"

def log_results(scenario, results, results_file=None):
    """Store `results` under the key `scenario` in a JSON results file.

    Entries already present in the file are kept; an existing entry with the
    same `scenario` key is overwritten.

    Args:
        scenario: key under which the results are stored.
        results: JSON-serialisable object to store.
        results_file: path of the JSON file; defaults to ``RESULTS_FILE``.

    Raises:
        json.JSONDecodeError: if the existing file does not contain valid JSON.
    """
    path = RESULTS_FILE if results_file is None else results_file

    # Just try to open the file: checking for existence first leaves a window
    # in which the file can appear or disappear.
    try:
        with open(path, 'r') as f:
            all_results = json.load(f)
    except FileNotFoundError:
        all_results = {}

    all_results[scenario] = results

    # Write to a temporary sibling file and rename it over the target, so an
    # interrupted write can never leave a truncated results file behind.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(all_results, f, indent=4)
        os.replace(tmp_path, path)
    finally:
        # Only still present when writing or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def calculate_accuracy(y_pred, y):
    """Return, as a float tensor, the fraction of rows of `y_pred` whose arg-max along dim 1 equals `y`."""
    predicted = y_pred.argmax(dim=1, keepdim=True)
    # Compare against the labels reshaped to the [batch, 1] layout of the predictions.
    hits = predicted == y.view_as(predicted)
    return hits.sum().float() / y.shape[0]

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    span = end_time - start_time
    whole_minutes = int(span / 60)
    # Seconds left over once the whole minutes are removed, truncated to an int.
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

"""## b. train function"""
def train(model, iterator, optimizer, criterion, dev0, dev1):
    """Run one training pass over `iterator`.

    Inputs are moved to `dev0` and labels to `dev1` before each step.
    Returns `(mean batch loss, mean batch accuracy)` over the pass.
    """
    model.train()

    n_batches = len(iterator)
    # The product is zero only when one of the two device indices is 0;
    # progress lines are printed only in that case.
    report_progress = dev0*dev1 == 0
    loss_total = 0
    acc_total = 0

    progress = tqdm(iterator, desc=f"Training on the dev0 {dev0} & dev1 {dev1}...", leave=False)
    for step, (inputs, targets) in enumerate(progress):
        inputs = inputs.to(dev0)
        targets = targets.to(dev1)

        optimizer.zero_grad()

        logits, _ = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_acc = calculate_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

        if step % 50 == 0 and report_progress:
            print(f"- On Training: {step} was passed over  {n_batches}")

    return loss_total / n_batches, acc_total / n_batches
 

"""## c. Validation function"""
def evaluate(model, iterator, criterion, dev0, dev1, mode = "Evaluating"):
    """Score `model` on `iterator` without computing gradients.

    Inputs are moved to `dev0` and labels to `dev1`. `mode` only labels the
    progress output. Returns `(mean batch loss, mean batch accuracy)`.
    """
    model.eval()

    n_batches = len(iterator)
    # The product is zero only when one of the two device indices is 0;
    # progress lines are printed only in that case.
    report_progress = dev0*dev1 == 0
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        progress = tqdm(iterator, desc=f"{mode} on the dev0 {dev0} & dev1 {dev1}...", leave=False)
        for step, (inputs, targets) in enumerate(progress):
            inputs = inputs.to(dev0)
            targets = targets.to(dev1)

            logits, _ = model(inputs)

            loss_total += criterion(logits, targets).item()
            acc_total += calculate_accuracy(logits, targets).item()

            if step % 50 == 0 and report_progress:
                print(f"- On {mode}: {step} was passed over  {n_batches}")

    return loss_total / n_batches, acc_total / n_batches


"""## d. Main loop"""

# Directory that receives the model checkpoints (default `root` of main_train).
outdir = f'{ROOT}/model/'
# exist_ok=True removes the check-then-create race: this module-level code may
# run in several worker processes at once when they are started with mp.spawn.
os.makedirs(outdir, exist_ok=True)

def main_train(rank, world_size, root = outdir, num_epochs = EPOCHS, model_name = MODEL_NAME):
    """Train, validate and test the model-parallel MLP under DDP in process `rank`.

    Each process drives two GPUs: ``2 * rank`` holds the first layer and
    ``2 * rank + 1`` holds the remaining layers.

    Args:
        rank: index of this process inside the process group.
        world_size: total number of processes in the group.
        root: directory prefix (with trailing separator) for the checkpoint file.
        num_epochs: number of training epochs.
        model_name: model tag used in the key of the logged results.
    """
    ## a. Set up the distributed process groups
    setup(rank, world_size)
    print(f"Process {rank} initialized.")

    # setup mp_model and devices for this process
    dev0 = rank * 2
    dev1 = rank * 2 + 1
    # Make dev0 the current CUDA device so that collectives without tensors
    # (dist.barrier) run on a GPU owned by this process.
    torch.cuda.set_device(dev0)

    checkpoint_path = f'{root}mlp-model.pt'

    ## b. Create Model, DataLoader
    train_dataloader, val_dataloader, test_dataloader = create_dataloader(rank, world_size)
    model = create_model(dev0, dev1)

    ## c. Wrap the model with DistributedDataParallel
    # device_ids must stay unset for a module that spans several devices.
    ddp_model = DDP(model)

    ## d. Loss and Optimizer
    #LR = 5e-4
    criterion = nn.CrossEntropyLoss()  # not moved to a device; labels and predictions both live on dev1
    optimizer = optim.Adam(ddp_model.parameters())

    ## e. Training Loop
    best_valid_loss = float('inf')
    training_times = []
    train_losses = []
    train_accurcy = []
    validation_times = []
    validation_losses = []
    validation_accurcy = []

    epoch_times = []

    for epoch in trange(num_epochs, desc="Epochs"):
        # Without set_epoch the DistributedSampler yields the same shuffled
        # order in every epoch.
        train_sampler = train_dataloader.sampler
        if isinstance(train_sampler, DistributedSampler):
            train_sampler.set_epoch(epoch)

        start_epoch_time = time.monotonic()
        start_time = time.monotonic()

        train_loss, train_acc = train(ddp_model, train_dataloader, optimizer, criterion, dev0, dev1)
        train_time = time.monotonic() - start_time
        training_times.append(train_time)
        train_losses.append(train_loss)
        train_accurcy.append(train_acc)

        start_time = time.monotonic()
        valid_loss, valid_acc = evaluate(ddp_model, val_dataloader, criterion, dev0, dev1)
        val_time = time.monotonic() - start_time
        validation_times.append(val_time)
        validation_losses.append(valid_loss)
        validation_accurcy.append(valid_acc)

        # DDP keeps the parameters identical on all ranks, so one writer is
        # enough; several ranks writing the same file concurrently could
        # corrupt it. "Best" is judged on rank 0's validation shard.
        if rank == 0 and valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(ddp_model.state_dict(), checkpoint_path)

        end_time = time.monotonic()
        e_time = end_time - start_epoch_time
        epoch_times.append(e_time)
        epoch_mins, epoch_secs = epoch_time(start_epoch_time, end_time)

        print(f'--------------|     On process {rank}      |----------------')
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    ## f. test after train
    # Wait until rank 0 has finished writing the checkpoint before reading it.
    dist.barrier()
    # Load onto the CPU first; load_state_dict then copies every tensor onto
    # the device of the matching parameter of this process.
    ddp_model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
    start_time = time.monotonic()
    test_loss, test_acc = evaluate(ddp_model, test_dataloader, criterion, dev0, dev1, mode = "Testing")
    test_time = time.monotonic() - start_time
    print(f'Test results on process {rank}: Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Log results
    results = {
        "world_size": world_size,
        "rank": rank,
        "training_times": training_times,
        "train_losses": train_losses,
        "train_accurcy": train_accurcy,
        "validation_times": validation_times,
        "validation_losses": validation_losses,
        "validation_accurcy": validation_accurcy,
        "test_time": test_time,
        "test_loss": test_loss,
        "test_acc": test_acc,
        "epoch_times": epoch_times
     }

    # The key carries the real rank so that processes do not overwrite each
    # other's entry.
    scenario = f"model_{model_name}_epochs_{num_epochs}_{world_size}_GPUs_dev0_{dev0}_dev1_{dev1}_rank_{rank}"
    # log_results reads, updates and rewrites one shared file: let the ranks
    # take turns so that no update is lost.
    for writer_rank in range(world_size):
        if writer_rank == rank:
            log_results(scenario, results)
        dist.barrier()

    cleanup()
    print(f'Process {rank} finished training.')

"""# 6. Main Execution"""
if __name__ == "__main__":

    def main():
        """Start one training process per pair of visible GPUs."""
        n_gpus = torch.cuda.device_count()
        print(f'Total number of devices detected: {n_gpus}')

        if n_gpus < 1:
            print('no GPUs found. Please make sure you have configured CUDA correctly')
            return

        # Every process needs two GPUs (dev0 and dev1); a leftover odd GPU is unused.
        world_size = n_gpus // 2
        if world_size < 1:
            # Previously this case fell through to main_train with world_size=0
            # and a second device that does not exist.
            print('Model parallelism needs at least 2 GPUs (one pair per process), but only 1 was found.')
            return

        if world_size > 1:
            # one spawned process per GPU pair
            mp.spawn(
                main_train,
                args=(world_size,),
                nprocs=world_size,
                join=True
            )
        else:
            # a single GPU pair: run the training in this process
            main_train(rank=0, world_size=world_size)

    main()

Overwriting main.py


In [6]:
!python main.py

Total number of devices detected: 2
Process 0 initialized.
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz
100%|███████████████████████████| 9912422/9912422 [00:00<00:00, 43517733.63it/s]
Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz
100%|████████████████████████████████| 28881/28881 [00:00<00:00, 1146749.09it/s]
Ext

# DDP

In [7]:
%%writefile main.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import numpy as np


import copy
import random
import time
import os
import json

from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp

# Global experiment configuration.
SEED = 1234  # seed applied to every random number generator seeded below
ROOT = "."  # base directory for the dataset, the checkpoints and the results file
MODEL_NAME = "MLP"  # model tag embedded in the results file name and scenario keys
SENARIO = "2GPU_DDP"  # scenario tag embedded in the results file name
EPOCHS = 10  # default number of training epochs
BATCH_SIZE = 512  # default batch size used by create_dataloader

outdir = "./my_datasets"  # NOTE(review): MNIST is loaded through torchvision below, so this Hugging Face cache looks unused -- confirm
os.makedirs(outdir, exist_ok=True)
os.environ['HF_DATASETS_CACHE'] = outdir


# Seed the Python, NumPy and PyTorch (CPU and current CUDA device) generators
# and ask cuDNN for deterministic kernels.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

"""# 2. Initialize the DDP Environment"""

def setup(rank, world_size):
    """Initialise the NCCL process group for process `rank` of `world_size`.

    MASTER_ADDR and MASTER_PORT are exported first; they tell every process
    where to rendezvous.
    """
    rendezvous = {
        'MASTER_ADDR': 'localhost',  # Change this to the master node's IP address if using multiple machines
        'MASTER_PORT': '12345',  # Pick a free port on the master node
    }
    for key, value in rendezvous.items():
        os.environ[key] = value
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group; do nothing when none is initialised.

    The guard makes the function safe to call on paths where `setup` never
    ran or failed part-way.
    """
    if dist.is_initialized():
        dist.destroy_process_group()

"""# 3. Define a Model."""

class MLP(nn.Module):
    """Three-layer perceptron: input_dim -> 250 -> 100 -> output_dim with ReLU activations."""

    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.input_fc = nn.Linear(input_dim, 250)
        self.hidden_fc = nn.Linear(250, 100)
        self.output_fc = nn.Linear(100, output_dim)

    def forward(self, x):
        """Return `(y_pred, h_2)`: the output-layer result and the last hidden activation."""
        # Flatten every sample to a vector: [batch size, height * width]
        flat = x.view(x.shape[0], -1)

        # [batch size, 250]
        first_hidden = F.relu(self.input_fc(flat))

        # [batch size, 100]
        second_hidden = F.relu(self.hidden_fc(first_hidden))

        # [batch size, output dim]
        logits = self.output_fc(second_hidden)

        return logits, second_hidden



def count_parameters(model):
    """Count the elements of all parameters of `model` that require gradients."""
    trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
    return sum(trainable_sizes)

def create_model():
    """Build the MLP (28*28 inputs, 10 outputs) and print its number of trainable parameters."""
    n_inputs, n_outputs = 28 * 28, 10

    net = MLP(n_inputs, n_outputs)
    n_trainable = count_parameters(net)
    print(f'The model has {n_trainable:,} trainable parameters')

    return net

"""# 4. Create the MNIST Dataset and DataLoaders"""

def create_dataloader(rank, world_size, batch_size=BATCH_SIZE, root = ROOT, max_length = 256):
    """Build the MNIST train / validation / test DataLoaders for one process.

    Args:
        rank: index of the calling process; rank 0 downloads the dataset.
        world_size: number of processes the train/validation data is sharded over.
        batch_size: batch size of every returned loader.
        root: base directory; the data is stored under ``{root}/data``.
        max_length: unused; kept so existing callers keep working.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. The train and
        validation loaders use a DistributedSampler; the test loader iterates
        the full test set on every process.

    Must be called after the process group is initialised, because it
    synchronises the processes with ``dist.barrier()``.
    """
    # presumably the pixel mean / std of the MNIST training set -- TODO confirm
    mean = 0.13066047430038452
    std = 0.30810779333114624

    train_transforms = transforms.Compose([
        transforms.RandomRotation(5, fill=(0,)),
        transforms.RandomCrop(28, padding=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ])

    test_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ])

    ## load the data
    outdir = f"{root}/data"
    if rank == 0:
        # Only rank 0 may download. download=True does nothing when the files
        # are already present, so no directory-existence check is needed (an
        # existing but incomplete directory is repaired instead of skipped).
        datasets.MNIST(root=outdir, train=True, download=True)
        datasets.MNIST(root=outdir, train=False, download=True)

    dist.barrier()  # Ensure all processes wait for the dataset to be downloaded

    # The files are on disk now; never download concurrently from several ranks.
    train_data = datasets.MNIST(root=outdir,
                                train=True,
                                download=False,
                                transform=train_transforms)

    test_data = datasets.MNIST(root=outdir,
                               train=False,
                               download=False,
                               transform=test_transforms)

    ## create the validation split
    VALID_RATIO = 0.9

    n_train_examples = int(len(train_data) * VALID_RATIO)
    n_valid_examples = len(train_data) - n_train_examples
    # A dedicated, seeded generator guarantees that every rank computes the
    # same split regardless of how much the global RNG was used beforehand;
    # DistributedSampler needs identical datasets on all ranks.
    split_generator = torch.Generator().manual_seed(SEED)
    train_data, valid_data = data.random_split(train_data,
                                               [n_train_examples, n_valid_examples],
                                               generator=split_generator)

    # Both subsets share one underlying dataset object. Copy the validation
    # subset so it can use the deterministic test transforms without changing
    # the augmentation applied to the training subset.
    valid_data = copy.deepcopy(valid_data)
    valid_data.dataset.transform = test_transforms

    if rank == 0:
        print(f'Number of training examples: {len(train_data)}')
        print(f'Number of validation examples: {len(valid_data)}')
        print(f'Number of testing examples: {len(test_data)}')

    ## Creating Data Loaders
    train_sampler = DistributedSampler(train_data, num_replicas=world_size, rank=rank, shuffle=True)
    # Validation order does not need to be shuffled.
    val_sampler = DistributedSampler(valid_data, num_replicas=world_size, rank=rank, shuffle=False)

    # use num_workers > 0 for better performance
    train_dataloader = data.DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, pin_memory=True)
    val_dataloader = data.DataLoader(valid_data, batch_size=batch_size, sampler=val_sampler, pin_memory=True)
    # no sampler for the test dataset: every process evaluates all of it
    test_dataloader = data.DataLoader(test_data, batch_size=batch_size, shuffle=False, pin_memory=True)
    return train_dataloader, val_dataloader, test_dataloader

"""# 5. Implement the Training Loop

## a. Helper functions
"""

# JSON file that log_results() reads and rewrites; its name encodes the model
# tag, the epoch count and the scenario tag.
RESULTS_FILE = f"{ROOT}/{MODEL_NAME}_{EPOCHS}epochs_{SENARIO}.json"

def log_results(scenario, results, results_file=None):
    """Store `results` under the key `scenario` in a JSON results file.

    Entries already present in the file are kept; an existing entry with the
    same `scenario` key is overwritten.

    Args:
        scenario: key under which the results are stored.
        results: JSON-serialisable object to store.
        results_file: path of the JSON file; defaults to ``RESULTS_FILE``.

    Raises:
        json.JSONDecodeError: if the existing file does not contain valid JSON.
    """
    path = RESULTS_FILE if results_file is None else results_file

    # Just try to open the file: checking for existence first leaves a window
    # in which the file can appear or disappear.
    try:
        with open(path, 'r') as f:
            all_results = json.load(f)
    except FileNotFoundError:
        all_results = {}

    all_results[scenario] = results

    # Write to a temporary sibling file and rename it over the target, so an
    # interrupted write can never leave a truncated results file behind.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(all_results, f, indent=4)
        os.replace(tmp_path, path)
    finally:
        # Only still present when writing or renaming failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def calculate_accuracy(y_pred, y):
    """Return the share of samples whose highest-scoring column in `y_pred` matches `y`, as a float tensor."""
    batch_size = y.shape[0]
    best_class = y_pred.argmax(dim=1, keepdim=True)
    # Labels are reshaped to the [batch, 1] layout of the predictions before comparing.
    n_correct = best_class.eq(y.view_as(best_class)).sum().float()
    return n_correct / batch_size

def epoch_time(start_time, end_time):
    """Return `(minutes, seconds)` for the time elapsed between two timestamps given in seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    # Remaining seconds after the whole minutes, truncated to an int.
    return minutes, int(elapsed - minutes * 60)

"""## b. train function"""
def train(model, iterator, optimizer, criterion, rank):
    """Run one training pass over `iterator` with all tensors moved to device `rank`.

    Returns `(mean batch loss, mean batch accuracy)` over the pass.
    """
    model.train()

    n_batches = len(iterator)
    loss_total = 0
    acc_total = 0

    progress = tqdm(iterator, desc=f"Training on the rank {rank}...", leave=False)
    for step, (inputs, targets) in enumerate(progress):
        inputs = inputs.to(rank)
        targets = targets.to(rank)

        optimizer.zero_grad()

        logits, _ = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_acc = calculate_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

        # Only rank 0 prints progress lines, once every 50 batches.
        if rank == 0 and step % 50 == 0:
            print(f"- On Training: {step} was passed over  {n_batches}")

    return loss_total / n_batches, acc_total / n_batches
 

"""## c. Validation function"""
def evaluate(model, iterator, criterion, rank, mode = "Evaluating"):
    """Score `model` on `iterator` without gradients, with all tensors on device `rank`.

    `mode` only labels the progress output. Returns
    `(mean batch loss, mean batch accuracy)`.
    """
    model.eval()

    n_batches = len(iterator)
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        progress = tqdm(iterator, desc=f"{mode} on the rank {rank} ...", leave=False)
        for step, (inputs, targets) in enumerate(progress):
            inputs = inputs.to(rank)
            targets = targets.to(rank)

            logits, _ = model(inputs)

            loss_total += criterion(logits, targets).item()
            acc_total += calculate_accuracy(logits, targets).item()

            # Only rank 0 prints progress lines, once every 50 batches.
            if rank == 0 and step % 50 == 0:
                print(f"- On {mode}: {step} was passed over  {n_batches}")

    return loss_total / n_batches, acc_total / n_batches


"""## d. Main loop"""

# Directory that receives the model checkpoints (default `root` of main_train).
outdir = f'{ROOT}/model/'
# exist_ok=True removes the check-then-create race: this module-level code may
# run in several worker processes at once when they are started with mp.spawn.
os.makedirs(outdir, exist_ok=True)

def main_train(rank, world_size, root = outdir, num_epochs = EPOCHS, model_name = MODEL_NAME):
    """Train, validate and test the MLP under DDP in process `rank`, using GPU `rank`.

    Args:
        rank: index of this process inside the process group; also its GPU index.
        world_size: total number of processes in the group.
        root: directory prefix (with trailing separator) for the checkpoint file.
        num_epochs: number of training epochs.
        model_name: model tag used in the key of the logged results.
    """
    ## a. Set up the distributed process groups
    setup(rank, world_size)
    print(f"Process {rank} initialized.")

    # Make GPU `rank` the current CUDA device so that collectives without
    # tensors (dist.barrier) run on the GPU owned by this process.
    torch.cuda.set_device(rank)

    checkpoint_path = f'{root}mlp-model.pt'

    ## b. Create Model, DataLoader
    train_dataloader, val_dataloader, test_dataloader = create_dataloader(rank, world_size)
    model = create_model().to(rank)

    ## c. Wrap the model with DistributedDataParallel
    ddp_model = DDP(model, device_ids=[rank])

    ## d. Loss and Optimizer
    #LR = 5e-4
    criterion = nn.CrossEntropyLoss().to(rank) # Move loss to GPU
    optimizer = optim.Adam(ddp_model.parameters())

    ## e. Training Loop
    best_valid_loss = float('inf')
    training_times = []
    train_losses = []
    train_accurcy = []
    validation_times = []
    validation_losses = []
    validation_accurcy = []

    epoch_times = []

    for epoch in trange(num_epochs, desc="Epochs"):
        # Without set_epoch the DistributedSampler yields the same shuffled
        # order in every epoch.
        train_sampler = train_dataloader.sampler
        if isinstance(train_sampler, DistributedSampler):
            train_sampler.set_epoch(epoch)

        start_epoch_time = time.monotonic()
        start_time = time.monotonic()

        train_loss, train_acc = train(ddp_model, train_dataloader, optimizer, criterion, rank)
        train_time = time.monotonic() - start_time
        training_times.append(train_time)
        train_losses.append(train_loss)
        train_accurcy.append(train_acc)

        start_time = time.monotonic()
        valid_loss, valid_acc = evaluate(ddp_model, val_dataloader, criterion, rank)
        val_time = time.monotonic() - start_time
        validation_times.append(val_time)
        validation_losses.append(valid_loss)
        validation_accurcy.append(valid_acc)

        # DDP keeps the parameters identical on all ranks, so one writer is
        # enough; several ranks writing the same file concurrently could
        # corrupt it. "Best" is judged on rank 0's validation shard.
        if rank == 0 and valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(ddp_model.state_dict(), checkpoint_path)

        end_time = time.monotonic()
        e_time = end_time - start_epoch_time
        epoch_times.append(e_time)
        epoch_mins, epoch_secs = epoch_time(start_epoch_time, end_time)

        print(f'--------------|     On process {rank}      |----------------')
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    ## f. test after train
    # Wait until rank 0 has finished writing the checkpoint before reading it.
    dist.barrier()
    # Load onto the CPU first so that no rank allocates the tensors on the GPU
    # they were saved from; load_state_dict then copies them onto this
    # process's own parameters.
    ddp_model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
    start_time = time.monotonic()
    test_loss, test_acc = evaluate(ddp_model, test_dataloader, criterion, rank, mode = "Testing")
    test_time = time.monotonic() - start_time
    print(f'Test results on process {rank}: Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    # Log results
    results = {
        "world_size": world_size,
        "rank": rank,
        "training_times": training_times,
        "train_losses": train_losses,
        "train_accurcy": train_accurcy,
        "validation_times": validation_times,
        "validation_losses": validation_losses,
        "validation_accurcy": validation_accurcy,
        "test_time": test_time,
        "test_loss": test_loss,
        "test_acc": test_acc,
        "epoch_times": epoch_times
     }

    scenario = f"model_{model_name}_epochs_{num_epochs}_{world_size}_GPUs_rank_{rank}"
    # log_results reads, updates and rewrites one shared file: let the ranks
    # take turns so that no update is lost.
    for writer_rank in range(world_size):
        if writer_rank == rank:
            log_results(scenario, results)
        dist.barrier()

    cleanup()
    print(f'Process {rank} finished training.')

"""# 6. Main Execution"""
if __name__ == "__main__":

    def main():
        """Start one training process per visible GPU, or train in-process when there is only one."""
        world_size = torch.cuda.device_count()
        print(f'Total number of devices detected: {world_size}')

        if world_size < 1:
            print('no GPUs found. Please make sure you have configured CUDA correctly')
            return

        if world_size == 1:
            # run training on single GPU, directly in this process
            main_train(rank=0, world_size=1)
            return

        # start the training process on all available GPUs
        mp.spawn(main_train, args=(world_size,), nprocs=world_size, join=True)

    main()

Overwriting main.py


In [8]:
!python main.py

Total number of devices detected: 2
[W109 00:26:44.246411498 socket.cpp:697] [c10d] The client socket has failed to connect to [localhost]:12345 (errno: 99 - Cannot assign requested address).
Process 1 initialized.
Process 0 initialized.
Number of training examples: 54000
Number of validation examples: 6000
Number of testing examples: 10000
The model has 222,360 trainable parameters
The model has 222,360 trainable parameters
Epochs:   0%|          | 0/10 [00:00<?, ?it/s]
Epochs:   0%|          | 0/10 [00:00<?, ?it/s]
Training on the rank 0...:   0%|          | 0/53 [00:00<?, ?it/s]
Training on the rank 1...:   0%|          | 0/53 [00:00<?, ?it/s]
- On Training: 0 was passed over  53
- On Training: 50 was passed over  53
Evaluating on the rank 1 ...:   0%|          | 0/6 [00:00<?, ?it/s]
Evaluating on the rank 0 ...:   0%|          | 0/6 [00:00<?, ?it/s]
- On Evaluating: 0 was passed over  6
--------------|     On process 1      |----------------
Epoch: 01 | Epoch Time: 0m 12s
	Train Lo