In [1]:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda import amp

In [8]:
nodes=1
gpus=2
nr=0
master_addr='127.0.0.1'
master_port='10000'

batch_size=256
epochs=20

world_size = gpus * nodes


In [9]:
os.environ['MASTER_ADDR']=master_addr
os.environ['MASTER_PORT']=master_port

In [10]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [11]:
def train(gpu, nr,gpus,world_size,epochs,batch_size):
    rank = nr * gpus + gpu
    dist.init_process_group(backend='nccl', init_method='env://', 
                            world_size=world_size, rank=rank,
                            )
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    print("Starting the training loop")
    for epoch in range(epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, i + 1, total_step,

        loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))


In [12]:
pool = mp.Pool(processes=world_size)

Starting the training loopStarting the training loop

Epoch [1/20], Step [100/118], Loss: 2.1546
Epoch [2/20], Step [100/118], Loss: 1.9839
Epoch [3/20], Step [100/118], Loss: 1.8348
Epoch [4/20], Step [100/118], Loss: 1.7014
Epoch [5/20], Step [100/118], Loss: 1.5823
Epoch [6/20], Step [100/118], Loss: 1.4763
Epoch [7/20], Step [100/118], Loss: 1.3819
Epoch [8/20], Step [100/118], Loss: 1.2979
Epoch [9/20], Step [100/118], Loss: 1.2230
Epoch [10/20], Step [100/118], Loss: 1.1559
Epoch [11/20], Step [100/118], Loss: 1.0958
Epoch [12/20], Step [100/118], Loss: 1.0415
Epoch [13/20], Step [100/118], Loss: 0.9925
Epoch [14/20], Step [100/118], Loss: 0.9480
Epoch [15/20], Step [100/118], Loss: 0.9075
Epoch [16/20], Step [100/118], Loss: 0.8705
Epoch [17/20], Step [100/118], Loss: 0.8367
Epoch [18/20], Step [100/118], Loss: 0.8056
Epoch [19/20], Step [100/118], Loss: 0.7771
Epoch [20/20], Step [100/118], Loss: 0.7507
Training complete in: 0:00:50.690316


In [13]:
result = pool.starmap(train,[(gpu,nr,gpus,world_size,epochs,batch_size) for gpu in range(world_size)])