In [1]:
import multiprocess

In [2]:
%matplotlib inline

In [3]:
import os
import sys
import tempfile
import torch

import torch.nn.functional as F

import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

import torch.backends.cudnn as cudnn
from torch.cuda import amp

In [4]:
# import and instantiate tensorboard for monitoring model performance
from torch.utils.tensorboard import SummaryWriter

In [5]:
import time, gc

# Timing utilities
start_time = None

def start_timer():
    global start_time
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))

In [6]:
nodes=1
gpus=2
num_workers=10

batch_size=256
epochs=2

torch.manual_seed(43)
cudnn.deterministic = True
cudnn.benchmark = False

In [7]:
def dataloader(gpu,world_size,batch_size,num_workers):
# Prepare training data
    train_transform = transforms.Compose(
    [transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    ])


    val_transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    ])
    #datadir=os.environ['DATA_DIR']
    datadir='/ibex/ai/home/shaima0d/tiny-imagenet-200'
    
    trainset = torchvision.datasets.ImageFolder(root=os.path.join(datadir,'train'),
                                                transform=train_transform)
    trainSampler = torch.utils.data.distributed.DistributedSampler(trainset,
                                                               num_replicas=world_size,
                                                               rank=gpu,
                                                               shuffle=False)
    trainloader = torch.utils.data.DataLoader(trainset, 
                                          batch_size=batch_size,
                                          shuffle=False, 
                                          num_workers=num_workers,
                                          pin_memory=False,
                                          sampler=trainSampler)
                                         

    valset = torchvision.datasets.ImageFolder(root=os.path.join(datadir,'val'),
                                              transform=val_transform)
    valSampler = torch.utils.data.distributed.DistributedSampler(valset,
                                                                  num_replicas=world_size,
                                                                  rank=gpu,shuffle=False)
    valloader = torch.utils.data.DataLoader(valset, 
                                             batch_size=batch_size,
                                             shuffle=False, 
                                             num_workers=num_workers,
                                             pin_memory=False,
                                             sampler=valSampler)
    return trainloader,valloader

Lets check the shape of the training dataloader

THe above shows that we have a total of 50,000 pictures of 10 classes in training dataset. 

Setting the batch_size=4 means we that our input will be 4 pictures i.e. 4*(3x32x32) pixels fed to our model at a time.
This implies that our training loop will do 50000/4 = 12500 trips across the PCIe bus. 

Let us show some of the training images

### 2. Define a Convolutional Neural Network
Copy the neural network from the Neural Networks section before and modify it to
take 3-channel images (instead of 1-channel images as it was defined).



In [8]:
net=torchvision.models.resnet50()

### 3. Define a Loss function and optimizer
Let's use a Classification Cross-Entropy loss and SGD with momentum.



### 4. Train the network

This is when things start to get interesting.
We simply have to loop over our data iterator, and feed the inputs to the
network and optimize.



In [9]:
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

In [10]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

In [12]:
def train (net,gpus,world_size,rank,epochs,batch_size):
    gpu_id=rank
    print(gpu_id)
    
    net.cuda(gpu_id)
    net = nn.SyncBatchNorm.convert_sync_batchnorm(net)
    
    criterion = nn.CrossEntropyLoss().cuda(gpu_id)
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    
    local_rank = gpu_id #int(os.environ['LOCAL_RANK'])
    trainloader, valloader = dataloader(gpu_id,world_size,
                                        batch_size,
                                        num_workers)
    # Wrap model as DDP
    net = torch.nn.parallel.DistributedDataParallel(net,device_ids=[local_rank],
                                                   output_device=None, )
    start_timer()
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        trainloader.sampler.set_epoch(epoch)
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data[0].cuda(gpu_id), data[1].cuda(gpu_id)
        
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if (i % 2) and (gpu_id == 0):    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

    end_timer_and_print('Finished Training')

In [13]:
def main(net,gpus,epochs,batch_size):
    world_size = 2
    setup(rank, world_size)
    train(net,gpus,world_size,rank,epochs,batch_size)
    return True

with mp.Pool(gpus) as pool:
    print(pool.starmap(train, [(i,nr,gpus,world_size,epochs,batch_size) for i in range(gpus)]))

#if __name__ == '__main__':
num_processes = 2
net = Net()
# NOTE: this is required for the ``fork`` method to work
#net.share_memory()
mp.set_start_method('forkserver')
processes = []
for rank in range(num_processes):
    p = mp.Process(target=train, args=(net,nr,gpus,world_size,epochs,batch_size))
    p.start()
    processes.append(p)
for p in processes:
    p.join()

In [14]:
import multiprocess as mp
num_processes = gpus
# NOTE: this is required for the ``fork`` method to work
net.share_memory()
processes = []
for rank in range(num_processes):
    p = mp.Process(target=main, args=(net,gpus,epochs,batch_size))
    p.start()
    processes.append(p)
for p in processes:
    p.join()

1
0




[1,     2] loss: 0.007
[1,     4] loss: 0.007
[1,     6] loss: 0.008
[1,     8] loss: 0.008
[1,    10] loss: 0.008
[1,    12] loss: 0.008
[1,    14] loss: 0.009
[1,    16] loss: 0.009
[1,    18] loss: 0.009
[1,    20] loss: 0.009
[1,    22] loss: 0.009
[1,    24] loss: 0.010
[1,    26] loss: 0.009
[1,    28] loss: 0.010
[1,    30] loss: 0.009
[1,    32] loss: 0.009
[1,    34] loss: 0.009
[1,    36] loss: 0.009
[1,    38] loss: 0.009
[1,    40] loss: 0.009
[1,    42] loss: 0.009
[1,    44] loss: 0.009
[1,    46] loss: 0.009
[1,    48] loss: 0.008
[1,    50] loss: 0.008
[1,    52] loss: 0.008
[1,    54] loss: 0.008
[1,    56] loss: 0.008
[1,    58] loss: 0.008
[1,    60] loss: 0.008
[1,    62] loss: 0.008
[1,    64] loss: 0.008
[1,    66] loss: 0.008
[1,    68] loss: 0.008
[1,    70] loss: 0.008
[1,    72] loss: 0.008
[1,    74] loss: 0.008
[1,    76] loss: 0.008
[1,    78] loss: 0.008
[1,    80] loss: 0.007
[1,    82] loss: 0.008
[1,    84] loss: 0.008
[1,    86] loss: 0.008
[1,    88] 