In [1]:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda import amp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__

'1.12.1'

In [3]:
nodes=1
gpus=2
nr=0
master_addr='127.0.0.1'
master_port='9992'

batch_size=16
epochs=2

world_size = gpus * nodes


In [4]:
os.environ['MASTER_ADDR']=master_addr
os.environ['MASTER_PORT']=master_port

In [5]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [12]:
def train(gpu, nr,gpus,world_size,epochs,batch_size):
    rank = nr * gpus + gpu
    dist.init_process_group(backend='nccl', 
            #init_method="env://",
            world_size=world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='home/mohsin/hpc-saudi-2022/kubeflow-demo/data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    print("Starting the training loop")
    for epoch in range(epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, i + 1, total_step,

        loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))


torch.multiprocessing.set_start_method('fork')
processes = [mp.Process(target=train, args=(i,nr,gpus,world_size,epochs,batch_size)) for i in range(world_size)]

for p in processes:
    p.start()

for p in processes:
    p.join()

mp.spawn(train, args=(nr,gpus,world_size,epochs,batch_size),nprocs=world_size)

In [13]:
!pip install multiprocess



In [14]:
import multiprocess

In [15]:
with multiprocess.Pool(gpus) as pool:
    print(pool.starmap(train, [(i,nr,gpus,world_size,epochs,batch_size) for i in range(gpus)]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 263522689.78it/s]
100%|██████████| 9912422/9912422 [00:00<00:00, 118599004.57it/s]


Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/rawExtracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 176582644.06it/s]


Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 894913483.00it/s]



Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw
Using downloaded and verified file: home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-labels-idx1-ubyte.gz
Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/train-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Using downloaded and verified file: home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-images-idx3-ubyte.gz
Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-images-idx3-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to home/mohsin/hpc

100%|██████████| 4542/4542 [00:00<00:00, 32845739.26it/s]


Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 25299507.00it/s]


Extracting home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to home/mohsin/hpc-saudi-2022/kubeflow-demo/data/MNIST/raw

Starting the training loop
Starting the training loop
Epoch [1/2], Step [100/1875], Loss: 2.1635
Epoch [1/2], Step [200/1875], Loss: 1.9377
Epoch [1/2], Step [300/1875], Loss: 1.9032
Epoch [1/2], Step [400/1875], Loss: 1.8220
Epoch [1/2], Step [500/1875], Loss: 1.7550
Epoch [1/2], Step [600/1875], Loss: 1.4583
Epoch [1/2], Step [700/1875], Loss: 1.5107
Epoch [1/2], Step [800/1875], Loss: 1.4549
Epoch [1/2], Step [900/1875], Loss: 1.3747
Epoch [1/2], Step [1000/1875], Loss: 1.2107
Epoch [1/2], Step [1100/1875], Loss: 1.2325
Epoch [1/2], Step [1200/1875], Loss: 1.1732
Epoch [1/2], Step [1300/1875], Loss: 1.1506
Epoch [1/2], Step [1400/1875], Loss: 0.8159
Epoch [1/2], Step [1500/1875], Loss: 1.2118
Epoch [1/2], Step [1600/1875], Loss: 0.9696
Epoch [1/2], Step [1700/1875], Loss: 0.7535
Epoch [1/2], Step [1800/1875], Loss: 0.7122
Epoch [2/2