In [4]:
import os, time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline
from torchvision import datasets, transforms
import torch.optim as optim
import datetime

device = "cuda"

torch.manual_seed(0)

<torch._C.Generator at 0x7f8e7dbe9910>

In [5]:
batch_size = 32
train_transform = transforms.Compose([
    transforms.RandomCrop(96, padding=12),      
    transforms.RandomHorizontalFlip(p=0.5),   
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2), 
    transforms.RandomRotation(10),            
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
dataset1 = datasets.STL10('/storage/dmls/stl10_data', split='train',
                         transform=train_transform)
dataset2 = datasets.STL10('/storage/dmls/stl10_data', split='test',
                         transform=test_transform)
train_loader = torch.utils.data.DataLoader(dataset1,batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True, persistent_workers=True)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True, persistent_workers=True)

In [6]:
def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

In [7]:
def train_model_fast(model, train_loader, test_loader, device, 
                       epochs=10, max_lr=1e-3):
    
    optimizer = optim.AdamW(model.parameters(), lr=max_lr, weight_decay=5e-2)
    criterion = nn.CrossEntropyLoss()
    
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=max_lr,
        epochs=epochs,
        steps_per_epoch=len(train_loader)
    )
    
    start = time.time()
    cuda_mem = 0
    
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        

        for images, labels in train_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad(set_to_none=True)
            
            loss.backward()
            optimizer.step()
            
            scheduler.step()
            cuda_mem = torch.cuda.max_memory_allocated(device=device)

            running_loss += loss.item() * labels.size(0)

        avg_loss = running_loss / len(train_loader.dataset)
        
        acc = evaluate(model, test_loader, device)
        
        current_lr = optimizer.param_groups[0]['lr']
        
        print(
            f"Rank={device} "
            f"Epoch {epoch}/{epochs}: "
            f"loss={avg_loss:.4f}, "
            f"test_acc={acc*100:.2f}%, "
            f"LR={current_lr:.6f}, "
        )
        
    end = time.time()

    print(f"Rank={device} Total Time={end-start}")
    print(f"Rank={device} Mem Usage={cuda_mem / (1024**2)} MB")
    
    return model

In [8]:
class DepthwiseSeparableBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        
        
        self.depthwise = nn.Conv2d(
            in_channels, 
            in_channels, 
            kernel_size=3, 
            padding=1, 
            stride=stride, 
            groups=in_channels, 
            bias=False
        )
        self.bn_dw = nn.BatchNorm2d(in_channels)
        
        
        self.pointwise = nn.Conv2d(
            in_channels, 
            out_channels, 
            kernel_size=1, 
            stride=1, 
            bias=False
        )
        self.bn_pw = nn.BatchNorm2d(out_channels)
        
        
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        
        out = F.relu(self.bn_dw(self.depthwise(x)))
        out = self.bn_pw(self.pointwise(out))
        out += self.shortcut(x) 
        out = F.relu(out)
        return out

class CNN(nn.Module):
    def __init__(self, in_channel=3, num_class=10):
        super().__init__()
        
        
        
        self.conv1 = nn.Conv2d(in_channel, 64, kernel_size=3, padding=1, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(2) 

        
        self.block1 = DepthwiseSeparableBlock(64, 128, stride=2)  
        self.block2 = DepthwiseSeparableBlock(128, 256, stride=2) 
        self.block3 = DepthwiseSeparableBlock(256, 256, stride=2) 
        
        
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 
        
        
        self.dropout = nn.Dropout(p=0.2) 
        self.fc = nn.Linear(256, num_class) 

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        
        return x

In [9]:
model = CNN(in_channel=3, num_class=10).to(device)

In [10]:
print("=== Training ===")
device = "cuda"

stats = train_model_fast(model, train_loader, test_loader, device, epochs=15)

=== Training ===
Rank=cuda Epoch 1/15: loss=2.1743, test_acc=26.38%, LR=0.000153, 
Rank=cuda Epoch 2/15: loss=1.8903, test_acc=35.16%, LR=0.000438, 
Rank=cuda Epoch 3/15: loss=1.6732, test_acc=41.05%, LR=0.000761, 
Rank=cuda Epoch 4/15: loss=1.5576, test_acc=42.56%, LR=0.000972, 
Rank=cuda Epoch 5/15: loss=1.4481, test_acc=46.24%, LR=0.000994, 
Rank=cuda Epoch 6/15: loss=1.3617, test_acc=50.89%, LR=0.000950, 
Rank=cuda Epoch 7/15: loss=1.2771, test_acc=55.83%, LR=0.000866, 
Rank=cuda Epoch 8/15: loss=1.2190, test_acc=54.45%, LR=0.000749, 
Rank=cuda Epoch 9/15: loss=1.1686, test_acc=58.01%, LR=0.000610, 
Rank=cuda Epoch 10/15: loss=1.1122, test_acc=60.34%, LR=0.000462, 
Rank=cuda Epoch 11/15: loss=1.0793, test_acc=62.71%, LR=0.000316, 
Rank=cuda Epoch 12/15: loss=1.0468, test_acc=63.64%, LR=0.000188, 
Rank=cuda Epoch 13/15: loss=1.0010, test_acc=64.22%, LR=0.000086, 
Rank=cuda Epoch 14/15: loss=0.9759, test_acc=64.54%, LR=0.000022, 
Rank=cuda Epoch 15/15: loss=0.9704, test_acc=64.84%, L

## Multi GPU

In [None]:
%%writefile train_ddp.py
import os, time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.cpp_extension import load_inline
from torchvision import datasets, transforms
import torch.optim as optim
import datetime

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

import argparse

EPOCHS = 15


def train_model_fast(model, train_loader, test_loader, device, 
                       epochs=10, max_lr=1e-3):
    
    optimizer = optim.AdamW(model.parameters(), lr=max_lr, weight_decay=5e-2)
    criterion = nn.CrossEntropyLoss()
    
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=max_lr,
        epochs=epochs,
        steps_per_epoch=len(train_loader)
    )
    
    start = time.time()
    cuda_mem = 0
    
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        

        for images, labels in train_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad(set_to_none=True)
            
            loss.backward()
            optimizer.step()
            
            scheduler.step()
            cuda_mem = torch.cuda.max_memory_allocated(device=device)

            running_loss += loss.item() * labels.size(0)

        avg_loss = running_loss / len(train_loader.dataset)
        
        acc = evaluate(model, test_loader, device)
        
        current_lr = optimizer.param_groups[0]['lr']
        
        print(
            f"Rank={device} "
            f"Epoch {epoch}/{epochs}: "
            f"loss={avg_loss:.4f}, "
            f"test_acc={acc*100:.2f}%, "
            f"LR={current_lr:.6f}, "
        )
        
    end = time.time()

    print(f"Rank={device} Total Time={end-start}")
    print(f"Rank={device} Mem Usage={cuda_mem / (1024**2)} MB")
    
    return model

def evaluate(model, loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

class DepthwiseSeparableBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        
        
        self.depthwise = nn.Conv2d(
            in_channels, 
            in_channels, 
            kernel_size=3, 
            padding=1, 
            stride=stride, 
            groups=in_channels, 
            bias=False
        )
        self.bn_dw = nn.BatchNorm2d(in_channels)
        
        
        self.pointwise = nn.Conv2d(
            in_channels, 
            out_channels, 
            kernel_size=1, 
            stride=1, 
            bias=False
        )
        self.bn_pw = nn.BatchNorm2d(out_channels)
        
        
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        
        out = F.relu(self.bn_dw(self.depthwise(x)))
        out = self.bn_pw(self.pointwise(out))
        out += self.shortcut(x) 
        out = F.relu(out)
        return out

class CNN(nn.Module):
    def __init__(self, in_channel=3, num_class=10):
        super().__init__()
        
        
        
        self.conv1 = nn.Conv2d(in_channel, 64, kernel_size=3, padding=1, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(2) 

        
        self.block1 = DepthwiseSeparableBlock(64, 128, stride=2)  
        self.block2 = DepthwiseSeparableBlock(128, 256, stride=2) 
        self.block3 = DepthwiseSeparableBlock(256, 256, stride=2) 
        
        
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 
        
        
        self.dropout = nn.Dropout(p=0.2) 
        self.fc = nn.Linear(256, num_class) 

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        
        return x
    
def setup(rank, world_size, master_port, backend, timeout):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = master_port
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size, timeout=timeout)
    
def load_data(rank, world_size, batch_size):
    train_transform = transforms.Compose([
        transforms.RandomCrop(96, padding=12),      
        transforms.RandomHorizontalFlip(p=0.5),   
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2), 
        transforms.RandomRotation(10),            
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    ])
    
    train_set = datasets.STL10('/storage/dmls/stl10_data', split='train', transform=train_transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set,
        num_replicas=world_size,
        rank=rank
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        sampler=train_sampler,
        batch_size=batch_size,
        shuffle=False,
        persistent_workers=True,
        num_workers=1,
        pin_memory=True
    )

    test_set = datasets.STL10('/storage/dmls/stl10_data', split='test', transform=test_transform)
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_set,
        num_replicas=world_size,
        rank=rank
    )

    test_loader = torch.utils.data.DataLoader(
        dataset=test_set,
        sampler=test_sampler,
        batch_size=batch_size,
        shuffle=False,
        persistent_workers=True,
        num_workers=1,
        pin_memory=True
    )

    return train_loader, test_loader

def train_dist(rank, world_size, master_port, backend, timeout, batch_size):
    setup(rank, world_size, master_port, backend, timeout)
    torch.cuda.set_device(rank)
    train_loader, test_loader = load_data(rank, world_size, batch_size)
    model = CNN(in_channel=3, num_class=10).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    train_model_fast(ddp_model, train_loader, test_loader, rank, epochs=EPOCHS)
    dist.destroy_process_group()
    
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--backend', type=str, default='nccl')
    args = parser.parse_args()

    world_size = torch.cuda.device_count()
    master_port = '8282'
    timeout = datetime.timedelta(seconds=10)
    
    start_time = time.time()
    mp.spawn(
        train_dist, 
        nprocs=world_size, 
        args=(world_size, master_port, args.backend, timeout, args.batch_size), 
        join=True
    )
    end_time = time.time()
    print("Total time: {}".format(end_time - start_time))

Overwriting train_ddp.py


In [4]:
!python train_ddp.py

Rank=0 Epoch 1/15: loss=1.1100, test_acc=27.85%, LR=0.000153, 
Rank=1 Epoch 1/15: loss=1.1080, test_acc=28.50%, LR=0.000153, 
Rank=0 Epoch 2/15: loss=0.9687, test_acc=36.33%, LR=0.000439, 
Rank=1 Epoch 2/15: loss=0.9663, test_acc=34.38%, LR=0.000439, 
Rank=0 Epoch 3/15: loss=0.8702, test_acc=41.45%, LR=0.000762, 
Rank=1 Epoch 3/15: loss=0.8575, test_acc=40.90%, LR=0.000762, 
Rank=0 Epoch 4/15: loss=0.8058, test_acc=43.38%, LR=0.000972, 
Rank=1 Epoch 4/15: loss=0.7981, test_acc=44.07%, LR=0.000972, 
Rank=0 Epoch 5/15: loss=0.7552, test_acc=46.12%, LR=0.000994, 
Rank=1 Epoch 5/15: loss=0.7482, test_acc=45.77%, LR=0.000994, 
Rank=0 Epoch 6/15: loss=0.7056, test_acc=51.65%, LR=0.000950, 
Rank=1 Epoch 6/15: loss=0.7100, test_acc=50.72%, LR=0.000950, 
Rank=0 Epoch 7/15: loss=0.6757, test_acc=54.07%, LR=0.000865, 
Rank=1 Epoch 7/15: loss=0.6744, test_acc=54.02%, LR=0.000865, 
Rank=0 Epoch 8/15: loss=0.6396, test_acc=55.30%, LR=0.000748, 
Rank=1 Epoch 8/15: loss=0.6267, test_acc=53.85%, LR=0.0

# Q3

In [5]:
!python train_ddp.py --batch_size 16

Rank=0 Epoch 1/15: loss=1.1018, test_acc=30.15%, LR=0.000153, 
Rank=1 Epoch 1/15: loss=1.0995, test_acc=30.03%, LR=0.000153, 
Rank=0 Epoch 2/15: loss=0.9478, test_acc=38.88%, LR=0.000438, 
Rank=1 Epoch 2/15: loss=0.9442, test_acc=38.65%, LR=0.000438, 
Rank=0 Epoch 3/15: loss=0.8511, test_acc=41.75%, LR=0.000761, 
Rank=1 Epoch 3/15: loss=0.8475, test_acc=40.60%, LR=0.000761, 
Rank=0 Epoch 4/15: loss=0.8059, test_acc=45.10%, LR=0.000972, 
Rank=1 Epoch 4/15: loss=0.7898, test_acc=43.70%, LR=0.000972, 
Rank=0 Epoch 5/15: loss=0.7525, test_acc=46.62%, LR=0.000994, 
Rank=1 Epoch 5/15: loss=0.7455, test_acc=48.20%, LR=0.000994, 
Rank=0 Epoch 6/15: loss=0.7138, test_acc=50.48%, LR=0.000950, 
Rank=1 Epoch 6/15: loss=0.7010, test_acc=51.00%, LR=0.000950, 
Rank=1 Epoch 7/15: loss=0.6630, test_acc=53.17%, LR=0.000866, 
Rank=0 Epoch 7/15: loss=0.6728, test_acc=54.45%, LR=0.000866, 
Rank=0 Epoch 8/15: loss=0.6375, test_acc=56.83%, LR=0.000749, 
Rank=1 Epoch 8/15: loss=0.6382, test_acc=56.65%, LR=0.0

In [6]:
!python train_ddp.py --batch_size 32

Rank=0 Epoch 1/15: loss=1.1159, test_acc=27.12%, LR=0.000153, 
Rank=1 Epoch 1/15: loss=1.1214, test_acc=26.47%, LR=0.000153, 
Rank=1 Epoch 2/15: loss=0.9696, test_acc=33.23%, LR=0.000439, 
Rank=0 Epoch 2/15: loss=0.9769, test_acc=35.08%, LR=0.000439, 
Rank=1 Epoch 3/15: loss=0.8654, test_acc=39.25%, LR=0.000762, 
Rank=0 Epoch 3/15: loss=0.8743, test_acc=39.62%, LR=0.000762, 
Rank=0 Epoch 4/15: loss=0.8081, test_acc=45.48%, LR=0.000972, 
Rank=1 Epoch 4/15: loss=0.7997, test_acc=44.45%, LR=0.000972, 
Rank=1 Epoch 5/15: loss=0.7522, test_acc=45.98%, LR=0.000994, 
Rank=0 Epoch 5/15: loss=0.7530, test_acc=45.70%, LR=0.000994, 
Rank=1 Epoch 6/15: loss=0.7045, test_acc=45.40%, LR=0.000950, 
Rank=0 Epoch 6/15: loss=0.7225, test_acc=45.60%, LR=0.000950, 
Rank=1 Epoch 7/15: loss=0.6675, test_acc=53.33%, LR=0.000865, 
Rank=0 Epoch 7/15: loss=0.6762, test_acc=53.42%, LR=0.000865, 
Rank=0 Epoch 8/15: loss=0.6462, test_acc=53.73%, LR=0.000748, 
Rank=1 Epoch 8/15: loss=0.6399, test_acc=52.80%, LR=0.0

In [7]:
!python train_ddp.py --batch_size 64

Rank=1 Epoch 1/15: loss=1.1376, test_acc=23.72%, LR=0.000154, 
Rank=0 Epoch 1/15: loss=1.1384, test_acc=24.18%, LR=0.000154, 
Rank=1 Epoch 2/15: loss=1.0158, test_acc=31.45%, LR=0.000440, 
Rank=0 Epoch 2/15: loss=1.0125, test_acc=32.05%, LR=0.000440, 
Rank=0 Epoch 3/15: loss=0.9112, test_acc=38.98%, LR=0.000765, 
Rank=1 Epoch 3/15: loss=0.9132, test_acc=37.20%, LR=0.000765, 
Rank=1 Epoch 4/15: loss=0.8288, test_acc=39.32%, LR=0.000974, 
Rank=0 Epoch 4/15: loss=0.8417, test_acc=40.05%, LR=0.000974, 
Rank=0 Epoch 5/15: loss=0.7852, test_acc=45.52%, LR=0.000994, 
Rank=1 Epoch 5/15: loss=0.7785, test_acc=43.80%, LR=0.000994, 
Rank=0 Epoch 6/15: loss=0.7570, test_acc=47.58%, LR=0.000949, 
Rank=1 Epoch 6/15: loss=0.7403, test_acc=45.45%, LR=0.000949, 
Rank=0 Epoch 7/15: loss=0.7184, test_acc=48.27%, LR=0.000864, 
Rank=1 Epoch 7/15: loss=0.7081, test_acc=46.90%, LR=0.000864, 
Rank=1 Epoch 8/15: loss=0.6865, test_acc=51.85%, LR=0.000747, 
Rank=0 Epoch 8/15: loss=0.6824, test_acc=52.85%, LR=0.0

In [8]:
!python train_ddp.py --batch_size 128

Rank=1 Epoch 1/15: loss=1.1601, test_acc=15.07%, LR=0.000155, 
Rank=0 Epoch 1/15: loss=1.1614, test_acc=14.57%, LR=0.000155, 
Rank=0 Epoch 2/15: loss=1.0527, test_acc=25.90%, LR=0.000444, 
Rank=1 Epoch 2/15: loss=1.0567, test_acc=27.47%, LR=0.000444, 
Rank=0 Epoch 3/15: loss=0.9528, test_acc=35.10%, LR=0.000770, 
Rank=1 Epoch 3/15: loss=0.9507, test_acc=33.88%, LR=0.000770, 
Rank=0 Epoch 4/15: loss=0.8695, test_acc=40.58%, LR=0.000976, 
Rank=1 Epoch 4/15: loss=0.8713, test_acc=40.58%, LR=0.000976, 
Rank=0 Epoch 5/15: loss=0.8110, test_acc=42.27%, LR=0.000993, 
Rank=1 Epoch 5/15: loss=0.7963, test_acc=40.62%, LR=0.000993, 
Rank=1 Epoch 6/15: loss=0.7606, test_acc=44.60%, LR=0.000947, 
Rank=0 Epoch 6/15: loss=0.7657, test_acc=45.05%, LR=0.000947, 
Rank=1 Epoch 7/15: loss=0.7238, test_acc=48.27%, LR=0.000861, 
Rank=0 Epoch 7/15: loss=0.7337, test_acc=47.70%, LR=0.000861, 
Rank=1 Epoch 8/15: loss=0.7011, test_acc=49.18%, LR=0.000743, 
Rank=0 Epoch 8/15: loss=0.7051, test_acc=49.05%, LR=0.0

# Q4

In [9]:
!python train_ddp.py --batch_size 32 --backend nccl

Rank=0 Epoch 1/15: loss=1.1172, test_acc=24.77%, LR=0.000153, 
Rank=1 Epoch 1/15: loss=1.1139, test_acc=25.85%, LR=0.000153, 
Rank=0 Epoch 2/15: loss=0.9897, test_acc=34.15%, LR=0.000439, 
Rank=1 Epoch 2/15: loss=0.9817, test_acc=32.95%, LR=0.000439, 
Rank=0 Epoch 3/15: loss=0.8722, test_acc=38.73%, LR=0.000762, 
Rank=1 Epoch 3/15: loss=0.8648, test_acc=38.10%, LR=0.000762, 
Rank=1 Epoch 4/15: loss=0.7914, test_acc=42.98%, LR=0.000972, 
Rank=0 Epoch 4/15: loss=0.8037, test_acc=42.50%, LR=0.000972, 
Rank=0 Epoch 5/15: loss=0.7632, test_acc=48.70%, LR=0.000994, 
Rank=1 Epoch 5/15: loss=0.7518, test_acc=47.00%, LR=0.000994, 
Rank=0 Epoch 6/15: loss=0.7124, test_acc=50.65%, LR=0.000950, 
Rank=1 Epoch 6/15: loss=0.7039, test_acc=48.65%, LR=0.000950, 
Rank=0 Epoch 7/15: loss=0.6764, test_acc=53.40%, LR=0.000865, 
Rank=1 Epoch 7/15: loss=0.6819, test_acc=52.88%, LR=0.000865, 
Rank=1 Epoch 8/15: loss=0.6465, test_acc=52.80%, LR=0.000748, 
Rank=0 Epoch 8/15: loss=0.6528, test_acc=53.47%, LR=0.0

In [10]:
!python train_ddp.py --batch_size 128 --backend nccl

Rank=1 Epoch 1/15: loss=1.1561, test_acc=16.90%, LR=0.000155, 
Rank=0 Epoch 1/15: loss=1.1577, test_acc=16.38%, LR=0.000155, 
Rank=0 Epoch 2/15: loss=1.0503, test_acc=25.82%, LR=0.000444, 
Rank=1 Epoch 2/15: loss=1.0589, test_acc=26.67%, LR=0.000444, 
Rank=0 Epoch 3/15: loss=0.9510, test_acc=33.67%, LR=0.000770, 
Rank=1 Epoch 3/15: loss=0.9476, test_acc=34.10%, LR=0.000770, 
Rank=1 Epoch 4/15: loss=0.8671, test_acc=37.20%, LR=0.000976, 
Rank=0 Epoch 4/15: loss=0.8718, test_acc=37.60%, LR=0.000976, 
Rank=0 Epoch 5/15: loss=0.8089, test_acc=42.40%, LR=0.000993, 
Rank=1 Epoch 5/15: loss=0.7947, test_acc=41.68%, LR=0.000993, 
Rank=0 Epoch 6/15: loss=0.7695, test_acc=45.85%, LR=0.000947, 
Rank=1 Epoch 6/15: loss=0.7625, test_acc=44.35%, LR=0.000947, 
Rank=0 Epoch 7/15: loss=0.7409, test_acc=47.27%, LR=0.000861, 
Rank=1 Epoch 7/15: loss=0.7343, test_acc=47.08%, LR=0.000861, 
Rank=0 Epoch 8/15: loss=0.7157, test_acc=48.70%, LR=0.000743, 
Rank=1 Epoch 8/15: loss=0.6989, test_acc=48.80%, LR=0.0

In [11]:
!python train_ddp.py --batch_size 32 --backend gloo

[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
Rank=1 Epoch 1/15: loss=1.1161, test_acc=27.43%, LR=0.000153, 
Rank=0 Epoch 1/15: loss=1.1224, test_acc=28.07%, LR=0.000153, 
Rank=1 Epoch 2/15: loss=0.9773, test_acc=34.80%, LR=0.000439, 
Rank=0 Epoch 2/15: loss=0.9775, test_acc=37.00%, LR=0.000439, 
Rank=1 Epoch 3/15: loss=0.8613, test_acc=38.95%, LR=0.000762, 
Rank=0 Epoch 3/15: loss=0.8723, test_acc=38.30%, LR=0.000762, 
Rank=1 Epoch 4/15: loss=0.7984, test_acc=43.68%, LR=0.000972, 
Rank=0 Epoch 4/15: loss=0.8109, test_acc=43.53%, LR=0.000972, 
Rank=1 Epoch 5/15: loss=0.7543, test_acc=48.05%, LR=0.000994, 
Rank=0 Epoch 5/15: loss=0.7601, test_acc=47.93%, LR=0.000994, 
Rank=1 Epoch 6/15: loss=0.7026, test_acc=50.68%, LR=0.000950, 
Rank=0 Epoch 6/15: loss=0.7111, test_acc=50.10%, LR=0.000950, 
Rank=0 Epoch 7/15: loss=0.6824, test_acc=51.35%, LR=0.000865, 

In [12]:
!python train_ddp.py --batch_size 128 --backend gloo

[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
Rank=0 Epoch 1/15: loss=1.1625, test_acc=17.10%, LR=0.000155, 
Rank=1 Epoch 1/15: loss=1.1590, test_acc=16.57%, LR=0.000155, 
Rank=0 Epoch 2/15: loss=1.0588, test_acc=29.07%, LR=0.000444, 
Rank=1 Epoch 2/15: loss=1.0619, test_acc=29.35%, LR=0.000444, 
Rank=0 Epoch 3/15: loss=0.9568, test_acc=34.58%, LR=0.000770, 
Rank=1 Epoch 3/15: loss=0.9538, test_acc=33.25%, LR=0.000770, 
Rank=0 Epoch 4/15: loss=0.8741, test_acc=41.23%, LR=0.000976, 
Rank=1 Epoch 4/15: loss=0.8639, test_acc=39.98%, LR=0.000976, 
Rank=0 Epoch 5/15: loss=0.8091, test_acc=42.98%, LR=0.000993, 
Rank=1 Epoch 5/15: loss=0.7986, test_acc=40.62%, LR=0.000993, 
Rank=0 Epoch 6/15: loss=0.7668, test_acc=44.75%, LR=0.000947, 
Rank=1 Epoch 6/15: loss=0.7616, test_acc=43.10%, LR=0.000947, 
Rank=0 Epoch 7/15: loss=0.7368, test_acc=45.82%, LR=0.000861, 