In [None]:
import os, subprocess
import numpy as np
import torch
from torch import nn

In [None]:
def cpu_():
  """Return the CPU device."""
  return torch.device('cpu')

def gpu_(i=0):
  """Return the device handle ``cuda:i`` (``cuda:0`` by default).

  Only the handle is built; nothing here checks that the GPU exists.
  """
  return torch.device(f'cuda:{i}')

def num_gpu():
  """Return the number of CUDA devices PyTorch reports."""
  return torch.cuda.device_count()

def try_gpu(i=0):
  """Return ``gpu_(i)`` if at least ``i + 1`` GPUs are present, else the CPU device."""
  if num_gpu() >= i + 1:
    return gpu_(i)
  return cpu_()

# Default device for the notebook. Always a torch.device (never a bare string)
# so later cells can pass it to any API that expects a device object.
device_ = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_, gpu_(), torch.cuda.device_count(), try_gpu(0)

(device(type='cuda'),
 device(type='cuda', index=0),
 1,
 device(type='cuda', index=0))

In [None]:
# Draw a 3x3 standard-normal matrix directly on the selected device and
# multiply it by itself; show both shapes and values.
a = torch.randn(3, 3, device=device_)
b = a.mm(a)
a.shape, a, b.shape, b

(torch.Size([3, 3]),
 tensor([[ 1.1504, -0.5589,  1.1304],
         [ 1.9525, -0.5406,  0.0241],
         [ 0.5134,  0.0660, -1.1161]], device='cuda:0'),
 torch.Size([3, 3]),
 tensor([[ 0.8124, -0.2663,  0.0253],
         [ 1.2030, -0.7975,  2.1673],
         [ 0.1464, -0.3962,  1.8276]], device='cuda:0'))

**Asynchronous Computation**

In [None]:
import time
class Timer:
  """Record the durations of successive timed intervals.

  The timer starts as soon as it is constructed. Every `stop()` appends the
  seconds elapsed since the most recent `start()` to `self.times`.
  """
  def __init__(self):
    self.times = []
    self.start()
  def start(self):
    """Mark the beginning of a new interval."""
    # perf_counter is monotonic and high-resolution; wall-clock time.time()
    # can jump (e.g. NTP adjustments) and distort short measurements.
    self.tstart = time.perf_counter()
  def stop(self):
    """Close the current interval, store its duration and return it in seconds."""
    self.times.append(time.perf_counter() - self.tstart)
    return self.times[-1]

In [None]:
class Benchmark:
  """Context manager that reports how long its ``with`` body took.

  On exit it prints ``<desc>: <seconds> sec`` with four decimal places.
  """

  def __init__(self, desc='Complete'):
    # Label printed in front of the measured time.
    self.desc = desc

  def __enter__(self):
    # A new Timer begins counting the moment it is created.
    self.timer = Timer()
    return self

  def __exit__(self, *args):
    # Exception details are ignored; the duration is printed either way.
    print(f'{self.desc}: {self.timer.stop():.4f} sec')

In [None]:
# Five 1000x1000 random matrix products with NumPy ...
with Benchmark('numpy'):
  for _ in range(5):
    a = np.random.normal(size=(1000, 1000))
    b = a.dot(a)

# ... and the same workload with PyTorch (default device).
with Benchmark('torch'):
  for _ in range(5):
    a = torch.randn(1000, 1000)
    b = a.mm(a)

numpy: 0.2732 sec
torch: 0.1974 sec


In [None]:
# This is orders of magnitude faster with PyTorch on a GPU.
# By default GPU ops are asynchronous: the backend manages its own threads
# that continually select and execute queued work, so the device has to be
# synchronized before the timer stops or only the enqueue time is measured.
with Benchmark('torch - synchronize'):
  for _ in range(5):
    # Create the operands on the selected device; without `device=` they are
    # allocated on the CPU and the GPU is never exercised.
    a = torch.randn(size=(10000, 10000), device=device_)
    b = torch.mm(a, a)
  # Synchronizing is only meaningful (and only valid) when CUDA is in use.
  if torch.cuda.is_available():
    torch.cuda.synchronize(device_)

torch - synchronize: 24.8038 sec


In [None]:
# Element-wise arithmetic on two all-ones tensors created on the same device.
x = torch.ones(1, 2, device=device_)
y = torch.ones(1, 2, device=device_)
z = x * y + 2
z

tensor([[3., 3.]], device='cuda:0')

**Parallel computation on GPUs**

In [None]:
def run(x):
  """Multiply the matrix `x` by itself five times.

  Returns:
    A list with the five (identical) products ``x.mm(x)``.
  """
  return [x.mm(x) for _ in range(5)]

In [None]:
# With more than one GPU the same matrix-matrix workload can run on each
# device, using a separate input tensor per device.
gpu0 = try_gpu(i=0)
x_gpu1 = torch.rand(size=(1000, 1000), device=gpu0)
run(x_gpu1)  # untimed first run so one-time start-up work is not measured
torch.cuda.synchronize(gpu0)
with Benchmark('GPU1 time'):
  run(x_gpu1)
  torch.cuda.synchronize(gpu0)

# Benchmark the second GPU only when it exists, instead of keeping the code
# disabled inside a string literal. Same matrix size as GPU1 so the two
# timings are comparable.
if num_gpu() > 1:
  gpu1 = try_gpu(i=1)
  x_gpu2 = torch.rand(size=(1000, 1000), device=gpu1)
  run(x_gpu2)
  torch.cuda.synchronize(gpu1)
  with Benchmark('GPU2 time'):
    run(x_gpu2)
    torch.cuda.synchronize(gpu1)

GPU1 time: 0.0039577484130859375 sec


In [None]:
# Show which device index 0 resolves to (first GPU, or the CPU fallback).
try_gpu(i=0)

device(type='cuda', index=0)

**Parallel Communication and Computation**

In [None]:
def move_to_cpu(x, non_blocking=False):
  """Move every tensor in `x` to the CPU and return them as a list.

  `non_blocking` is forwarded unchanged to each ``Tensor.to`` call.
  """
  moved = []
  for tensor in x:
    moved.append(tensor.to('cpu', non_blocking=non_blocking))
  return moved

# 1) Computation only.
with Benchmark('run on GPU1 \t\t\t'):
  y = run(x_gpu1)
  torch.cuda.synchronize()

# 2) Blocking copy of the results to the CPU.
with Benchmark('copy/move to CPU \t\t'):
  y_to_cpu = move_to_cpu(y, non_blocking=False)
  torch.cuda.synchronize()

# 3) Computation followed by a non-blocking copy of the results.
with Benchmark('Run on GPU and copy to CPU\t'):
  y = run(x_gpu1)
  y_to_cpu = move_to_cpu(y, non_blocking=True)
  torch.cuda.synchronize()

run on GPU1 			: 0.004096269607543945 sec
copy/move to CPU 		: 0.014218568801879883 sec
Run on GPU and copy to CPU	: 0.005823850631713867 sec


**Hardware**

In [None]:
'''
processor/RAM/Ethernet/Expansion bus (GPUs)/Storage with PCI bus
Vectorization/Cache L1, L2 & L3/GPUs and Accelerators
common latency numbers maintained on github/vendor
'''

Hybridize - Sequential class

In [None]:
def mlp_network():
  """Build a 512 -> 256 -> 128 -> 2 perceptron with ReLU after the first two linear layers."""
  layers = [
      nn.Linear(512, 256), nn.ReLU(),
      nn.Linear(256, 128), nn.ReLU(),
      nn.Linear(128, 2),
  ]
  return nn.Sequential(*layers)
# One random 512-feature sample pushed through a freshly built MLP.
x = torch.randn(1, 512)
mlp_ = mlp_network()
mlp_(x)

tensor([[-0.1584,  0.1160]], grad_fn=<AddmmBackward0>)

In [None]:
# Compile the eager MLP with TorchScript and run it on the same input.
mlp__ = torch.jit.script(mlp_)
mlp__(x)

tensor([[-0.1584,  0.1160]], grad_fn=<AddmmBackward0>)

In [None]:
# Time 1000 forward passes of the eager model ...
mlp_net = mlp_network()
with Benchmark('not using torchscript \t'):
  for _ in range(1000):
    mlp_net(x)

# ... and 1000 forward passes of the same model after scripting it.
mlp_net_jit = torch.jit.script(mlp_net)
with Benchmark('using torchscript \t'):
  for _ in range(1000):
    mlp_net_jit(x)

not using torchscript 	: 0.0739 sec
using torchscript 	: 0.0595 sec


**Serialization**

In [None]:
# Serialize the scripted model to the file 'my_mlp', then list that file
# with its size.
mlp_net_jit.save('my_mlp')
!ls -lh my_mlp

-rw-r--r-- 1 root root 652K Jun 11 16:00 my_mlp


**Multiple-GPUs**

In [None]:
''' dataset/dataloader '''
import torchvision
from torchvision import datasets

from torchvision.transforms import transforms
# Download location (Colab working directory) and mini-batch size, named once
# so both splits stay in sync.
DATA_ROOT = '/content/'
BATCH_SIZE = 32

# Only conversion to tensors; no augmentation or normalization.
transformations = transforms.Compose([transforms.ToTensor()])
train_set = datasets.FashionMNIST(
    root=DATA_ROOT, train=True, transform=transformations, download=True)
test_set = datasets.FashionMNIST(
    root=DATA_ROOT, train=False, transform=transformations, download=True)
train_dataloader = torch.utils.data.DataLoader(
    train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(
    test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
# Number of mini-batches per split.
len(train_dataloader), len(test_dataloader)

100%|██████████| 26.4M/26.4M [00:01<00:00, 19.2MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 307kB/s]
100%|██████████| 4.42M/4.42M [00:00<00:00, 5.62MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 14.1MB/s]


(1875, 313)

In [None]:
class Residual(nn.Module):
  """Residual block: two 3x3 conv + batch-norm stages plus a skip connection.

  Args:
    num_channels: number of output channels of every convolution.
    use_1x1conv: when True, the skip path goes through a 1x1 convolution so
      its channel count (and stride) match the main path.
    strides: stride of the first convolution and of the optional 1x1 convolution.

  All layers are lazy, so the number of input channels is inferred on the
  first forward pass.
  """
  def __init__(self, num_channels, use_1x1conv=False, strides=1):
    super().__init__()
    self.conv1 = nn.LazyConv2d(num_channels, kernel_size=3, padding=1, stride=strides)
    self.conv2 = nn.LazyConv2d(num_channels, kernel_size=3, padding=1)
    if use_1x1conv:
      self.conv3 = nn.LazyConv2d(num_channels, kernel_size=1, stride=strides)
    else:
      self.conv3 = None
    self.batch_norm1 = nn.LazyBatchNorm2d()
    self.batch_norm2 = nn.LazyBatchNorm2d()

  def forward(self, X):
    """Return ``relu(main_path(X) + skip(X))``."""
    # nn.ReLU(...) would construct a module (its argument is the `inplace`
    # flag); the functional form actually applies the activation.
    Y = nn.functional.relu(self.batch_norm1(self.conv1(X)))
    Y = self.batch_norm2(self.conv2(Y))
    if self.conv3 is not None:
      X = self.conv3(X)
    Y += X
    return nn.functional.relu(Y)

In [None]:
''' Demo Network for Parallelization in Training '''
def my_resnet(num_classes, in_channels =1):
  """Build a small ResNet-style classifier.

  Layout: a 3x3 conv stem (64 channels) with batch norm and ReLU, four stages
  of two residual blocks each (64, 128, 256, 512 channels), global average
  pooling and a linear layer producing `num_classes` outputs.

  Args:
    num_classes: size of the final linear layer's output.
    in_channels: number of channels of the input images.
  """
  def resnet_block(in_channels, out_channels, num_residuals, first_blk=False):
    # `in_channels` only documents the expected input width; it is not passed
    # on to Residual.
    blk = []
    for i in range(num_residuals):
      if i ==0 and not first_blk:
        # First residual of a later stage changes the channel count, so the
        # skip path needs the 1x1 convolution.
        blk.append(Residual(out_channels, use_1x1conv=True))
      else:
        blk.append(Residual(out_channels))
    return nn.Sequential(*blk)

  # kernel_size=3 with padding=1 keeps the spatial size; a 1x1 kernel with
  # padding=1 would only add a bias-valued border around every feature map.
  network_ = nn.Sequential(
      nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1),
      nn.BatchNorm2d(64), nn.ReLU())
  network_.add_module('res-blk-1', resnet_block(64, 64, 2, first_blk=True))
  network_.add_module('res-blk-2', resnet_block(64, 128, 2))
  # 256 output channels, matching the input width declared for the next stage.
  network_.add_module('res-blk-3', resnet_block(128, 256, 2))
  network_.add_module('res-blk-4', resnet_block(256, 512, 2))
  network_.add_module('global-avg-pool', nn.AdaptiveAvgPool2d((1, 1)))
  network_.add_module(
      'fc', nn.Sequential(nn.Flatten(), nn.Linear(512, num_classes)))
  return network_

In [None]:
# FashionMNIST: 10 classes, single-channel images. Keyword arguments avoid
# swapping the two values (the signature is my_resnet(num_classes, in_channels)).
network = my_resnet(num_classes=10, in_channels=1)

In [None]:
''' Training

Training for parallelism
- network params initialized across devices
- iterating over the mini batches they are to be divided across the devices
- compute loss and gradients across devices
- gradients aggregated and parameters updated

Compute the accuracy in parallel to report the final performance of the network
'''

In [None]:
def train(network, num_gpus, batch_size, lr):
  """Train `network` on FashionMNIST with data parallelism and report test accuracy.

  Args:
    network: an ``nn.Module`` instance to train (not a factory function).
    num_gpus: number of GPUs requested. GPUs that are not present are skipped;
      with no usable GPU the training runs on the CPU.
    batch_size: mini-batch size used for both the training and the test loader.
    lr: learning rate for SGD.

  Prints one line per epoch with the test accuracy and the epoch duration.
  """
  # Build loaders that honour `batch_size` instead of unpacking the global loader.
  train_iter = torch.utils.data.DataLoader(
      train_set, batch_size=batch_size, shuffle=True, num_workers=2)
  test_iter = torch.utils.data.DataLoader(
      test_set, batch_size=batch_size, shuffle=False, num_workers=2)

  # Keep only real CUDA devices; try_gpu falls back to the CPU for missing ones.
  devices = [d for d in (try_gpu(i) for i in range(num_gpus)) if d.type == 'cuda']
  if not devices:
    devices = [cpu_()]
  main_device = devices[0]
  network = network.to(main_device)

  # Lazy layers have no parameters until a first forward pass, so run one
  # sample through the network before initializing, replicating or optimizing.
  # eval() keeps this dry run from touching the batch-norm running statistics.
  network.eval()
  with torch.no_grad():
    network(train_set[0][0].unsqueeze(0).to(main_device))

  def init_weights(module):
    if type(module) in [nn.Linear, nn.Conv2d]:
      # std must be > 0: all-zero weights make every unit identical.
      nn.init.normal_(module.weight, std=0.01)
  network.apply(init_weights)

  # DataParallel only applies to CUDA devices; on CPU train the plain module.
  if main_device.type == 'cuda':
    network = nn.DataParallel(network, device_ids=devices)
  trainer = torch.optim.SGD(network.parameters(), lr)
  loss = nn.CrossEntropyLoss()
  timer, num_epochs = Timer(), 10
  for epoch in range(num_epochs):
    network.train()
    timer.start()
    for X, y in train_iter:
      trainer.zero_grad()
      X, y = X.to(main_device), y.to(main_device)
      l = loss(network(X), y)
      l.backward()
      trainer.step()
    epoch_time = timer.stop()

    # Test accuracy after each epoch.
    network.eval()
    correct, total = 0, 0
    with torch.no_grad():
      for X, y in test_iter:
        X, y = X.to(main_device), y.to(main_device)
        correct += (network(X).argmax(dim=1) == y).sum().item()
        total += y.numel()
    print(f'epoch {epoch + 1}: test acc {correct / total:.3f}, '
          f'{epoch_time:.1f} sec on {devices}')

In [None]:
# Device that index 0 resolves to, and how many GPUs are available.
try_gpu(i=0), num_gpu()

(device(type='cuda', index=0), 1)

In [None]:
# `train` needs a network instance, not the factory function; build a fresh
# 10-class model for each run so the two runs do not share weights.
train(my_resnet(num_classes=10), num_gpus=1, batch_size=256, lr=0.1)
train(my_resnet(num_classes=10), num_gpus=2, batch_size=256, lr=0.2)

Parameter Servers

In [None]:
# Hand-built parameters: each weight is standard-normal noise multiplied by
# `scale`, each bias starts at zero.
scale = 0.01
W1, b1 = torch.randn(20, 1, 3, 3) * scale, torch.zeros(20)
W2, b2 = torch.randn(50, 20, 5, 5) * scale, torch.zeros(50)
# NOTE(review): W3 has 100 input rows; confirm this matches the flattened
# feature size of whatever network consumes these parameters.
W3, b3 = torch.randn(100, 128) * scale, torch.zeros(128)
W4, b4 = torch.randn(128, 10) * scale, torch.zeros(10)
params = [W1, b1, W2, b2, W3, b3, W4, b4]

Data Synchronization

In [None]:
def get_parameters(params, device):
  """Return gradient-tracking copies of `params` placed on `device`.

  Args:
    params: iterable of tensors.
    device: target device for the copies.

  Returns:
    A list of new leaf tensors with ``requires_grad=True``. The input tensors
    are left untouched.
  """
  # copy=True forces a fresh tensor even when `p` already lives on `device`;
  # a plain .to(device) would return `p` itself there, so requires_grad_()
  # would mutate the caller's tensors and every "copy" would share storage.
  new_params = [p.detach().to(device, copy=True) for p in params]
  for p in new_params:
    p.requires_grad_()
  return new_params

# Place the parameters on device 0 (or the CPU fallback) and show the first
# bias together with its gradient slot.
new_params = get_parameters(params, try_gpu(i=0))
(new_params[1], new_params[1].grad)

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        device='cuda:0', requires_grad=True),
 None)

In [None]:
def allreduce(data):
  """Sum all tensors in `data` and write the total back into every one, in place.

  The sum is accumulated in ``data[0]`` (on its device) and then copied into
  each remaining tensor on that tensor's own device.
  """
  if len(data) < 2:
    # Nothing to accumulate or broadcast.
    return
  root = data[0]
  # Reduce: gather every other tensor onto the root's device and add it.
  for idx in range(1, len(data)):
    root[:] += data[idx].to(root.device)
  # Broadcast: overwrite every other tensor with the accumulated total.
  for idx in range(1, len(data)):
    target = data[idx]
    target[:] = root.to(target.device)


In [None]:
''' use case with more than one gpu '''
data = []
for _ in range(2):
  data.append(torch.ones((1, 2), device=try_gpu(i=0)))
# Values before and after the in-place all-reduce.
print(data[0], data[1])
allreduce(data)
print(data[0], data[1])

tensor([[1., 1.]], device='cuda:0') tensor([[1., 1.]], device='cuda:0')
tensor([[2., 2.]], device='cuda:0') tensor([[2., 2.]], device='cuda:0')


Distribute data

In [None]:
# Scatter a 4x5 integer matrix across the listed devices (one GPU here).
data = torch.arange(20).view(4, 5)
# devices = [torch.device('cuda:0'), torch.device('cuda:1')]
devices = [torch.device('cuda', 0)]
split = nn.parallel.scatter(data, devices)
data, devices, split

(tensor([[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9],
         [10, 11, 12, 13, 14],
         [15, 16, 17, 18, 19]]),
 [device(type='cuda', index=0)],
 (tensor([[ 0,  1,  2,  3,  4],
          [ 5,  6,  7,  8,  9],
          [10, 11, 12, 13, 14],
          [15, 16, 17, 18, 19]], device='cuda:0'),))

In [None]:
def split_batch(X, y, devices):
  """Scatter `X` and `y` across `devices`.

  Returns:
    A pair ``(X_shards, y_shards)``, each the result of
    ``nn.parallel.scatter`` for the corresponding input.
  """
  X_shards = nn.parallel.scatter(X, devices)
  y_shards = nn.parallel.scatter(y, devices)
  return (X_shards, y_shards)

Training

In [None]:
''' training function '''
# Per-sample losses (no reduction); train_batch sums them per shard itself.
loss = nn.CrossEntropyLoss(reduction='none')
def train_batch(X, y, device_params, devices, lr):
  """Run one data-parallel training step on a single mini-batch.

  Args:
    X: input mini-batch, scattered across `devices`.
    y: labels for `X`, scattered the same way.
    device_params: one parameter list per device, in the same order as `devices`.
    devices: devices the mini-batch is split over.
    lr: learning rate forwarded to `SGD`.

  NOTE(review): `network` is called here as ``network(inputs, params)``; the
  `network` assigned earlier in this notebook is an nn.Sequential, whose call
  takes a single input - confirm which callable is intended.
  NOTE(review): `SGD` is not defined in the visible part of this notebook -
  confirm where it comes from and that it accepts (params, lr, batch_size).
  """
  X_shards, y_shards = split_batch(X, y, devices)
  # loss calculated separately on each gpu
  loss_ = [loss(network(X_shard, device_W), y_shard).sum()
        for X_shard, y_shard, device_W
            in zip(X_shards, y_shards, device_params)]
  # backpropagation performed separately on each gpu
  for l in loss_:
    l.backward()
  # sum the gradients of each parameter across all gpus and broadcast the
  # total back to every gpu (allreduce does both steps in place)
  with torch.no_grad():
    for i in range(len(device_params[0])):
      allreduce([device_params[c][i].grad for c in range(len(device_params))])
  # model parameters updated separately on each gpu; X.shape[0] (the size of
  # the full mini-batch) is passed as the third argument
  for param in device_params:
    SGD(param, lr, X.shape[0])

In [None]:
''' Train loop
in the train loop, for each X, y perform gpu training on minibatch, and add synchronization

train_batch(X, y, device_params, devices, lr)
torch.cuda.synchronize()
'''