# Hybrid Parallelism Components (Simulated) for PyTorch's Conv2d

* 2 modified components: data parallelism & model parallelism.

*I am not sure about the detailed implementation of PyTorch's `nn.DataParallel`, so these components are higher-level simulations (but they do work).*

### Reference: [HyPar: Towards Hybrid Parallelism for Deep Learning Accelerator Array](https://arxiv.org/abs/1901.02067)

![dp/mp](fig/dp&mp.png)

In [1]:
import torch
import torch.nn as nn
from torch.multiprocessing import Queue
import copy

### Note:
 
PyTorch automatically invokes cuDNN operations asynchronously, so the per-GPU convolutions launched from the sequential Python loops below can overlap.

In [2]:
class model_parallel_conv2d(nn.Module):
    """Simulated model-parallel ``nn.Conv2d`` split along the input channels.

    The weight of ``conv`` is sliced along its input-channel dimension into
    ``len(gpus)`` pieces, one per device.  Each device convolves its channel
    slice of the input with its weight slice; the partial results are summed
    on the first device and the sum is copied back to every device.  Only the
    last slice carries the bias, so the bias is added exactly once.

    Args:
        conv: Source ``nn.Conv2d`` whose hyper-parameters and parameters are
            reused.  Only ``groups == 1`` is supported.
            NOTE(review): ``padding_mode`` is not propagated to the slices.
        gpus: One device per channel slice.  Integers are device ordinals (as
            accepted by ``Tensor.cuda``); ``torch.device`` objects and device
            strings such as ``'cpu'`` work as well.

    Raises:
        ValueError: If ``conv.groups != 1``.
    """

    def __init__(self, conv, gpus):
        super(model_parallel_conv2d, self).__init__()
        if conv.groups != 1:
            # A grouped weight has in_channels // groups entries on dim 1, so
            # the plain channel slicing below would cut it in the wrong places.
            raise ValueError(
                "model_parallel_conv2d only supports groups == 1, got %d"
                % conv.groups)
        self.in_channels = conv.in_channels
        self.out_channels = conv.out_channels
        self.kernel_size = conv.kernel_size
        self.stride = conv.stride
        self.padding = conv.padding
        self.dilation = conv.dilation

        num_split = len(gpus)
        self.gpus = gpus
        self.num_split = num_split

        # Equal-sized slices; the last one absorbs the remainder.
        bulk = self.in_channels // num_split
        self.in_channels_split = [bulk] * (num_split - 1)
        self.in_channels_split.append(self.in_channels - (num_split - 1) * bulk)

        self.convs = nn.ModuleList()
        offset = 0
        for i, (width, device) in enumerate(zip(self.in_channels_split, gpus)):
            # The bias lives on the last slice only, and only if the source
            # convolution actually has one.
            use_bias = conv.bias is not None and i == num_split - 1
            conv_split = nn.Conv2d(width,
                                   self.out_channels,
                                   self.kernel_size,
                                   self.stride,
                                   self.padding,
                                   self.dilation,
                                   bias=use_bias)
            # detach().clone() gives every slice its own storage, even when
            # ``conv`` already lives on the target device.
            weight_slice = conv.weight[:, offset:offset + width, :, :]
            conv_split.weight = nn.Parameter(
                weight_slice.detach().clone().to(device))
            if use_bias:
                conv_split.bias = nn.Parameter(
                    conv.bias.detach().clone().to(device))
            self.convs.append(conv_split)
            offset += width

    def forward(self, x):
        """Apply the channel-split convolution.

        Args:
            x: Either a plain tensor, or a ``(tensors, tag)`` tuple as
                returned by the parallel layers in this file.  With tag
                ``'dp'`` the tensors are batch shards, which are re-assembled
                per channel slice; with any other tag they are treated as
                full replicas of the same activation.

        Returns:
            ``(outputs, 'mp')`` where ``outputs`` holds one copy of the full
            result per device in ``self.gpus``.
        """
        if isinstance(x, tuple):
            inputs, parallel_type = x
        else:
            inputs, parallel_type = None, ''

        feature_list = []
        offset = 0
        for i, (width, device) in enumerate(
                zip(self.in_channels_split, self.gpus)):
            channels = slice(offset, offset + width)
            if inputs is None:
                # Slice first so only the needed channels are transferred.
                in_slice = x[:, channels, :, :].to(device)
            elif parallel_type == 'dp':
                # Gather this channel slice from every batch shard, however
                # many shards the upstream layer produced.
                in_slice = torch.cat(
                    [shard[:, channels, :, :].to(device) for shard in inputs],
                    0)
            else:
                # Replicated input: any replica will do; prefer the i-th one,
                # which normally already sits on this device.
                replica = inputs[i % len(inputs)]
                in_slice = replica[:, channels, :, :].to(device)
            feature_list.append(self.convs[i](in_slice))
            offset += width

        # Reduce the partial sums on the first device, then replicate.
        primary = self.gpus[0]
        tmp_out = sum(f.to(primary) for f in feature_list)
        return [tmp_out.to(device) for device in self.gpus], 'mp'
        

In [3]:
class data_parallel_conv2d(nn.Module):
    """Simulated data-parallel ``nn.Conv2d``: one full replica per device.

    ``conv`` is deep-copied once per entry of ``gpus``; every replica processes
    its own slice of the batch.

    Args:
        conv: Source ``nn.Conv2d`` to replicate.
        gpus: One device per replica.  Integers are device ordinals (as
            accepted by ``Module.cuda``); ``torch.device`` objects and device
            strings such as ``'cpu'`` work as well.
    """

    def __init__(self, conv, gpus):
        super(data_parallel_conv2d, self).__init__()
        self.in_channels = conv.in_channels
        self.out_channels = conv.out_channels
        self.kernel_size = conv.kernel_size
        self.stride = conv.stride
        self.padding = conv.padding
        self.dilation = conv.dilation

        self.gpus = gpus
        self.convs = nn.ModuleList([
            copy.deepcopy(conv).to(device) for device in gpus
        ])

    def forward(self, x):
        """Run every replica on its batch shard.

        Args:
            x: Either a plain tensor, which is chunked along the batch
                dimension into at most ``len(self.gpus)`` shards, or a
                ``(tensors, tag)`` tuple as returned by the parallel layers in
                this file.  With tag ``'mp'`` the tensors are full replicas of
                the same activation and replica ``i`` contributes batch shard
                ``i``; with any other tag the tensors are used as the shards
                directly.

        Returns:
            ``(outputs, 'dp')`` where ``outputs[i]`` is the result for batch
            shard ``i``, computed by ``self.convs[i]``.
        """
        num_devices = len(self.gpus)
        if not isinstance(x, tuple):
            # Same chunking rule as nn.parallel.scatter, without requiring
            # CUDA device ids.
            shards = x.chunk(num_devices, 0)
        else:
            inputs, parallel_type = x
            if parallel_type == 'mp':
                bs = inputs[0].shape[0]
                bulk = bs // num_devices
                shards = []
                for i in range(num_devices):
                    # The last shard absorbs the remainder of the batch.
                    stop = bs if i == num_devices - 1 else (i + 1) * bulk
                    # All replicas are identical; prefer the i-th one, which
                    # normally already sits on this device.
                    replica = inputs[i % len(inputs)]
                    shards.append(replica[i * bulk:stop])
            else:
                shards = inputs

        # .to() is a no-op when the shard already lives on the replica's device.
        outputs = [self.convs[i](shard.to(self.gpus[i]))
                   for i, shard in enumerate(shards)]
        return outputs, 'dp'

## Situations to consider

![](fig/hypar_connection.png)

#### Baseline

In [4]:
# Random test input of shape (16, 8, 7, 7), shared by all tests below.
in_data = torch.randn(16,8,7,7)
# Two reference convolutions (8 -> 8 channels, kernel size 3) on CUDA device 2.
conv1 = nn.Conv2d(8,8,3).cuda(2)
conv2 = nn.Conv2d(8,8,3).cuda(2)

In [5]:
# Baseline forward pass: conv1 followed by conv2, both on CUDA device 2.
a = conv1(in_data.cuda(2))
b = conv2(a)
# Reduce each activation to a scalar sum; these are the reference values.
a_sum = a.sum()
b_sum = b.sum()
# Display both sums as the cell output.
a_sum, b_sum

(tensor(-36.9450, device='cuda:2', grad_fn=<SumBackward0>),
 tensor(-36.1029, device='cuda:2', grad_fn=<SumBackward0>))

In [6]:
# Back-propagate from the final sum, then display the summed weight gradients
# of both baseline layers as reference values.
b_sum.backward()
conv1.weight.grad.sum(), conv2.weight.grad.sum()

(tensor(-36.6203, device='cuda:2'), tensor(-1466.1299, device='cuda:2'))

#### dp-dp test

In [7]:
# dp-dp: wrap both baseline convolutions as data-parallel layers on devices 0 and 1.
dp_conv1 = data_parallel_conv2d(conv1, [0,1])
dp_conv2 = data_parallel_conv2d(conv2, [0,1])

In [8]:
# dp -> dp forward pass; each layer returns (list of per-device outputs, 'dp').
dp_a = dp_conv1(in_data.cuda(0))
dp_b = dp_conv2(dp_a)
# Add the two per-device outputs on CUDA device 0 and reduce to a scalar, for
# comparison with the baseline sums.
dp_a_sum = (dp_a[0][0] + dp_a[0][1].cuda(0)).sum()
dp_b_sum = (dp_b[0][0] + dp_b[0][1].cuda(0)).sum()
# Display both sums as the cell output.
dp_a_sum, dp_b_sum

(tensor(-36.9450, device='cuda:0', grad_fn=<SumBackward0>),
 tensor(-36.1029, device='cuda:0', grad_fn=<SumBackward0>))

In [9]:
# Back-propagate, then display per layer the weight-gradient sums of both
# replicas added together on CUDA device 0.
dp_b_sum.backward()
dp_conv1.convs[0].weight.grad.sum() + dp_conv1.convs[1].weight.grad.sum().cuda(0),\
    dp_conv2.convs[0].weight.grad.sum() + dp_conv2.convs[1].weight.grad.sum().cuda(0)

(tensor(-36.6203, device='cuda:0'), tensor(-1466.1299, device='cuda:0'))

#### Pass!

#### mp-mp test

In [10]:
# mp-mp: wrap both baseline convolutions as model-parallel layers on devices 0 and 1.
mp_conv1 = model_parallel_conv2d(conv1, [0,1])
mp_conv2 = model_parallel_conv2d(conv2, [0,1])

In [11]:
# mp -> mp forward pass; each layer returns (list of per-device copies, 'mp').
mp_a = mp_conv1(in_data.cuda(0))
mp_b = mp_conv2(mp_a)
# Display the sums of the first copy of each layer's output.
(mp_a[0][0]).sum(), (mp_b[0][0]).sum(), 

(tensor(-36.9450, device='cuda:0', grad_fn=<SumBackward0>),
 tensor(-36.1029, device='cuda:0', grad_fn=<SumBackward0>))

In [12]:
# Back-propagate from the sum of the first copy of the final output.
(mp_b[0][0]).sum().backward()

In [13]:
# Display per layer the weight-gradient sums of both channel slices added
# together on CUDA device 0.
(mp_conv1.convs[0].weight.grad.sum()+mp_conv1.convs[1].weight.grad.sum().cuda(0)),\
    (mp_conv2.convs[0].weight.grad.sum() + mp_conv2.convs[1].weight.grad.sum().cuda(0))

(tensor(-36.6203, device='cuda:0'), tensor(-1466.1299, device='cuda:0'))

#### Pass!

#### mp-dp test

In [14]:
# mp-dp: model-parallel first layer feeding a data-parallel second layer,
# both on devices 0 and 1.
mp_conv1 = model_parallel_conv2d(conv1, [0,1])
dp_conv2 = data_parallel_conv2d(conv2, [0,1])

In [15]:
# mp -> dp forward pass: the 'mp'-tagged output is passed straight to the
# data-parallel layer.
mp_a = mp_conv1(in_data.cuda(0))
dp_b = dp_conv2(mp_a)

In [16]:
# Add the two per-device outputs of the dp layer on CUDA device 0 and reduce to a scalar.
dp_b_sum = (dp_b[0][0]+dp_b[0][1].cuda(0)).sum()
# Display the sum of the first mp copy together with the dp sum.
(mp_a[0][0]).sum(), dp_b_sum

(tensor(-36.9450, device='cuda:0', grad_fn=<SumBackward0>),
 tensor(-36.1029, device='cuda:0', grad_fn=<SumBackward0>))

In [17]:
# Back-propagate from the combined output sum of the dp layer.
dp_b_sum.backward()

In [18]:
# Display per layer the weight-gradient sums of both sub-convolutions added
# together on CUDA device 0.
(mp_conv1.convs[0].weight.grad.sum()+mp_conv1.convs[1].weight.grad.sum().cuda(0)),\
    (dp_conv2.convs[0].weight.grad.sum() + dp_conv2.convs[1].weight.grad.sum().cuda(0))

(tensor(-36.6203, device='cuda:0'), tensor(-1466.1298, device='cuda:0'))

#### Pass!

#### dp-mp test

In [19]:
# dp-mp: data-parallel first layer feeding a model-parallel second layer,
# both on devices 0 and 1.
dp_conv1 = data_parallel_conv2d(conv1, [0,1])
mp_conv2 = model_parallel_conv2d(conv2, [0,1])

In [20]:
# dp -> mp forward pass: the 'dp'-tagged output is passed straight to the
# model-parallel layer.
dp_a = dp_conv1(in_data.cuda(0))
mp_b = mp_conv2(dp_a)

In [21]:
# Display the combined sum of the dp outputs and the sum of the first mp copy.
(dp_a[0][0]+dp_a[0][1].cuda(0)).sum(), (mp_b[0][0]).sum()

(tensor(-36.9450, device='cuda:0', grad_fn=<SumBackward0>),
 tensor(-36.1029, device='cuda:0', grad_fn=<SumBackward0>))

In [22]:
# Back-propagate from the sum of the first copy of the final output.
(mp_b[0][0]).sum().backward()

In [23]:
# Display per layer the weight-gradient sums of both sub-convolutions added
# together on CUDA device 0.
(dp_conv1.convs[0].weight.grad.sum()+dp_conv1.convs[1].weight.grad.sum().cuda(0)),\
    (mp_conv2.convs[0].weight.grad.sum() + mp_conv2.convs[1].weight.grad.sum().cuda(0))

(tensor(-36.6203, device='cuda:0'), tensor(-1466.1299, device='cuda:0'))

#### Pass!