# 硬件

```python
# Element-wise loop: one Python-level indexing and addition per element.
for i in range(len(a)):
    c[i] = b[i] + a[i]

# Vectorized form: a single expression that adds the whole arrays at once.
c = a + b
```

后者比前者更优：
- 前者需要调用n次函数，每次均有开销
- 后者更容易被并行执行

```c++
// `parallel for` both creates the thread team and splits the iterations
// across it; a bare `omp for` outside a parallel region runs on one thread.
#pragma omp parallel for
for (int i = 0; i < a.size(); i++)
    c[i] = a[i] + b[i];
```

- TPU:使用脉动阵列（systolic array）加速矩阵运算
- 多GPU:模型并行&数据并行

# QA

- 增加高质量的数据是提高泛化性的最优手段
- 框架会要求显式指定数据在GPU还是CPU上
- 复现论文：注重每一句话，但是只有20%才能实现
    - 看看别人实现的代码，通常与论文不同

In [3]:
%matplotlib inline
import torch
from torch import nn
from torch.nn import functional as F
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline


In [5]:
# Weights are standard-normal samples shrunk by `scale`; biases start at zero.
scale = 0.01
# Weight shapes, in layer order:
#   W1: conv, 3x3 kernel, 1 input channel -> 20 output channels
#   W2: conv, 5x5 kernel, 20 input channels -> 50 output channels
#   W3: dense, 800 inputs -> 128 outputs
#   W4: dense, 128 inputs -> 10 outputs
W1, W2, W3, W4 = (
    scale * torch.randn(*shape)
    for shape in ((20, 1, 3, 3), (50, 20, 5, 5), (800, 128), (128, 10))
)
# One bias vector per layer, sized to that layer's output count.
b1, b2, b3, b4 = (torch.zeros(width) for width in (20, 50, 128, 10))
# Flat list consumed by lenet(): weight/bias pairs in layer order.
param = [W1, b1, W2, b2, W3, b3, W4, b4]

def lenet(X, params):
    """Run a LeNet-style forward pass using an explicit parameter list.

    Args:
        X: Input batch fed to the first convolution.
        params: Sequence laid out as [conv1 weight, conv1 bias, conv2 weight,
            conv2 bias, dense1 weight, dense1 bias, dense2 weight, dense2 bias].

    Returns:
        The output of the final dense layer (no softmax applied).
    """
    features = X
    # Two identical stages: convolution -> ReLU -> 2x2 average pooling.
    for weight_idx in (0, 2):
        convolved = F.conv2d(features, params[weight_idx], params[weight_idx + 1])
        features = F.avg_pool2d(F.relu(convolved), kernel_size=(2, 2), stride=2)

    # Flatten everything except the batch dimension for the dense layers.
    flat = features.reshape(features.shape[0], -1)
    hidden = F.relu(torch.mm(flat, params[4]) + params[5])
    return torch.mm(hidden, params[6]) + params[7]

# Cross-entropy loss; reduction="none" returns one loss value per sample
# rather than a single averaged scalar.
loss = nn.CrossEntropyLoss(reduction="none")

In [7]:
def get_params(params, device):
    """Copy parameters onto `device` as independent, trainable leaf tensors.

    Args:
        params: Iterable of tensors to copy.
        device: Target `torch.device` for the copies.

    Returns:
        A new list of tensors on `device`, each with `requires_grad=True`.
        The inputs are left untouched.
    """
    # detach() before clone(): if a source tensor already requires grad, a plain
    # clone would be a non-leaf node, so backward() would never fill its .grad
    # and gradients would leak back into the source tensor instead.
    return [p.detach().clone().to(device).requires_grad_() for p in params]

def try_gpu(i=0):
    """Return CUDA device `i` if that many GPUs are visible, else the CPU device."""
    visible_gpus = torch.cuda.device_count()
    device_name = f"cuda:{i}" if visible_gpus >= i + 1 else "cpu"
    return torch.device(device_name)

# Copy the parameter list to the device chosen by try_gpu(0).
new_params = get_params(param, try_gpu(0))
# Index 1 of the list is b1; show its values and its (so far unset) gradient.
print("b1 weight:", new_params[1])
print("b1 grad:", new_params[1].grad)

b1 weight: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0', requires_grad=True)
b1 grad: None


In [11]:
def allreduce(data):
    """Sum the tensors in `data` and write the sum back into every one of them.

    All tensors are modified in place, so any other reference to the same
    tensor objects observes the reduced value.

    Args:
        data: List of same-shaped tensors, possibly on different devices.
            An empty list is a no-op.
    """
    if not data:
        return
    root, others = data[0], data[1:]
    # Accumulate everything on the first tensor's device.
    for other in others:
        root.add_(other.to(root.device))
    # Broadcast in place: copy_ handles the cross-device transfer and, unlike
    # rebinding data[i], also updates tensors passed in through a temporary list.
    for other in others:
        other.copy_(root)

# Disabled alternative: place the i-th tensor on the i-th GPU instead.
# data = [torch.ones((1, 2), dtype=torch.float32, device=try_gpu(i)) * (i + 1) for i in range(2)]
# Demo devices: the CPU plus whatever try_gpu() returns.
device = [torch.device('cpu'), try_gpu()]
# The i-th tensor is filled with the value i + 1 and created on the i-th device.
data = [torch.ones((1, 2), dtype=torch.float32, device=d) * (i + 1) for i, d in enumerate(device)]
print(data)
allreduce(data)
print(data)

[tensor([[1., 1.]]), tensor([[2., 2.]], device='cuda:0')]
[tensor([[3., 3.]]), tensor([[3., 3.]], device='cuda:0')]


In [17]:
# Split a batch into per-device shards.
data = torch.arange(20).reshape(4, 5)
# Original note: the built-in scatter cannot be used with a CPU device,
# so the call below is disabled in favour of a hand-written scatter.
# split = torch.nn.parallel.scatter(data, device)
def scatter(data, device):
    """Split `data` along dim 0 into equal shards, one per entry of `device`.

    Works for tensors of any rank >= 1 (e.g. 1-D label vectors as well as
    2-D feature matrices).

    Args:
        data: Tensor whose first dimension is divisible by `len(device)`.
        device: Sequence of target devices; shard `i` is moved to `device[i]`.

    Returns:
        List of tensors, in device order.

    Raises:
        AssertionError: If the first dimension is not divisible by the
            number of devices.
    """
    num_device = len(device)
    assert data.shape[0] % num_device == 0, (
        "first dimension must be divisible by the number of devices"
    )
    length = data.shape[0] // num_device
    # Slice only dim 0: a trailing `, :` would reject 1-D tensors.
    return [
        data[i * length:(i + 1) * length].to(target)
        for i, target in enumerate(device)
    ]
# Shard the demo batch across the entries of `device` and show each shard.
split = scatter(data, device)
print(split)

[tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]]), tensor([[10, 11, 12, 13, 14],
        [15, 16, 17, 18, 19]], device='cuda:0')]


In [18]:
def split_batch(X, y, devices):
    """Shard features `X` and labels `y` across `devices`.

    Both inputs must have the same size along dim 0.

    Returns:
        A tuple `(X_shards, y_shards)` of per-device lists.
    """
    assert X.shape[0] == y.shape[0]
    feature_shards = scatter(X, devices)
    label_shards = scatter(y, devices)
    return feature_shards, label_shards

In [None]:
def train(X, y, device_params, devices, lr):
    """Run one data-parallel SGD step on a single minibatch.

    The batch is sharded across `devices`, each device computes the loss and
    gradients for its shard with its own parameter copy, the gradients are
    summed across devices, and every copy takes the same SGD step.

    Args:
        X: Input batch; its first dimension must be divisible by the
            number of devices.
        y: Labels for `X`, same size along dim 0.
        device_params: One parameter list per device, in the layout
            expected by `lenet`.
        devices: Devices to shard the batch across, aligned with
            `device_params`.
        lr: Learning rate.
    """
    X_shards, y_shards = split_batch(X, y, devices)
    # Sum the per-sample losses so each shard yields a scalar for backward().
    shard_losses = [
        loss(lenet(X_shard, device_w), y_shard).sum()
        for X_shard, y_shard, device_w in zip(X_shards, y_shards, device_params)
    ]
    for shard_loss in shard_losses:
        shard_loss.backward()

    batch_size = X.shape[0]
    with torch.no_grad():
        for i in range(len(device_params[0])):
            # Sum the gradient of parameter i over all devices. After the
            # call, grads[d] holds the total on device d.
            grads = [params[i].grad for params in device_params]
            allreduce(grads)
            for params, grad in zip(device_params, grads):
                # Losses were summed, so divide by the full batch size to
                # step along the mean gradient.
                params[i] -= lr * grad / batch_size
                # Reset so the next call does not accumulate stale gradients.
                params[i].grad.zero_()

# 简洁实现
```python
# Wrap the model in nn.DataParallel, passing `devices` as the device ids to use.
net = nn.DataParallel(net, device_ids = devices)
```

## 目前应当使用DDP而不是DP!!

- 理论上，GPU数量变为n倍时，batch_size和lr也应同步变为n倍（num_gpu * n, batch_size * n, lr * n）
- 分布式需要考虑带宽
- batch_size=1时，理论精度最佳
- batch_size过大时，需要考虑lr的优化算法
- 一般来说，$\text{batch-size}\sim 10\times\text{num-classes}$