# デバイスによる演算結果の違いについての検証

mpsを使うと学習が上手くいかないので検証する．

適当にGANを作ってみる


---

## 準備

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import torchvision
from torchvision.datasets import MNIST
from torchvision import transforms
from IPython.display import display

batch_size = 64
nz = 10

In [2]:
def make_noise(batch_size, device):
    return torch.randn(batch_size, nz, 1, 1).to(device)

def write(netG, device, n_rows=1, n_cols=8, size=64):
    z = make_noise(n_rows*n_cols, device)
    images = netG(z)
    images = transforms.Resize(size)(images)
    img = torchvision.utils.make_grid(images, n_cols)
    img = transforms.functional.to_pil_image(img)
    display(img)

### MNIST

In [3]:
dataset = MNIST(
    root="data/",
    train=True,
    download=True,
    transform=transforms.ToTensor()
)

dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)

sample_x, _ = next(iter(dataloader))
print("batch shape: ", sample_x.shape)

batch shape:  torch.Size([64, 1, 28, 28])


### Discriminator

In [4]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            self._conv(1, 16, 4, 2, 1),
            self._conv(16, 32, 4, 2, 1),
            self._conv(32, 64, 3, 2, 0),
            nn.Conv2d(64, 128, 3, 1, 0),
            nn.Flatten(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def _conv(self, in_channels, out_channels, kernel_size, stride, padding):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.2)
        )

    def forward(self, x):
        y = self.net(x)
        return y

### Generator

In [5]:
class Generator(nn.Module):
    def __init__(self, nz):
        super().__init__()
        self.net = nn.Sequential(
            self._convT(nz, 128, 3, 1, 0),
            self._convT(128, 64, 3, 2, 0),
            self._convT(64, 32, 4, 2, 1),
            nn.ConvTranspose2d(32, 1, 4, 2, 1),
            nn.Sigmoid()
        )

    def _convT(self, in_channels, out_channels, kernel_size, stride, padding):
        return nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride, padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU()
        )

    def forward(self, x):
        y = self.net(x)
        return y


---

## 学習 (CPU)

In [6]:
torch.manual_seed(124)

device = torch.device("cpu")
device

device(type='cpu')

In [7]:
netD = Discriminator().to(device)
netG = Generator(nz).to(device)
optimD = optim.Adam(netD.parameters(), lr=0.0002)
optimG = optim.Adam(netG.parameters(), lr=0.0002)

fake_labels = torch.zeros(batch_size, 1).to(device)
real_labels = torch.ones(batch_size, 1).to(device)
criterion = nn.BCELoss()

In [8]:
pre_netG_weight_cpu = netG.net[0][0].weight[0, 0].detach().clone()
pre_netD_weight_cpu = netD.net[0][0].weight[0, 0].detach().clone()

In [9]:
lim = 100

for i, (X, _) in enumerate(dataloader):
    X = X.to(device)
    optimD.zero_grad()
    optimG.zero_grad()

    z = make_noise(batch_size, device)
    fake = netG(z)
    pred_fake = netD(fake)
    pred_real = netD(X)
    loss_fake = criterion(pred_fake, fake_labels)
    loss_real = criterion(pred_real, real_labels)
    lossD = loss_fake + loss_real
    lossD.backward()
    optimD.step()

    fake = netG(z)
    pred = netD(fake)
    lossG = criterion(pred, real_labels)
    lossG.backward()
    optimG.step()

    if i >= lim:
        break

In [10]:
netG_weight_cpu = netG.net[0][0].weight[0, 0].detach().clone()
netD_weight_cpu = netD.net[0][0].weight[0, 0].detach().clone()


---

## 学習 (mps)

In [11]:
torch.manual_seed(124)

device = torch.device("mps")
device

device(type='mps')

In [12]:
netD = Discriminator().to(device)
netG = Generator(nz).to(device)
optimD = optim.Adam(netD.parameters(), lr=0.0002)
optimG = optim.Adam(netG.parameters(), lr=0.0002)

fake_labels = torch.zeros(batch_size, 1).to(device)
real_labels = torch.ones(batch_size, 1).to(device)
criterion = nn.BCELoss()

In [13]:
pre_netG_weight_mps = netG.net[0][0].weight[0, 0].detach().clone()
pre_netD_weight_mps = netD.net[0][0].weight[0, 0].detach().clone()

In [14]:
lim = 10

for i, (X, _) in enumerate(dataloader):
    X = X.to(device)
    optimD.zero_grad()
    optimG.zero_grad()

    z = make_noise(batch_size, device)
    fake = netG(z)
    pred_fake = netD(fake)
    pred_real = netD(X)
    loss_fake = criterion(pred_fake, fake_labels)
    loss_real = criterion(pred_real, real_labels)
    lossD = loss_fake + loss_real
    lossD.backward()
    optimD.step()

    fake = netG(z)
    pred = netD(fake)
    lossG = criterion(pred, real_labels)
    lossG.backward()
    optimG.step()

    if i >= lim:
        break

In [15]:
netG_weight_mps = netG.net[0][0].weight[0, 0].detach().clone()
netD_weight_mps = netD.net[0][0].weight[0, 0].detach().clone()


---

## 結果

CPUとmpsで学習したモデルの重み（の一部）を比較する

In [16]:
def format_weights(weights):
    return " ".join(map(lambda x: f"{x:>7.4f}", weights.ravel().tolist()))
line = lambda : print('-'*150)

print("学習前の生成器")
print("CPU: ", format_weights(pre_netG_weight_cpu))
print("mps: ", format_weights(pre_netG_weight_mps))

line()

print("学習後の生成器")
print("CPU: ", format_weights(netG_weight_cpu))
print("mps: ", format_weights(netG_weight_mps))

line()

print("学習前の識別器")
print("CPU: ", format_weights(pre_netD_weight_cpu))
print("mps: ", format_weights(pre_netD_weight_mps))

line()

print("学習後の識別器")
print("CPU: ", format_weights(netD_weight_cpu))
print("mps: ", format_weights(netD_weight_mps))

学習前の生成器
CPU:   0.0043 -0.0239 -0.0005  0.0243  0.0112  0.0217  0.0083 -0.0101  0.0203
mps:   0.0043 -0.0239 -0.0005  0.0243  0.0112  0.0217  0.0083 -0.0101  0.0203
------------------------------------------------------------------------------------------------------------------------------------------------------
学習後の生成器
CPU:   0.0045 -0.0292  0.0041  0.0176  0.0227  0.0148  0.0046 -0.0038  0.0267
mps:   0.0029 -0.0257 -0.0005  0.0230  0.0115  0.0220  0.0079 -0.0113  0.0211
------------------------------------------------------------------------------------------------------------------------------------------------------
学習前の識別器
CPU:  -0.1737  0.0656  0.1703 -0.0129  0.0061 -0.1885 -0.0951 -0.0168 -0.0076 -0.1392 -0.1794 -0.0265  0.2492  0.2102 -0.1292  0.0305
mps:  -0.1737  0.0656  0.1703 -0.0129  0.0061 -0.1885 -0.0951 -0.0168 -0.0076 -0.1392 -0.1794 -0.0265  0.2492  0.2102 -0.1292  0.0305
----------------------------------------------------------------------------------------------

学習前はちゃんと同じ値になっているが，学習後は少し誤差が生じている．

学習が上手くいかないのは，この誤差が結構致命的なものになっているからなのかな．

<br>

GPUによる演算の誤差について
- [ChainerやTensorFlowでGPUを使うと毎回結果が変わる理由と対策 (まとめ) - Qiita](https://qiita.com/TokyoMickey/items/63c4053740ab1f3f28a2a)