# 6-1 マルチGPU

Single / Dual / Quad GPU での CIFAR10 の学習時間を比較する。<br>
PyTorch で実装した。

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

import time

## データの準備

In [2]:
# Send HTTP and HTTPS traffic through the proxy before anything is downloaded.
import os
os.environ.update({
    "http_proxy": "http://proxy.uec.ac.jp:8080/",
    "https_proxy": "http://proxy.uec.ac.jp:8080/",
})

# Mini-batch size shared by the training and the test loader.
batch_size = 512

# Only ToTensor is applied; the normalizing variant below is kept disabled.
transform = transforms.Compose([transforms.ToTensor()])
#transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR10 training and test splits, both stored under ./data/CIFAR10.
train_set = CIFAR10(root='./data/CIFAR10', train=True, transform=transform, download=True)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=2)

# The test loader keeps the dataset order (shuffle=False).
test_set = CIFAR10(root='./data/CIFAR10', train=False, transform=transform, download=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


## CNNの定義

In [3]:
# Layer layouts of the supported VGG variants. An integer is the number of
# output channels of one conv block; 'M' inserts a 2x2 max-pooling step.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    """VGG-style CNN with batch normalization, built from an entry of ``cfg``.

    Every layout in ``cfg`` contains five 'M' entries and ends with 512
    channels, so a 32x32 input is reduced to 512 flattened features, which is
    what the linear classifier expects.

    Args:
        vgg_name: Key into ``cfg`` ('VGG11', 'VGG13', 'VGG16' or 'VGG19').
            An unknown key raises ``KeyError``.
        num_classes: Number of outputs of the classifier. Defaults to 10.
    """

    def __init__(self, vgg_name, num_classes=10):
        super().__init__()
        self.features = self._make_layers(cfg[vgg_name])
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the classifier output for the image batch ``x``."""
        out = self.features(x)
        # Flatten everything except the batch dimension.
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        """Translate a layout list into an ``nn.Sequential`` feature extractor.

        Args:
            cfg: Layout list; 'M' adds a max-pooling layer, any other entry
                adds Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU with that
                many output channels.
        """
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                # The next conv block consumes what this one produced.
                in_channels = x
        # With kernel_size=1 and stride=1 this pooling leaves the tensor
        # unchanged; it is kept so the layer list stays as it was.
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)

    
class Net(nn.Module):
    """Small CNN: two 5x5 conv + max-pool stages followed by three linear layers.

    ``fc1`` expects 16 * 5 * 5 input features, which is what the two
    conv/pool stages produce for a 3-channel 32x32 input.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 class scores for the image batch ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten each sample to a vector before the linear layers.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of all non-batch dims)."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


# Model used for the timing runs: the VGG19 layout.
net = VGG(vgg_name='VGG19')

## 損失関数

In [4]:
# Cross-entropy loss on the class scores produced by the network.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over all parameters of the network.
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)

## GPU

In [5]:
# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the model only once: re-running this cell would otherwise nest
# DataParallel wrappers, which would distort the timings measured below.
if not isinstance(net, nn.DataParallel):
    net = nn.DataParallel(net)
net = net.to(device)
# print(net)

## 実行

In [6]:
# Train for a fixed number of epochs, print the per-epoch loss/accuracy and
# report the elapsed time of the whole run.
net.train()
num_epochs = 20
dataset_sizes = len(train_set)

print('# mnibatch size : {}'.format(batch_size))
print('# epoch : {}'.format(num_epochs))
print()

# perf_counter is the standard-library clock intended for measuring intervals.
start = time.perf_counter()
for epoch in range(num_epochs):
    print('Epoch {}  '.format(epoch+1), end='')

    running_loss = 0.0
    running_corrects = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below gives a per-sample average
        # (assumes the criterion returns a batch mean -- TODO confirm).
        running_loss += loss.item() * inputs.size(0)
        # Accumulate a plain Python int, so the epoch accuracy can be
        # computed even when the loader yielded no batch.
        running_corrects += (preds == labels).sum().item()

    epoch_loss = running_loss / dataset_sizes
    epoch_acc = running_corrects / dataset_sizes

    print('loss:{:.4f} acc:{:.4f}'.format(epoch_loss, epoch_acc))

end = time.perf_counter()
print()
print('time: {}'.format(end-start) + ' [sec]')

# mnibatch size : 512
# epoch : 20

Epoch 1  loss:1.6747 acc:0.3867
Epoch 2  loss:1.1719 acc:0.5780
Epoch 3  loss:0.9409 acc:0.6666
Epoch 4  loss:0.7576 acc:0.7345
Epoch 5  loss:0.6056 acc:0.7915
Epoch 6  loss:0.4751 acc:0.8366
Epoch 7  loss:0.3687 acc:0.8748
Epoch 8  loss:0.2686 acc:0.9106
Epoch 9  loss:0.1975 acc:0.9368
Epoch 10  loss:0.1336 acc:0.9574
Epoch 11  loss:0.1009 acc:0.9689
Epoch 12  loss:0.0822 acc:0.9751
Epoch 13  loss:0.0740 acc:0.9769
Epoch 14  loss:0.0666 acc:0.9788
Epoch 15  loss:0.0516 acc:0.9843
Epoch 16  loss:0.0459 acc:0.9859
Epoch 17  loss:0.0346 acc:0.9898
Epoch 18  loss:0.0241 acc:0.9936
Epoch 19  loss:0.0116 acc:0.9978
Epoch 20  loss:0.0069 acc:0.9990

time: 283.86269307136536 [sec]


## 評価

In [7]:
# Count correct top-1 predictions over the test loader with gradients disabled.
net.eval()
total = 0
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = net(images)
        # Index of the largest score per sample is the predicted class.
        predicted = torch.max(outputs.data, 1)[1]
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# %d truncates the percentage to an integer.
print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))

Accuracy of the network on the 10000 test images: 72 %


### Batch Size: 256 (epoch:3)

single : time: 79.15005564689636 [sec] <br>
dual   : time: 147.88250350952148 [sec] <br>


### Batch Size: 512 (epoch:20)

single : time: 327.1954674720764 [sec] Accuracy of the network on the 10000 test images: 72 % <br>
dual   : time: 306.4763495922088 [sec] Accuracy of the network on the 10000 test images: 73 % <br>
quad   : time: 283.8626930713653 [sec] Accuracy of the network on the 10000 test images: 72 % <br>