## 예제 3-5) 사람의 손글씨 데이터인 MNIST를 이용해 MLP 설계할 때 Dropout & ReLU & Batch Normalization & He Uniform Initialization & Adam 적용해보기

다양한 요소를 조절해봤는데, 그중 가장 중요한 것이 바로 학습에 이용되는 Optimizer.  
Optimizer 중에서도 가장 자주 이용되는 Adamdmf 이용해 실습해보자

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn                           
import torch.nn.functional as F
from torchvision import transforms, datasets 

In [2]:
if torch.cuda.is_available() :
    DEVICE = torch.device('cuda')
else :
    DEVICE = torch.device('cpu')
    
print('Using PyTorch version : ', torch.__version__, ' Device : ', DEVICE)

Using PyTorch version :  1.9.0  Device :  cpu


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
BATCH_SIZE = 32
EPOCHS = 10

In [4]:
train_dataset = datasets.MNIST(root = "../data/MNIST",
                              train = True,
                              download = True,
                              transform = transforms.ToTensor())
test_dataset = datasets.MNIST(root = "../data/MNIST",
                              train = False,
                              transform = transforms.ToTensor())
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                          batch_size = BATCH_SIZE,
                                          shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                          batch_size = BATCH_SIZE,
                                          shuffle = False)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


nn.BatchNorm( ) 함수를 적용하는 부분은 논문 / 코드에 따라 Activation Function 전, 후가 달라질 수 있음.  
이 예제에서는 이전에 적용해보자.

In [5]:
class Net(nn.Module) : 
    def __init__(self) :  
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout_prob = 0.5
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.batch_norm2 = nn.BatchNorm1d(256)
        
    def forward(self, x) :
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = F.dropout(x, training = self.training, p = self.dropout_prob)
        x = self.fc2(x)
        x = self.batch_norm2(x)
        x = F.relu(x)
        x = F.dropout(x, training = self.training, p = self.dropout_prob)
        x = self.fc3(x)
        x = F.log_softmax(x, dim = 1)
        return x

In [6]:
import torch.nn.init as init

In [7]:
def weight_init(m) :                              
    if isinstance(m, nn.Linear) :                 
        init.kaiming_uniform_(m.weight.data)      

In [8]:
model = Net().to(DEVICE)
model.apply(weight_init)                                                  
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)  # (1)
criterion = nn.CrossEntropyLoss()

print(model)

Net(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=10, bias=True)
  (batch_norm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


(1) optimizer 정의는 한 줄만 변경하면 됨. Adam은 RMSProp + Momentum으로 다양한 optimizer 중 기본적으로 자주 사용 됨.

In [9]:
def train(model, train_loader, optimizer, log_interval) :
    model.train()            
    for batch_idx, (image, label) in enumerate(train_loader) :
        image = image.to(DEVICE)                                                               
        label = label.to(DEVICE)                                                               
        optimizer.zero_grad()                                                                  
        output = model(image)                                                                  
        loss = criterion(output, label)                                                        
        loss.backward()                                                                        
        optimizer.step()                                                                       
        
        if batch_idx % log_interval == 0 :
            print("Train Eppoch : {} [{}/{}({:.0f}%)]\tTrain Loss : {:.6f}".format(
                Epoch, batch_idx * len(image), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [10]:
def evaluate(model, test_loader) :
    model.eval()
    test_loss = 0
    correct = 0
    
    with torch.no_grad() :
        for image, label in test_loader :
            image = image.to(DEVICE)
            label = label.to(DEVICE)
            output = model(image)  
            test_loss += criterion(output, label).item()
            prediction = output.max(1, keepdim = True)[1]
            correct += prediction.eq(label.view_as(prediction)).sum().item() 
            
    test_loss /= len(test_loader.dataset) 
    test_accuracy = 100. * correct / len(test_loader.dataset) 
    return test_loss, test_accuracy

In [11]:
for Epoch in range(1, EPOCHS + 1) :
    train(model, train_loader, optimizer ,log_interval = 200)    # (1)
    test_loss, test_accuracy = evaluate(model, test_loader)      # (2)
    print("\n[EPOCH : {}], \tTest Loss : {:.4f}, \tTest Accuracy : {:.2f} %\n".format(Epoch, test_loss, test_accuracy))


[EPOCH : 1], 	Test Loss : 0.0039, 	Test Accuracy : 96.03 %


[EPOCH : 2], 	Test Loss : 0.0032, 	Test Accuracy : 96.68 %


[EPOCH : 3], 	Test Loss : 0.0030, 	Test Accuracy : 97.06 %


[EPOCH : 4], 	Test Loss : 0.0030, 	Test Accuracy : 96.89 %


[EPOCH : 5], 	Test Loss : 0.0026, 	Test Accuracy : 97.45 %


[EPOCH : 6], 	Test Loss : 0.0023, 	Test Accuracy : 97.95 %


[EPOCH : 7], 	Test Loss : 0.0025, 	Test Accuracy : 97.59 %


[EPOCH : 8], 	Test Loss : 0.0024, 	Test Accuracy : 97.74 %


[EPOCH : 9], 	Test Loss : 0.0021, 	Test Accuracy : 97.95 %


[EPOCH : 10], 	Test Loss : 0.0022, 	Test Accuracy : 97.88 %

