### acc 85% 이상으로 구현해보자

# Convolutional AutoEncoder (CAE) + Classifier 

- Hybrid Model : CAE 를 통한 image reconstructor와 CAE 내부의 encoder를 통해 classification을 동시에 진행 -> 성능 향상

        다중 작업 학습(Multi-Task Learning) 이라고 칭하기도 함

- 구조 : CAE 의 encoder를 공유하여 학습 데이터(image) 를 latent vector로 압축하고, latent vector를 decoder와 classifier가 공유함

In [1]:
import torch

from torchvision.transforms import Compose, ToTensor, Normalize, RandomHorizontalFlip, RandomCrop, ColorJitter, RandomRotation
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torch.nn import Module, Sequential, Linear, Conv2d, ReLU, MaxPool2d, ConvTranspose2d, Tanh, MSELoss, CrossEntropyLoss, Flatten, BatchNorm2d, Dropout
from torch.optim import Adam, lr_scheduler

from time import time
from math import floor

In [2]:
# Training-time augmentation pipeline.
# ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2): randomly perturbs
#   brightness, contrast and saturation.
# RandomRotation(15): rotates by a random angle in [-15, 15] degrees.
# Normalize with mean 0.5 / std 0.5 per channel maps ToTensor's [0, 1] output to [-1, 1].
transform_train = Compose(
    [
        RandomHorizontalFlip(),
        RandomCrop(size=32, padding=4),
        ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        RandomRotation(degrees=15),
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 training split; downloaded into ./data_cifar10 when not already present.
train = CIFAR10(
    root='data_cifar10',
    train=True,
    transform=transform_train,
    download=True,
)

In [None]:
# Evaluation pipeline: no augmentation, only tensor conversion and the same
# per-channel normalization (mean 0.5, std 0.5) that the training pipeline applies.
transform_test = Compose(
    [
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 test split; downloaded into ./data_cifar10 when not already present.
test = CIFAR10(
    root='data_cifar10',
    train=False,
    transform=transform_test,
    download=True,
)

In [4]:
# Mini-batch size shared by both loaders.
# The constant was previously declared as 256 but never used, while both loaders
# hard-coded 128; it now holds the value that was actually in effect and is the
# single place to change it.
batch_size = 128
trainset = DataLoader(train, batch_size=batch_size, shuffle=True)
# Shuffling does not change the accuracy computed over the whole test set; it only
# randomizes which batch is drawn first from this loader.
testset = DataLoader(test, batch_size=batch_size, shuffle=True)

In [None]:
class ConvolutionalAutoEncoder(Module):
    """Convolutional autoencoder with a classification head on the shared latent vector.

    The encoder compresses a ``(N, 3, 32, 32)`` image batch into a latent vector.
    That vector feeds two heads:

    * a decoder that reconstructs the input image (output squashed to ``[-1, 1]``
      by ``Tanh``), and
    * an MLP classifier that produces class logits.

    Args:
        latent_space: Size of the latent vector shared by decoder and classifier.
        num_classes: Number of output logits of the classifier.

    Raises:
        ValueError: If ``latent_space`` or ``num_classes`` is smaller than 1.
    """

    # Shape of the encoder feature map after four 2x2 poolings of a 32x32 input:
    # (channels, height, width). Kept in one place so the linear layers and the
    # reshape in ``forward`` cannot drift apart.
    _BOTTLENECK_SHAPE = (256, 2, 2)

    def __init__(self, latent_space: int = 256, num_classes: int = 10):
        super().__init__()

        if latent_space < 1:
            raise ValueError(f"latent_space must be >= 1, got {latent_space}")
        if num_classes < 1:
            raise ValueError(f"num_classes must be >= 1, got {num_classes}")

        channels, height, width = self._BOTTLENECK_SHAPE
        bottleneck_features = channels * height * width

        self.encoder = Sequential(
            # 3 x 32 x 32 -> 32 x 32 x 32 (channels 3 -> 32)
            Conv2d(3, 32, kernel_size=3, padding=1),
            BatchNorm2d(32),
            ReLU(),
            # -> 32 x 16 x 16 (spatial downsampling)
            MaxPool2d(2, 2),

            # -> 64 x 16 x 16 (channels 32 -> 64)
            Conv2d(32, 64, kernel_size=3, padding=1),
            BatchNorm2d(64),
            ReLU(),
            # -> 64 x 8 x 8 (spatial downsampling)
            MaxPool2d(2, 2),

            # -> 128 x 8 x 8 (channels 64 -> 128)
            Conv2d(64, 128, kernel_size=3, padding=1),
            BatchNorm2d(128),
            ReLU(),
            # -> 128 x 4 x 4 (spatial downsampling)
            MaxPool2d(2, 2),

            # -> 256 x 4 x 4 (channels 128 -> 256)
            Conv2d(128, 256, kernel_size=3, padding=1),
            BatchNorm2d(256),
            ReLU(),
            # -> 256 x 2 x 2 (spatial downsampling)
            MaxPool2d(2, 2)
        )
        self.encoder_linear = Sequential(
            # Flatten the feature map so it can enter the latent layer.
            Flatten(),
            Linear(bottleneck_features, latent_space),
            ReLU()
        )

        self.decoder_linear = Sequential(
            # Expand the latent vector back to the flattened bottleneck size.
            Linear(latent_space, bottleneck_features),
            ReLU()
        )

        self.decoder = Sequential(
            # ConvTranspose2d upsamples what the pooling layers shrank.
            # 256 x 2 x 2 -> 128 x 4 x 4
            ConvTranspose2d(256, 128, kernel_size=2, stride=2),
            BatchNorm2d(128),
            ReLU(),
            # 128 x 4 x 4 -> 64 x 8 x 8
            ConvTranspose2d(128, 64, kernel_size=2, stride=2),
            BatchNorm2d(64),
            ReLU(),
            # 64 x 8 x 8 -> 32 x 16 x 16
            ConvTranspose2d(64, 32, kernel_size=2, stride=2),
            BatchNorm2d(32),
            ReLU(),
            # 32 x 16 x 16 -> 3 x 32 x 32
            ConvTranspose2d(32, 3, kernel_size=2, stride=2),
            # No batch normalization after the last transposed convolution;
            # Tanh bounds the reconstruction to [-1, 1].
            Tanh()
        )

        self.classifier = Sequential(
            Linear(latent_space, 512),
            ReLU(),
            Dropout(0.3),
            Linear(512, 256),
            ReLU(),
            Dropout(0.3),
            Linear(256, num_classes)
        )

    def forward(self, x):
        """Run both heads on a batch of images.

        Args:
            x: Image batch of shape ``(N, 3, 32, 32)``.

        Returns:
            A tuple ``(reconstructed_img, logits)`` where ``reconstructed_img`` has
            the same shape as ``x`` and ``logits`` has shape ``(N, num_classes)``.
        """
        feature_map = self.encoder(x)
        latent_vector = self.encoder_linear(feature_map)

        # Classification head works directly on the latent vector.
        logits = self.classifier(latent_vector)

        # Reconstruction head: expand, reshape to (N, C, H, W), then upsample.
        # -1 lets the batch dimension be inferred.
        decoder_input = self.decoder_linear(latent_vector)
        reconstructed_img = self.decoder(
            decoder_input.view(-1, *self._BOTTLENECK_SHAPE)
        )

        # Both outputs are returned so the reconstruction loss and the
        # classification loss can be computed from a single forward pass.
        return reconstructed_img, logits

In [6]:
# Instantiate the hybrid autoencoder/classifier; no constructor arguments are passed.
model = ConvolutionalAutoEncoder()

In [7]:
# Loss applied to the classifier logits computed from the encoder output.
classification_loss_function = CrossEntropyLoss()
# Loss applied to the decoder output (image reconstruction).
reconstruction_loss_function = MSELoss()

learning_rate = 1e-3
# weight_decay adds L2 regularization on the parameters.
optimizer = Adam(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=1e-6,
)
# Every 10 epochs (step_size) the learning rate is halved (gamma):
# 0.001 -> 0.0005 -> 0.00025 ... for finer-grained updates late in training.
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.5)

In [None]:
epochs = 50

# Weights applied to each loss term (chosen to sum to 1).
# Classification is weighted more heavily than reconstruction.
classification_weights = 0.9
reconstruction_weights = 0.1

total_time = list()
for epoch in range(epochs):
    # Force training mode on every epoch. Without this, re-running this cell after
    # an evaluation cell has called model.eval() would train with Dropout disabled
    # and BatchNorm running statistics frozen.
    model.train()

    now = time()
    avg_loss = 0.0

    for x_train, y_train in trainset:
        optimizer.zero_grad()

        reconstructed_img, logits = model(x_train)
        # Compute each task's loss separately.
        class_loss = classification_loss_function(logits, y_train)
        recon_loss = reconstruction_loss_function(reconstructed_img, x_train)
        # The weighted sum decides which task the shared encoder focuses on.
        total_loss = (classification_weights * class_loss) + (reconstruction_weights * recon_loss)

        total_loss.backward()

        optimizer.step()

        avg_loss += total_loss.item()

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    learning_time = int(time() - now)
    total_time.append(learning_time)

    print(f"epoch: {epoch+1:3d}/{epochs} \t Loss: {avg_loss/len(trainset):.3f} \t {learning_time}s")

print(f"learning time ({epochs} epoch) : {floor(sum(total_time) / 60)}m {sum(total_time) % 60}")

epoch: 1 	 Loss: 1.452 	 56s
epoch: 2 	 Loss: 1.119 	 58s
epoch: 3 	 Loss: 0.993 	 59s
epoch: 4 	 Loss: 0.895 	 60s
epoch: 5 	 Loss: 0.833 	 64s
epoch: 6 	 Loss: 0.783 	 62s
epoch: 7 	 Loss: 0.750 	 63s
epoch: 8 	 Loss: 0.723 	 60s
epoch: 9 	 Loss: 0.693 	 62s
epoch: 10 	 Loss: 0.671 	 60s
epoch: 11 	 Loss: 0.601 	 60s
epoch: 12 	 Loss: 0.576 	 60s
epoch: 13 	 Loss: 0.565 	 60s
epoch: 14 	 Loss: 0.553 	 62s
epoch: 15 	 Loss: 0.545 	 68s
epoch: 16 	 Loss: 0.528 	 73s
epoch: 17 	 Loss: 0.528 	 62s
epoch: 18 	 Loss: 0.515 	 68s
epoch: 19 	 Loss: 0.507 	 73s
epoch: 20 	 Loss: 0.502 	 74s
epoch: 21 	 Loss: 0.465 	 63s
epoch: 22 	 Loss: 0.455 	 58s
epoch: 23 	 Loss: 0.451 	 59s
epoch: 24 	 Loss: 0.445 	 58s
epoch: 25 	 Loss: 0.437 	 59s
epoch: 26 	 Loss: 0.440 	 59s
epoch: 27 	 Loss: 0.430 	 72s
epoch: 28 	 Loss: 0.429 	 71s
epoch: 29 	 Loss: 0.424 	 75s
epoch: 30 	 Loss: 0.421 	 64s
epoch: 31 	 Loss: 0.400 	 62s
epoch: 32 	 Loss: 0.397 	 67s
epoch: 33 	 Loss: 0.391 	 68s
epoch: 34 	 Loss: 0

In [9]:
# Pull a single batch (images, labels) from the test loader for a quick preview.
test_iter = iter(testset)
x_test, y_test = next(test_iter)

In [10]:
# Class names indexed by label id.
name_list = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Switch to inference mode before predicting: otherwise Dropout stays active and
# BatchNorm normalizes with (and updates its running statistics from) this test batch.
model.eval()
# No gradients are needed for a preview, so skip building the autograd graph.
with torch.no_grad():
    _, predict = model(x_test)
# Index of the largest logit per row is the predicted class id.
_, predict_labels = torch.max(predict, 1)

# First line: predicted class names; second line: ground-truth class names.
print([name_list[label] for label in predict_labels])
print([name_list[label] for label in y_test])

['bird', 'frog', 'dog', 'plane', 'horse', 'car', 'ship', 'car', 'truck', 'deer', 'bird', 'dog', 'plane', 'cat', 'car', 'plane', 'plane', 'truck', 'ship', 'dog', 'frog', 'deer', 'car', 'deer', 'truck', 'horse', 'dog', 'bird', 'frog', 'dog', 'horse', 'ship', 'ship', 'frog', 'truck', 'bird', 'truck', 'ship', 'truck', 'plane', 'deer', 'truck', 'dog', 'deer', 'deer', 'plane', 'cat', 'truck', 'car', 'deer', 'plane', 'horse', 'ship', 'deer', 'cat', 'truck', 'bird', 'deer', 'plane', 'car', 'plane', 'ship', 'plane', 'bird', 'truck', 'car', 'ship', 'dog', 'dog', 'cat', 'dog', 'ship', 'plane', 'horse', 'dog', 'bird', 'truck', 'horse', 'ship', 'cat', 'truck', 'bird', 'frog', 'cat', 'frog', 'frog', 'horse', 'cat', 'frog', 'horse', 'cat', 'plane', 'dog', 'horse', 'plane', 'dog', 'cat', 'horse', 'frog', 'cat', 'plane', 'car', 'ship', 'ship', 'cat', 'ship', 'ship', 'bird', 'deer', 'horse', 'horse', 'bird', 'plane', 'horse', 'deer', 'frog', 'plane', 'horse', 'plane', 'deer', 'truck', 'ship', 'truck', '

In [11]:
# Measure top-1 accuracy over every batch of the test loader.
model.eval()

correct = 0
total = 0

with torch.no_grad():
    for x_test, y_test in testset:
        # The model returns (reconstruction, logits); only the logits are needed here.
        _, h = model(x_test)
        predicted = h.max(dim=1).indices
        correct += (predicted == y_test).sum().item()
        total += y_test.shape[0]

print(f'acc : {100 * correct / total:.2f}%')

acc : 85.27%
