# M2608.001300 Machine Learning<br> Assignment #5 Final Projects (Pytorch)


Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them.

**For understanding of this work, please carefully look at given PPT file.**

Note: certain details are missing or ambiguous on purpose, in order to test your knowledge on the related materials. However, if you really feel that something essential is missing and cannot proceed to the next step, then contact the teaching staff with clear description of your problem.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

import os
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
from PIL import Image
import resnet

Load datasets


In [2]:
NUMBER = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
ALPHABET = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
NONE = ['NONE'] # label for empty space
ALL_CHAR_SET = NUMBER + ALPHABET + NONE
ALL_CHAR_SET_LEN = len(ALL_CHAR_SET)
MAX_CAPTCHA = 7

print(ALL_CHAR_SET.index('NONE'))

def encode(a):
    onehot = [0]*ALL_CHAR_SET_LEN
    idx = ALL_CHAR_SET.index(a)
    onehot[idx] += 1
    return onehot

# modified dataset class
class Mydataset(Dataset):
    def __init__(self, img_path, label_path, is_train=True, transform=None):
        self.path = img_path
        self.label_path = label_path
        if is_train: 
            self.img = os.listdir(self.path)[:1000]
            self.labels = open(self.label_path, 'r').read().split('\n')[:-1][:1000]
        else: 
            self.img = os.listdir(self.path)[:1000]
            self.labels = open(self.label_path, 'r').read().split('\n')[:-1][:1000]
        
        self.transform = transform
        self.max_length = MAX_CAPTCHA
        
    def __getitem__(self, idx):
        img_path = self.img[idx]
        img = Image.open(f'{self.path}/{self.img[idx]}')
        img = img.convert('L')
        label = self.labels[idx]
        label_oh = []
        # one-hot for each character
        for i in range(self.max_length):
            if i < len(label):
                label_oh += encode(label[i])
            else:
                #label_oh += [0]*ALL_CHAR_SET_LEN
                label_oh += encode('NONE')
            
        if self.transform is not None:
            img = self.transform(img)
        return img, np.array(label_oh), label
    
    def __len__(self):
        return len(self.img)

transform = transforms.Compose([
    transforms.Resize([160, 60]),
    transforms.ToTensor(),
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
# transforms.Normalize((0.1307, ), (0.3081, ))

##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################
])



36


In [3]:
"""Loading DATA"""
# Change to your own data folder path!
# gPath = '/content/drive/My Drive/Colab Notebooks/'
gPath = './'

train_ds = Mydataset(gPath+'Data/train/', gPath+'Data/train.txt',transform=transform)
test_ds = Mydataset(gPath+'Data/test/', gPath+'Data/test.txt', False, transform)
train_dl = DataLoader(train_ds, batch_size=128, num_workers=4)
test_dl = DataLoader(test_ds, batch_size=1, num_workers=4)

In [4]:
"""To CUDA for local run"""
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(device)


#GPUID = '4' # define GPUID
#os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUID)


cuda:0


Problem 1: Design LSTM model for catcha image recognition. (10 points)

[Captioning Images with CNN and RNN, using PyTorch](https://medium.com/@stepanulyanin/captioning-images-with-pytorch-bc592e5fd1a3)

In [113]:
class LSTM(nn.Module):
    def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
        super(LSTM, self).__init__()
        
        # define the properties
        self.cnn_dim = cnn_dim
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        # lstm cell
        self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)
    
        # output fully connected layer
        self.fc_in = nn.Linear(in_features=self.cnn_dim, out_features=self.vocab_size)
        self.fc_out = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)
    
        # embedding layer
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)
    
        # activations
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, features, captions):

        batch_size = features.size(0)
        cnn_dim = features.size(1)

        hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda()
        cell_state = torch.zeros((batch_size, self.hidden_size)).cuda()
    
        # define the output tensor placeholder
        outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()

        # embed the captions
        captions_embed = self.embed(captions)
        
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
        # pass the caption word by word
        for t in range(captions.size(1)):
            # for the first time step the input is the feature vector
            if t == 0:
                features = features[:, :, 0, 0]
                inputs = self.fc_in(features)
                hidden_state, cell_state = self.lstm_cell(inputs, (hidden_state, cell_state))
                
            # for the 2nd+ time step, using teacher forcer
            else:
                hidden_state, cell_state = self.lstm_cell(captions_embed[:, t, :], (hidden_state, cell_state))
            
#             print(hidden_state.size())
            # output of the attention mechanism
            out = self.fc_out(hidden_state)
            # build the output tensor
            outputs[:, t, :] = out
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################
        return outputs[:,:,0]




Problem 2: 

*   1.Connect CNN model to the designed LSTM model.
*   2.Replace ResNet to your own CNN model from Assignment3.
* https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning

In [114]:

##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
"""ResNet"""
#CNN
betternet = resnet.resnet18(pretrained=False)
betternet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# betternet.fc = nn.Linear(in_features=512, out_features=ALL_CHAR_SET_LEN, bias=True)
betternet.fc = nn.Linear(in_features=512, out_features=ALL_CHAR_SET_LEN*MAX_CAPTCHA, bias=True)
betternet = betternet.to(device)
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################

       
# LSTM
cnn_dim=512 #resnet18-512
hidden_size=8
vocab_size=37 #ALL_CHAR_SET_LEN
lstm = LSTM(cnn_dim=cnn_dim, hidden_size=hidden_size, vocab_size=vocab_size)
lstm = lstm.to(device)

# loss, optimizer
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################

class MyEnsemble(nn.Module):
    def __init__(self, modelA, modelB):
        super(MyEnsemble, self).__init__()
        self.modelA = modelA
        self.modelB = modelB
        
    def forward(self, x, captions):
        _, feature = self.modelA(x)
        pred = self.modelB(feature, captions)
        return pred

model = MyEnsemble(betternet, lstm)
# print(model)

# loss_func = nn.MultiLabelSoftMarginLoss()
# cnn_optim = torch.optim.Adam(betternet.parameters(), lr=0.001)

loss_func = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################

Problem3: Find hyper-parameters.


In [115]:
import time

"""TRAINING"""
print_interval = 1
max_epoch = 10

start_time = time.time()
for epoch in range(max_epoch):
    start_epoch_time = time.time()
    for step, i in enumerate(train_dl):
        start_step_time = time.time()
        img, label_oh, label = i
        img = Variable(img).cuda()
        label_oh = Variable(label_oh.long()).cuda()
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
        batch_size, _ = label_oh.shape
    
#         pred, feature = betternet(img)
#         loss = loss_func(pred, label_oh)
#         cnn_optim.zero_grad()
#         loss.backward()
#         cnn_optim.step()  
        
        pred = model(img, label_oh)
        loss = loss_func(pred, label_oh)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()    
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################
        if (epoch+1)%print_interval == 0:
            print('epoch:', epoch+1, 'step:', step+1, 'loss:', loss.item())
    print('>> Epoch', epoch+1, 'elapsed time: {:.2f} sec'.format(time.time()-start_epoch_time))
print('Total Elapsed Time: {:.2f} sec'.format(time.time()-start_time))

epoch: 1 step: 1 loss: 0.5943472981452942
epoch: 1 step: 2 loss: 0.4964393973350525
epoch: 1 step: 3 loss: 0.4219038486480713
epoch: 1 step: 4 loss: 0.3704468309879303
epoch: 1 step: 5 loss: 0.33152222633361816
epoch: 1 step: 6 loss: 0.2994146943092346
epoch: 1 step: 7 loss: 0.27342331409454346
epoch: 1 step: 8 loss: 0.2534462809562683
>> Epoch 1 elapsed time: 1.60 sec
epoch: 2 step: 1 loss: 0.23780232667922974
epoch: 2 step: 2 loss: 0.22434869408607483
epoch: 2 step: 3 loss: 0.2120894193649292
epoch: 2 step: 4 loss: 0.20072072744369507
epoch: 2 step: 5 loss: 0.19015951454639435
epoch: 2 step: 6 loss: 0.18037459254264832
epoch: 2 step: 7 loss: 0.17136701941490173
epoch: 2 step: 8 loss: 0.1629306823015213
>> Epoch 2 elapsed time: 1.61 sec
epoch: 3 step: 1 loss: 0.15508535504341125
epoch: 3 step: 2 loss: 0.14766716957092285
epoch: 3 step: 3 loss: 0.14059436321258545
epoch: 3 step: 4 loss: 0.1337914615869522
epoch: 3 step: 5 loss: 0.12716826796531677
epoch: 3 step: 6 loss: 0.1206906959414

In [117]:
"""TEST"""
def get_char_count(arg1):
    c0 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[0:ALL_CHAR_SET_LEN])]
    c1 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN:ALL_CHAR_SET_LEN*2])]
    c2 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN*2:ALL_CHAR_SET_LEN*3])]
    c3 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN*3:ALL_CHAR_SET_LEN*4])]
    c4 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN*4:ALL_CHAR_SET_LEN*5])]
    c5 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN*5:ALL_CHAR_SET_LEN*6])]
    c6 = ALL_CHAR_SET[np.argmax(arg1.cpu().tolist()[ALL_CHAR_SET_LEN*6:ALL_CHAR_SET_LEN*7])]
    return c0, c1, c2, c3, c4, c5, c6 

def get_str(ch_arr):
    ch_str = ''
    for ch in ch_arr:
        if ch == 'NONE':
            ch_str = ch_str + '_'
        else:
            ch_str = ch_str + ch
    return ch_str

char_correct = 0
word_correct = 0
total = 0

betternet.eval()
lstm.eval()

with torch.no_grad():
    for step, (img, label_oh, label) in enumerate(test_dl):
        char_count = 0
        img = Variable(img).cuda()
        label_oh = Variable(label_oh.long()).cuda()
        
#         pred, feature = betternet(img)
        pred = model(img, captions)

        label_len = label[0]
        pred = pred.squeeze(0)
        label_oh = label_oh.squeeze(0)
        
        c0,c1,c2,c3,c4,c5,c6 = get_char_count(pred.squeeze()) 
        d0,d1,d2,d3,d4,d5,d6 = get_char_count(label_oh) 
         
        c_arr = (c0, c1, c2, c3, c4, c5, c6)
        d_arr = (d0, d1, d2, d3, d4, d5, d6)
        
        c = '%s%s%s%s%s%s%s' % c_arr
        d = '%s%s%s%s%s%s%s' % d_arr
        
        c_str = get_str(c_arr)
        d_str = get_str(d_arr)
        
        print('PREDICT:', c_str, ', LABEL:', d_str)
    
        char_count += (c0==d0)+(c1==d1)+(c2==d2)+(c3==d3)+(c4==d4)+(c5==d5)+(c6==d6)
        char_correct += char_count

        if(bool(str(label[0]) in str(c))):
            word_correct+=1

        total += 1
       
print(100/7*char_correct/total)
print(100*word_correct/total)
"""END TEST"""

PREDICT: oi87gb_ , LABEL: b9x____
PREDICT: oi87gb_ , LABEL: mb_____
PREDICT: oi87gb_ , LABEL: d5q7qh_
PREDICT: oi87gb_ , LABEL: 6tl0kqv
PREDICT: oi87gb_ , LABEL: t1_____
PREDICT: oi87gb_ , LABEL: avhjn3z
PREDICT: oi87gb_ , LABEL: 74z0z__
PREDICT: oi87gb_ , LABEL: f1kfa__
PREDICT: oi87gb_ , LABEL: sripns_
PREDICT: oi87gb_ , LABEL: bg4____
PREDICT: oi87gb_ , LABEL: gmb45tz
PREDICT: oi87gb_ , LABEL: sr5____
PREDICT: oi87gb_ , LABEL: 0nt____
PREDICT: oi87gb_ , LABEL: lxfg98_
PREDICT: oi87gb_ , LABEL: 2b8o___
PREDICT: oi87gb_ , LABEL: kr25___
PREDICT: oi87gb_ , LABEL: fl_____
PREDICT: oi87gb_ , LABEL: 0tiwrd_
PREDICT: oi87gb_ , LABEL: k5_____
PREDICT: oi87gb_ , LABEL: 8k_____
PREDICT: oi87gb_ , LABEL: ggin___
PREDICT: oi87gb_ , LABEL: qc6e___
PREDICT: oi87gb_ , LABEL: giz6rv_
PREDICT: oi87gb_ , LABEL: tf15___
PREDICT: oi87gb_ , LABEL: 7jz____
PREDICT: oi87gb_ , LABEL: v3zl9__
PREDICT: oi87gb_ , LABEL: p78ec__
PREDICT: oi87gb_ , LABEL: 7rh____
PREDICT: oi87gb_ , LABEL: exqo___
PREDICT: oi87g

PREDICT: oi87gb_ , LABEL: sj_____
PREDICT: oi87gb_ , LABEL: 1suyf8_
PREDICT: oi87gb_ , LABEL: fblzo__
PREDICT: oi87gb_ , LABEL: gvlhn__
PREDICT: oi87gb_ , LABEL: 0jj7w__
PREDICT: oi87gb_ , LABEL: e4o89r_
PREDICT: oi87gb_ , LABEL: clwllf0
PREDICT: oi87gb_ , LABEL: nq_____
PREDICT: oi87gb_ , LABEL: oljje__
PREDICT: oi87gb_ , LABEL: wjrvnu_
PREDICT: oi87gb_ , LABEL: 6k8s94_
PREDICT: oi87gb_ , LABEL: pjd____
PREDICT: oi87gb_ , LABEL: ekgvm__
PREDICT: oi87gb_ , LABEL: bo4m0__
PREDICT: oi87gb_ , LABEL: ultrlr_
PREDICT: oi87gb_ , LABEL: t1_____
PREDICT: oi87gb_ , LABEL: kq865m_
PREDICT: oi87gb_ , LABEL: 5zw8___
PREDICT: oi87gb_ , LABEL: 3d_____
PREDICT: oi87gb_ , LABEL: blvaodq
PREDICT: oi87gb_ , LABEL: 0myq___
PREDICT: oi87gb_ , LABEL: 4qdr___
PREDICT: oi87gb_ , LABEL: ssj____
PREDICT: oi87gb_ , LABEL: y7_____
PREDICT: oi87gb_ , LABEL: 1lxjdnl
PREDICT: oi87gb_ , LABEL: szg____
PREDICT: oi87gb_ , LABEL: ac8____
PREDICT: oi87gb_ , LABEL: 8qb____
PREDICT: oi87gb_ , LABEL: 5o_____
PREDICT: oi87g

PREDICT: oi87gb_ , LABEL: lsr____
PREDICT: oi87gb_ , LABEL: hvhj5__
PREDICT: oi87gb_ , LABEL: d76____
PREDICT: oi87gb_ , LABEL: 502tru_
PREDICT: oi87gb_ , LABEL: 33mykb_
PREDICT: oi87gb_ , LABEL: 8fbh___
PREDICT: oi87gb_ , LABEL: 2ase___
PREDICT: oi87gb_ , LABEL: axi1m__
PREDICT: oi87gb_ , LABEL: 43_____
PREDICT: oi87gb_ , LABEL: 7jy____
PREDICT: oi87gb_ , LABEL: j9yl___
PREDICT: oi87gb_ , LABEL: fu_____
PREDICT: oi87gb_ , LABEL: yv2____
PREDICT: oi87gb_ , LABEL: 27f____
PREDICT: oi87gb_ , LABEL: i1u____
PREDICT: oi87gb_ , LABEL: j0ty___
PREDICT: oi87gb_ , LABEL: qba4tor
PREDICT: oi87gb_ , LABEL: cai____
PREDICT: oi87gb_ , LABEL: vf_____
PREDICT: oi87gb_ , LABEL: sdy____
PREDICT: oi87gb_ , LABEL: 1wk2pi_
PREDICT: oi87gb_ , LABEL: 7v_____
PREDICT: oi87gb_ , LABEL: 8rvuoi_
PREDICT: oi87gb_ , LABEL: aa_____
PREDICT: oi87gb_ , LABEL: 79cnkb_
PREDICT: oi87gb_ , LABEL: 981o0cz
PREDICT: oi87gb_ , LABEL: 2k_____
PREDICT: oi87gb_ , LABEL: 98_____
PREDICT: oi87gb_ , LABEL: tg_____
PREDICT: oi87g

PREDICT: oi87gb_ , LABEL: 6c8n___
PREDICT: oi87gb_ , LABEL: vohcc__
PREDICT: oi87gb_ , LABEL: 0f3ad__
PREDICT: oi87gb_ , LABEL: a8wmu__
PREDICT: oi87gb_ , LABEL: 1dd4___
PREDICT: oi87gb_ , LABEL: e0hx___
PREDICT: oi87gb_ , LABEL: af7g___
PREDICT: oi87gb_ , LABEL: x3nr9__
PREDICT: oi87gb_ , LABEL: wd_____
PREDICT: oi87gb_ , LABEL: a2_____
PREDICT: oi87gb_ , LABEL: vd_____
PREDICT: oi87gb_ , LABEL: wu7hqy_
PREDICT: oi87gb_ , LABEL: bv3k___
PREDICT: oi87gb_ , LABEL: 1k5____
PREDICT: oi87gb_ , LABEL: acq____
PREDICT: oi87gb_ , LABEL: 4f_____
PREDICT: oi87gb_ , LABEL: 4pu5t__
PREDICT: oi87gb_ , LABEL: fsbfk5z
PREDICT: oi87gb_ , LABEL: zv_____
PREDICT: oi87gb_ , LABEL: 08p____
PREDICT: oi87gb_ , LABEL: svixy__
PREDICT: oi87gb_ , LABEL: qjidg__
PREDICT: oi87gb_ , LABEL: fn_____
PREDICT: oi87gb_ , LABEL: p4tvr__
PREDICT: oi87gb_ , LABEL: pl2____
PREDICT: oi87gb_ , LABEL: 5q5rs__
PREDICT: oi87gb_ , LABEL: fe0vt3z
PREDICT: oi87gb_ , LABEL: go9____
PREDICT: oi87gb_ , LABEL: b0dlqd_
PREDICT: oi87g

PREDICT: oi87gb_ , LABEL: rp8____
PREDICT: oi87gb_ , LABEL: x5kn5__
PREDICT: oi87gb_ , LABEL: imcmab_
PREDICT: oi87gb_ , LABEL: 4r46f9_
PREDICT: oi87gb_ , LABEL: py53d9_
PREDICT: oi87gb_ , LABEL: bp_____
PREDICT: oi87gb_ , LABEL: 56tt___
PREDICT: oi87gb_ , LABEL: 8tbvuk_
PREDICT: oi87gb_ , LABEL: d9uvy5i
PREDICT: oi87gb_ , LABEL: 3x82lj_
PREDICT: oi87gb_ , LABEL: pko____
PREDICT: oi87gb_ , LABEL: ff_____
PREDICT: oi87gb_ , LABEL: xop9___
PREDICT: oi87gb_ , LABEL: euxe___
PREDICT: oi87gb_ , LABEL: qcn____
PREDICT: oi87gb_ , LABEL: 54vwkxr
PREDICT: oi87gb_ , LABEL: gk_____
PREDICT: oi87gb_ , LABEL: xpg____
PREDICT: oi87gb_ , LABEL: crqlc__
PREDICT: oi87gb_ , LABEL: oi87gb_
13.62857142857143
0.1


'END TEST'