<a href="https://colab.research.google.com/github/namwootree/Basic_Skill/blob/main/PyTorch/Transformer_%EA%B5%AC%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

In [1]:
! pip install Korpora sentencepiece einops wandb torch-summary -qq

[K     |████████████████████████████████| 57 kB 5.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 26.8 MB/s 
[K     |████████████████████████████████| 1.8 MB 50.3 MB/s 
[K     |████████████████████████████████| 96 kB 7.7 MB/s 
[K     |████████████████████████████████| 181 kB 18.0 MB/s 
[K     |████████████████████████████████| 145 kB 44.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [16]:
# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Dataset
from Korpora import Korpora
import pandas as pd
import numpy as np
from einops import rearrange, reduce, repeat

# Tokenizer
import sentencepiece as spm

# Training
import time
import copy
from collections import defaultdict
from sklearn.metrics import mean_squared_error

# 시스템
import wandb
from torch.cuda import amp
import gc
import os

# 기타
from tqdm import tqdm

In [9]:
VOCAB_SIZE = 32000 + 7
SEQ_LEN = 100
PAD_IDX = 0

# Trainig Set 모집단의 크기
TRAINSET_SIZE = 120000

# 실제로 사용할 Training Set의 크기. 이 수만큼 전체 Training Set에서 Random Sampling
TRAIN_LEN = 100000
VALID_LEN = 10000
BATCH_SIZE = 2
WANDB_SAVED_PATH = ''

if 'device' not in globals():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f'Using {device}')

Using cuda


In [4]:
os.environ["WANDB_API_KEY"] = "10a106cc0c288b92e67a1540a17ddbeb973957b3"
os.environ["WANDB_MODE"] = "dryrun"
wandb.init(project="Transformer", entity="namwootree")
RUN_PATH = ''

# Data Set

### Create Data

In [5]:
corpus = Korpora.load("open_subtitles", root_dir='./')


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : TRAC (https://trac.edgewall.org/)
    Repository : http://opus.nlpl.eu/OpenSubtitles-v2018.php
    References :
        - P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora
          from Movie and TV Subtitles. In Proceedings of the 10th International Conference on
          Language Resources and Evaluation (LREC 2016)

    This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.

    [[ IMPORTANT ]]
    If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/
    to your website and to your reports and publications produced with the data!
    I promised this when I got the data from the providers of that website!

 

[open_subtitles] download en-ko.tmx.gz: 48.1MB [00:01, 32.0MB/s]                            


decompress /content/open_subtitles/en-ko.tmx.gz


In [6]:
data = pd.DataFrame([corpus.train.pairs, corpus.train.texts], index = ['src', 'trg'])
data = data.transpose()
data.to_csv('data.txt', index=False)
data.head()

Unnamed: 0,src,trg
0,"Through the snow and sleet and hail, through t...","폭설이 내리고 우박, 진눈깨비가 퍼부어도 눈보라가 몰아쳐도 강풍이 불고 비바람이 휘..."
1,"ever faithful, ever true, nothing stops him, h...",우리의 한결같은 심부름꾼 황새 아저씨 가는 길을 그 누가 막으랴!
2,Look out for Mr Stork That persevering chap,황새 아저씨를 기다리세요
3,He'll come along and drop a bundle in your lap,찾아와 선물을 주실 거예요
4,You may be poor or rich It doesn't matter which,가난하든 부자이든 상관이 없답니다


### Preprocessing

In [7]:
with open('src.txt', mode = 'w', encoding='utf8') as f:
    f.write('\n'.join(data['src']))
with open('trg.txt', mode= 'w', encoding='utf8') as f:
    f.write('\n'.join(data['trg']))

In [10]:
corpus = "src.txt"
prefix = "src"
vocab_size = VOCAB_SIZE - 7
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" +
    " --model_type=bpe" +
    " --max_sentence_length=999999" + 
    " --pad_id=0 --pad_piece=[PAD]" + 
    " --unk_id=1 --unk_piece=[UNK]" +  
    " --bos_id=2 --bos_piece=[BOS]" +  
    " --eos_id=3 --eos_piece=[EOS]" +  
    " --user_defined_symbols=[SEP],[CLS],[MASK]")  

corpus = "trg.txt"
prefix = "trg"
vocab_size = VOCAB_SIZE - 7
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" +
    " --model_type=bpe" +
    " --max_sentence_length=999999" + 
    " --pad_id=0 --pad_piece=[PAD]" +  
    " --unk_id=1 --unk_piece=[UNK]" +  
    " --bos_id=2 --bos_piece=[BOS]" +  
    " --eos_id=3 --eos_piece=[EOS]" +  
    " --user_defined_symbols=[SEP],[CLS],[MASK]") 

In [17]:
def en_encode(tmpstr:str) -> np.array :
    tmpstr = np.array(sp_src.EncodeAsIds(tmpstr))

    # SEQ_LEN보다 길면 짜른다 
    if len(tmpstr) > SEQ_LEN :
        tmpstr = tmpstr[:SEQ_LEN]

    # SEQ_LEN보다 작으면 padding
    else :
        tmpstr = np.pad(tmpstr, (0, SEQ_LEN - len(tmpstr)), 'constant', constant_values = sp_src.pad_id())
    
    return tmpstr

In [18]:
sp_src = spm.SentencePieceProcessor()
sp_src.Load('src.model')

src_data = data['src']

src_list = []

for idx in tqdm(range(len(src_data))):
    src_list.append(en_encode(src_data[idx]))

100%|██████████| 1269683/1269683 [02:00<00:00, 10560.87it/s]


In [19]:
def ko_encode(tmpstr: str) -> np.array:
    tmpstr = np.array(sp_trg.EncodeAsIds(tmpstr))
    tmpstr = np.insert(tmpstr, 0, sp_trg.bos_id())

    if len(tmpstr) >= SEQ_LEN:
        # SEQ_LEN -1의 길이로 자른다
        tmpstr = tmpstr[:SEQ_LEN-1]
        # 마지막에 <eos> 토큰을 넣어줌으로써, 길이를 SEQ_LEN으로 맞춘다
        tmpstr = np.pad(tmpstr, (0, 1),
                        'constant', constant_values=sp_trg.eos_id())


    else:
        tmpstr = np.pad(tmpstr, (0, 1),
                        'constant', constant_values=sp_trg.eos_id())
        tmpstr = np.pad(tmpstr, (0, SEQ_LEN - len(tmpstr)),
                        'constant', constant_values=sp_trg.pad_id())

    return tmpstr

In [20]:
sp_trg = spm.SentencePieceProcessor()
sp_trg.Load('trg.model')

trg_data = data['trg']

trg_list = []

for idx in tqdm(range(len(trg_data))):
    trg_list.append(ko_encode(trg_data[idx])) 

100%|██████████| 1269683/1269683 [02:28<00:00, 8574.11it/s]


### Split Data

In [21]:
train_mask = np.random.choice(len(src_list[:TRAINSET_SIZE]), size = TRAIN_LEN, replace = False)
valid_mask = np.random.choice(len(trg_list[TRAINSET_SIZE:]), size = VALID_LEN, replace = False)

src_train = np.take(src_list, train_mask, axis = 0)
trg_train = np.take(trg_list, train_mask, axis = 0)

src_valid = np.take(src_list, valid_mask, axis = 0)
trg_valid = np.take(trg_list, valid_mask, axis = 0)

In [22]:
print(src_train.shape)
print(trg_train.shape)
print(src_valid.shape)
print(trg_valid.shape)

(100000, 100)
(100000, 100)
(10000, 100)
(10000, 100)


In [23]:
gc.collect()

452

### Create DataSet

In [24]:
class TrainDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant', constant_values =0)
        # (seq_len,)
        return src, trg_input, trg_output

train_dataset = TrainDataset(src_train, trg_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle= True, pin_memory=True)

In [25]:
class ValidDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant',constant_values= 0)

        return src, trg_input, trg_output

valid_dataset = ValidDataset(src_valid, trg_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle= False, pin_memory=True)

In [26]:
for src, trg_input, trg_output in train_dataloader:
    print(src.shape)
    print(trg_input.shape)
    print(trg_output.shape)
    break

torch.Size([2, 100])
torch.Size([2, 100])
torch.Size([2, 100])


# Transformer

### Mask function

In [27]:
def makeMask(tensor, option: str) -> torch.Tensor:

    if option == 'padding':
        tmp = torch.full_like(tensor, fill_value=PAD_IDX).to(device)
        # tmp : (bs,seq_len)
        mask = (tensor != tmp).float()
        # mask : (bs, seq_len)
        mask = repeat(mask, 'bs seq_len -> bs new_axis seq_len ',
                      new_axis=mask.shape[1])
        # mask(bs,seq_len,seq_len)

    elif option == 'lookahead':
        padding_mask = makeMask(tensor, 'padding')

        mask = torch.ones_like(padding_mask)
        mask = torch.tril(mask)

        mask = mask * padding_mask

    return mask

### Multihead Attention

In [28]:
class Multiheadattention(nn.Module):
    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        self.scale = torch.sqrt(torch.FloatTensor()).to(device)

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)



    def forward(self, srcQ, srcK, srcV, mask=None):

        ##### SCALED DOT PRODUCT ATTENTION ######

        # input : (bs, seq_len, hidden_dim)
        Q = self.fcQ(srcQ)
        K = self.fcK(srcK)
        V = self.fcV(srcV)

        Q = rearrange(
            Q, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        K_T = rearrange(
            K, 'bs seq_len (num_head head_dim) -> bs num_head head_dim seq_len', num_head=self.num_head)
        V = rearrange(
            V, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)

        attention_energy = torch.matmul(Q, K_T)
        # attention_energy : (bs, num_head, seq_len, seq_len)

        if mask is not None :
            attention_energy : torch.masked_fill(attention_energy, (mask==0), -1e10)

        attention_energy = torch.softmax(attention_energy, dim = -1)
        # print(attention_energy[0,0,0,:])

        result = torch.matmul(attention_energy,V)
        # result (bs, num_head, seq_len, head_dim)

        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT
        result = rearrange(result, 'bs num_head seq_len head_dim -> bs seq_len (num_head head_dim)')
        # result : (bs, seq_len, hidden_dim)

        # LINEAR

        result = self.fcOut(result)

        return result

### Poistionwise Feedforward Network

In [29]:
class FFN(nn.Module):
    def __init__ (self, hidden_dim, inner_dim):
        super().__init__()

        # 512 in paper 
        self.hidden_dim = hidden_dim
        # 2048 in paper
        self.inner_dim = inner_dim 

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=False)

        
    def forward(self, input):
        output = input
        output = self.fc1(output)
        output2 = self.relu(output)
        output3 = self.fc2(output2)

        return output3

### Encoder Layer

In [30]:
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim
        
        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)


    def forward(self, input, mask = None):

        # input : (bs, seq_len, hidden_dim)
        
        # encoder attention
        # uses only padding mask
        output = self.multiheadattention(srcQ= input, srcK = input, srcV = input, mask = mask)
        output = self.dropout1(output)
        output = input + output
        output = self.layerNorm(output)

        output_ = self.ffn(output)
        output_ = self.dropout2(output_)
        output = output + output_
        output = self.layerNorm(output)

        # output : (bs, seq_len, hidden_dim)
        return output

### Encoder Architecture

In [35]:
class Encoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.enc_layers = nn.ModuleList(EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N))

        self.dropout = nn.Dropout(p=0.1)



    def forward(self, input):

        # input : (bs, seq_len)
        mask = makeMask(input, option='padding')

        # embedding layer
        output = self.embedding(input)

        output = self.dropout(output)

        # N encoder layer
        for layer in self.enc_layers:
            output = layer(output, mask)

        # output : (bs, seq_len, hidden_dim)

        return output


### Decoder Layer

In [36]:
class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.multiheadattention1 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.multiheadattention2 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm3 = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)
        self.dropout3 = nn.Dropout(p=0.1)

    
    def forward(self, input, enc_output, paddingMask, lookaheadMask):
        # input : (bs, seq_len, hidden_dim)

        # first multiheadattention
        output = self.multiheadattention1(input, input, input, lookaheadMask)
        output = self.dropout1(output)
        output = output + input
        output = self.layerNorm1(output)

        # second multiheadattention
        output_ = self.multiheadattention2(output, enc_output, enc_output, paddingMask)
        output_ = self.dropout2(output_)
        output = output_ + output
        output = self.layerNorm2(output)

        # Feedforward Network
        output_ = self.ffn(output)
        output_ = self.dropout3(output_)
        output = output + output_
        output = self.layerNorm3(output)

        return output

### Decoder Architecture

In [37]:
class Decoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)

        self.dec_layers = nn.ModuleList(DecoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N))

        self.dropout = nn.Dropout(p=0.1)
        
        self.finalFc = nn.Linear(hidden_dim, VOCAB_SIZE)


    def forward(self, input, enc_src, enc_output):

        # input : (bs, seq_len)
        # enc_src : (bs, seq_len)
        # enc_output : (bs, seq_len,hidden_dim)

        lookaheadMask = makeMask(input, option='lookahead')
        paddingMask = makeMask(enc_src, option = 'padding')

        # embedding layer
        output = self.embedding(input)

        # Dropout
        output = self.dropout(output)

        # N decoder layer
        for layer in self.dec_layers:
            output = layer(output, enc_output, paddingMask, lookaheadMask)
        # output : (bs, seq_len, hidden_dim)

        logits = self.finalFc(output)
        # logits : (bs, seq_len, VOCAB_SIZE)
        output = torch.softmax(logits, dim = -1)

        output = torch.argmax(output, dim = -1)
        # output : (bs, seq_len)



        return output, logits


### Transformer Model

In [38]:
class Transformer(nn.Module):
    def __init__(self, N = 6, hidden_dim = 512, num_head = 8, inner_dim = 2048):
        super().__init__()
        self.encoder = Encoder(N, hidden_dim, num_head, inner_dim)
        self.decoder = Decoder(N, hidden_dim, num_head, inner_dim)

    def forward(self, enc_src, dec_src):
        # enc_src : (bs, seq_len)
        # dec_src : (bs, seq_len)

        # print(f'enc_src : {enc_src.shape}')
        # print(f'dec_src : {dec_src.shape}')

        enc_output = self.encoder(enc_src)
        output, logits = self.decoder(dec_src, enc_src, enc_output.detach())
        # logits = (bs, seq_len, VOCAB_SIZE) 

        return output, logits

# Model Train

In [39]:
model = Transformer().to(device)

In [40]:
from torchsummary import summary
test1 = torch.randint(low = 0, high = 1000, size = (SEQ_LEN,))
test2 = torch.randint(low = 0, high = 1000, size = (SEQ_LEN,))
summary(model, [(SEQ_LEN,), (SEQ_LEN,)], dtypes = [torch.int, torch.int])

Layer (type:depth-idx)                        Output Shape              Param #
├─Encoder: 1-1                                [-1, 100, 512]            --
|    └─Embedding: 2-1                         [-1, 100, 512]            16,387,584
|    └─Dropout: 2-2                           [-1, 100, 512]            --
|    └─ModuleList: 2                          []                        --
|    |    └─EncoderLayer: 3-1                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-2                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-3                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-4                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-5                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-6                 [-1, 100, 512]            3,151,360
├─Decoder: 1-2                                [-1, 100]                 --
|    └─Embedding: 2-3                        

Layer (type:depth-idx)                        Output Shape              Param #
├─Encoder: 1-1                                [-1, 100, 512]            --
|    └─Embedding: 2-1                         [-1, 100, 512]            16,387,584
|    └─Dropout: 2-2                           [-1, 100, 512]            --
|    └─ModuleList: 2                          []                        --
|    |    └─EncoderLayer: 3-1                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-2                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-3                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-4                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-5                 [-1, 100, 512]            3,151,360
|    |    └─EncoderLayer: 3-6                 [-1, 100, 512]            3,151,360
├─Decoder: 1-2                                [-1, 100]                 --
|    └─Embedding: 2-3                        

In [41]:
for param in model.named_parameters():
    if 'weight' in param[0] and 'layerNorm' not in param[0] :
        torch.nn.init.xavier_uniform_(param[1])

In [42]:
optimizer = torch.optim.Adam(params = model.parameters())

In [45]:
def criterion(logits: torch.tensor, targets: torch.tensor):
    return nn.CrossEntropyLoss(ignore_index=PAD_IDX)(logits.view(-1,VOCAB_SIZE), targets.view(-1))

In [46]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    # train 모드로 변경
    model.train()

    # for the Mixed Precision
    # Pytorch 예제 : https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples
    scaler = amp.GradScaler()

    dataset_size = 0
    running_loss = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    
    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        with amp.autocast(enabled=True):
            output, logits = model(enc_src = src, dec_src = trg_input)
            loss = criterion(logits, trg_output)
            
        # logits (bs, seq_len, VOCAB_SIZE)
        # trg_output (bs, seq_len)

        # loss를 Scale
        # Scaled Grdients를 계산(call)하기 위해 scaled loss를 backward()
        scaler.scale(loss).backward()
        # loss.backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)
        # optimizer.step()
        
        # Updates the scale for next iteration.
        scaler.update()

        # zero the parameter gradients
        optimizer.zero_grad()

        # change learning rate by Scheduler
        if scheduler is not None:
            scheduler.step()

        # loss.item()은 loss를 Python Float으로 반환
        # loss.item()은 batch data의 average loss이므로, sum of loss를 구하기 위해 batch_size를 곱해준다
        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"]
        )

    # Garbage Collector
    gc.collect()

    return epoch_loss

In [47]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0


    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        output, logits = model(enc_src = src, dec_src = trg_input)
        loss = criterion(logits, trg_output)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        # 실시간으로 정보를 표시하기 위한 epoch loss
        val_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch, Valid_Loss=val_loss, LR=optimizer.param_groups[0]["lr"]
        )

    gc.collect()

    return val_loss

In [48]:
def run_training(
    model,
    optimizer,
    scheduler,
    device,
    num_epochs,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
):
    # To automatically log graidents
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU:{}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    history = defaultdict(list)
    early_stop_counter = 0

    # num_epochs만큼, train과 val을 실행한다
    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader= train_dataloader,
            device=device,
            epoch=epoch,
        )

        val_loss = valid_one_epoch(
            model, valid_dataloader, device=device, epoch=epoch
        )

        history[f"{metric_prefix}Train Loss"].append(train_epoch_loss)
        history[f"{metric_prefix}Valid Loss"].append(val_loss)

        # Log the metrics
        wandb.log(
            {
                f"{metric_prefix}Train Loss": train_epoch_loss,
                f"{metric_prefix}Valid Loss": val_loss,
            }
        )

        print(f"Valid Loss : {val_loss}")

        # deep copy the model
        if val_loss <= best_loss:
            early_stop_counter = 0

            print(
                f"Validation Loss improved( {best_loss} ---> {val_loss}  )"
            )

            # Update Best Loss
            best_loss = val_loss
            
            # Update Best Model Weight
            # run.summary['Best RMSE'] = best_loss
            best_model_wts = copy.deepcopy(model.state_dict())

            PATH = "{}epoch{:.0f}_Loss{:.4f}.bin".format(file_prefix, epoch, best_loss)
            torch.save(model.state_dict(), PATH)
            torch.save(model.state_dict(), f"{file_prefix}best_{epoch}epoch.bin")
            # Save a model file from the current directory
            wandb.save(PATH)

            print(f"Model Saved")

        elif early_stopping:
            early_stop_counter += 1
            if early_stop_counter > early_stopping_step:
                break

        print()

    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print("Best Loss: {:.4f}".format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [None]:
run_training(
    model = model,
    optimizer = optimizer,
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=100, eta_min=1e-6),
    device = device,
    num_epochs = 1,
    metric_prefix="",
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
)

[INFO] Using GPU:Tesla V100-SXM2-16GB



  2%|▏         | 1079/50000 [01:20<55:30, 14.69it/s, Epoch=1, LR=0.000106, Train_Loss=8.27]

In [None]:
torch.save(model.state_dict(), 'final.bin')
wandb.save('final.bin')
wandb.finish()