# Welcome to Torch Study 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, Iterator

import spacy
import numpy as np

import random
import math
import time

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [5]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)



In [7]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))



In [8]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


We can also print out an example, making sure the source sentence is reversed:

In [9]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


## most frequent인 n개만 사용하는 거 

In [10]:
SRC_MOST_FREQ = 4000
TRG_MOST_FREQ = 2000

In [11]:
SRC.build_vocab(train_data, max_size = SRC_MOST_FREQ)
TRG.build_vocab(train_data, max_size = TRG_MOST_FREQ)

In [12]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 4004
Unique tokens in target (en) vocabulary: 2004


In [13]:
SRC.vocab.stoi[4]

0

In [14]:
len(SRC.vocab.stoi), len(TRG.vocab.stoi) # special token 포함

(4005, 2004)

In [15]:
# .to('cuda')는 가장 마지막(모델에 넣을때) 해주는게 가장 효율적임
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# device = 'cpu'
BATCH_SIZE = 128

### BucketIterator 

In [16]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_within_batch = True)



In [20]:
from torchtext.data import Example
import random

In [30]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [31]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4004, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2004, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

We also define a function that will calculate the number of trainable parameters in the model.

In [32]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,922,516 trainable parameters


We define our optimizer, which we use to update our parameters in the training loop. Check out [this](http://ruder.io/optimizing-gradient-descent/) post for information about different optimizers. Here, we'll use Adam.

## SGD optimizer + halving learning rate every half epoch
train을 7.5 epoch하고, learning rate도 5를 넘는 half epoch마다 lr을 halving 해줘야 하기 때문에 train 중간일 때 에폭을 세자<br>
-> 아 몰랑 Adam 쓸래

In [33]:
from torch.optim.lr_scheduler import LambdaLR, MultiStepLR

In [34]:
from torch.utils.data import Dataset, DataLoader

Next, we define our loss function. The `CrossEntropyLoss` function calculates both the log softmax as well as the negative log-likelihood of our predictions. 

Our loss function calculates the average loss per token, however by passing the index of the `<pad>` token as the `ignore_index` argument we ignore the loss whenever the target token is a padding token. 

In [36]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

Next, we'll define our training loop. 

First, we'll set the model into "training mode" with `model.train()`. This will turn on dropout (and batch normalization, which we aren't using) and then iterate through our data iterator.

As stated before, our decoder loop starts at 1, not 0. This means the 0th element of our `outputs` tensor remains all zeros. So our `trg` and `outputs` look something like:

$$\begin{align*}
\text{trg} = [<sos>, &y_1, y_2, y_3, <eos>]\\
\text{outputs} = [0, &\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

Here, when we calculate the loss, we cut off the first element of each tensor to get:

$$\begin{align*}
\text{trg} = [&y_1, y_2, y_3, <eos>]\\
\text{outputs} = [&\hat{y}_1, \hat{y}_2, \hat{y}_3, <eos>]
\end{align*}$$

At each iteration:
- batch로 부터 $X$와 $Y$를 받습니다
- 마지막 배치로 부터 계산된 gradient를 0으로 초기화합니다
- source와 target을 모델에 넣고 output $\hat{Y}$를 받습니다 
- loss function이 2D input과 1D target에서만 작동하므로 우리는 .view로 각각을 flatten해줍니다
- 앞서 언급한 대로 ouput의 첫번째 컬럼을 슬라이싱해서 제거해줍니다
- `loss.backward()`로 gradient를 계산해줍니다
- gradient exploding을 방지하기 위해 clipping을 해줍니다(RNN에서 흔한 이슈)
- optimizer step을 통해 모델의 파라미터들을 업데이트해줍니다
- loss를 전체 런닝에 합쳐줍니다

그러면 우리는 모든 배치에 대한 평균적인 loss를 구할 수 있습니다

In [37]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    check_half = len(iterator) // 2 
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

Our evaluation loop is similar to our training loop, however as we aren't updating any parameters we don't need to pass an optimizer or a clip value.

We must remember to set the model to evaluation mode with `model.eval()`. This will turn off dropout (and batch normalization, if used).

We use the `with torch.no_grad()` block to ensure no gradients are calculated within the block. This reduces memory consumption and speeds things up. 

The iteration loop is similar (without the parameter updates), however we must ensure we turn teacher forcing off for evaluation. This will cause the model to only use it's own predictions to make further predictions within a sentence, which mirrors how it would be used in deployment.

In [38]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

Next, we'll create a function that we'll use to tell us how long an epoch takes.

In [39]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

We can finally start training our model!

At each epoch, we'll be checking if our model has achieved the best validation loss so far. If it has, we'll update our best validation loss and save the parameters of our model (called `state_dict` in PyTorch). Then, when we come to test our model, we'll use the saved parameters used to achieve the best validation loss. 

We'll be printing out both the loss and the perplexity at each epoch. It is easier to see a change in perplexity than a change in loss as the numbers are much bigger.

In [40]:
# optimizer = optim.SGD(model.parameters(), lr=0.7)
# scheduler = MultiStepLR(optimizer, milestones=list(np.arange(5, 10, 0.5)), gamma=0.5)
scheduler = optim.Adam(model.parameters())

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
        
    scheduler.step()
#     print(scheduler.get_lr())
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s ')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 11s 
	Train Loss: 7.235 | Train PPL: 1387.087
	 Val. Loss: 6.395 |  Val. PPL: 598.848
Epoch: 02 | Time: 0m 11s 
	Train Loss: 5.551 | Train PPL: 257.434
	 Val. Loss: 5.319 |  Val. PPL: 204.119
Epoch: 03 | Time: 0m 12s 
	Train Loss: 5.068 | Train PPL: 158.907
	 Val. Loss: 5.168 |  Val. PPL: 175.635
Epoch: 04 | Time: 0m 11s 
	Train Loss: 4.923 | Train PPL: 137.456
	 Val. Loss: 5.187 |  Val. PPL: 178.903
Epoch: 05 | Time: 0m 10s 
	Train Loss: 4.878 | Train PPL: 131.375
	 Val. Loss: 5.147 |  Val. PPL: 171.952
Epoch: 06 | Time: 0m 11s 
	Train Loss: 4.849 | Train PPL: 127.640
	 Val. Loss: 4.997 |  Val. PPL: 147.926
Epoch: 07 | Time: 0m 11s 
	Train Loss: 4.793 | Train PPL: 120.699
	 Val. Loss: 4.896 |  Val. PPL: 133.767
Epoch: 08 | Time: 0m 11s 
	Train Loss: 4.757 | Train PPL: 116.374
	 Val. Loss: 4.850 |  Val. PPL: 127.755
Epoch: 09 | Time: 0m 11s 
	Train Loss: 4.734 | Train PPL: 113.789
	 Val. Loss: 4.827 |  Val. PPL: 124.880
Epoch: 10 | Time: 0m 11s 
	Train Loss: 4.718 

We'll load the parameters (`state_dict`) that gave our model the best validation loss and run it the model on the test set.

In [41]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.799 | Test PPL: 121.337 |


In the following notebook we'll implement a model that achieves improved test perplexity, but only uses a single layer in the encoder and the decoder.