# 1. Preparations

### 1-1. Import Libraries
- 데이터셋 다운로드와 전처리를 쉽게 하는 torchtext 라이브러리를 import 합니다.


In [None]:
import os
import random
import time
import sys

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data, datasets
import random
import time
import spacy
import numpy as np
from torch import Tensor

### 1-2. Load data
- Field 를 정의합니다.
- IMDB 데이터를 다운받습니다.
- Train,valid,test 데이터셋으로 split 합니다.

In [None]:
# Field for the review text: tokenized with spaCy. include_lengths= True is why
# later cells index batch.text with [0] to get the token tensor.
TEXT = data.Field(tokenize = 'spacy', include_lengths= True)
## HW ##
# Re-initialize the data field so that the data is fed in as fixed-length
# sequences instead of variable-length ones.

# MAX_LEN = 10
# MAX_LEN = 20
# MAX_LEN = 30

# TEXT = data.Field(tokenize = 'spacy', fix_length = MAX_LEN)


# Field for the sentiment label, stored as a float tensor.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# Download IMDB data (about 3mins) and bind the review text and the
# sentiment label to the fields defined above.
train_data, test_data = datasets.IMDB.splits(
    text_field=TEXT,
    label_field=LABEL,
)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.1MB/s]


In [None]:
# Set the random seed for every random number generator the notebook imports,
# not only PyTorch's, so that repeated runs start from the same state.
SEED = 1234

random.seed(SEED)        # Python's `random` module (used for the train/valid split)
np.random.seed(SEED)     # NumPy
torch.manual_seed(SEED)  # PyTorch
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Split train and valid data.
# random.seed() returns None, so its return value must not be passed as
# random_state. Seed the generator first, then hand its state over explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# Report how many examples ended up in each split.
split_reports = (
    ('Number of training examples: {}', train_data),
    ('Number of validation examples: {}', valid_data),
    ('Number of testing examples: {}', test_data),
)
for template, subset in split_reports:
    print(template.format(len(subset)))

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


### 1-3. Cuda Setup
- GPU 사용을 위한 Cuda 설정
- Colab 페이지 상단 메뉴>수정>노트설정에서 GPU 사용 설정이 선행되어야 합니다.


In [None]:
# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
!nvidia-smi

Fri Nov 13 14:49:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 250W |     10MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 2. Preprocess data
- Vocab (단어장) 을 만듭니다.
- Iterator 를 만듭니다. (Iterator 를 통해 batch training 을 위한 batching 과 padding, 그리고 데이터 내 단어들의 인덱스 변환이 이루어집니다.)  

In [None]:
MAX_VOCAB_SIZE = 25000

# Options for the text vocabulary: cap its size, attach GloVe vectors, and
# use a normal-distribution initializer as unk_init.
text_vocab_options = dict(
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data, **text_vocab_options)
LABEL.build_vocab(train_data)


.vector_cache/glove.6B.zip: 862MB [06:33, 2.19MB/s]                           
100%|█████████▉| 399431/400000 [00:16<00:00, 22591.83it/s]

In [None]:
# Report the size of each vocabulary built above.
# NOTE(review): the TEXT count can come out larger than MAX_VOCAB_SIZE;
# presumably special tokens such as <unk>/<pad> are added on top -- confirm
# against the torchtext documentation.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [None]:
# Batching - construct one iterator per split, all with the same batch size
# and all yielding tensors on `device`.
BATCH_SIZE = 32
all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    all_splits,
    batch_sizes=(BATCH_SIZE,) * len(all_splits),
    sort_within_batch=True,
    device=device,
)

## 3. Build Model
- Embedding layer, Transformer layer, Fully-connected layer 로 이루어진 모델을 만듭니다.
- Classification task 에 활용하기 위해 기존 Seq2Seq Transformer 를 변형하여, Transformer Encoder 만을 활용합니다.
- Positional Encoding 식
> $PE_{(pos,2i)} = \sin(pos/10000^{2i/d_{model}})$  
> $PE_{(pos,2i+1)} = \cos(pos/10000^{2i/d_{model}})$








In [None]:
class PositionalEnc(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence, then apply dropout.

    The table is precomputed once for ``max_len`` positions and stored as a
    buffer named ``pe`` of shape [max_len, 1, d_model]; even feature columns
    hold sines and odd feature columns hold cosines.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so trim the angles to the number of odd columns before assigning.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a singleton axis at dim 1 so the table broadcasts over it in forward().
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Tensor whose first dimension indexes positions and whose last
                dimension has size ``d_model``.

        Raises:
            ValueError: If ``x`` has more positions than were precomputed.
        """
        seq_len = x.size(0)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            raise ValueError(
                "sequence length {} exceeds max_len {} of the positional encoding".format(seq_len, max_len)
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [None]:
class TransformerNet(nn.Module):    ## HW: write down one more parameter that the TransformerNet class needs.
    """Sequence classifier built from a Transformer encoder.

    Token ids are embedded, combined with positional encodings, passed through
    a stack of ``nn.TransformerEncoderLayer`` blocks, averaged over the
    non-padding positions and projected by a linear layer.

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        embedding_dim: Embedding size; also the encoder's model dimension.
        hidden_dim: Feed-forward width inside each encoder layer.
        output_dim: Number of values produced per sequence.
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers and
            before the final linear layer.
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_heads, n_layers, dropout, pad_idx):
    # def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_heads, n_layers, dropout, pad_idx, length = MAX_LEN):
        super().__init__()

        # Define parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # Kept so forward() can tell padding tokens apart from real tokens.
        self.pad_idx = pad_idx

        # Define Layers
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        # Positional encoding layer
        self.pe = PositionalEnc(embedding_dim)
        # self.pe = PositionalEnc(embedding_dim, max_len=length)    ## HW: fill in the value passed to PositionalEnc's max_len.

        # Encoder layer
        enc_layer = nn.TransformerEncoderLayer(embedding_dim, n_heads, hidden_dim, dropout=dropout)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        # Fully connected layer
        self.fc = nn.Linear(embedding_dim, output_dim)
        # self.fc = nn.Linear(embedding_dim*length, output_dim)  ## HW: rewrite the input dimension of the fully-connected layer.

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Score a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids, shape [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim] (raw, un-squashed scores).
        """
        # text = [sent len, batch size]

        # pad_mask = [batch size, sent len], True where the token is padding.
        pad_mask = text.eq(self.pad_idx).t()
        # A sequence made only of padding would leave attention nothing to
        # attend to; for such rows disable the mask and treat every position
        # as a real token.
        all_pad = pad_mask.all(dim=1, keepdim=True)
        pad_mask = (pad_mask & ~all_pad).contiguous()

        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]

        pos_encoded = self.pe(embedded)
        # Padding positions are excluded as attention keys so they cannot
        # influence the representations of the real tokens.
        trans_out = self.encoder(pos_encoded, src_key_padding_mask=pad_mask)
        # trans_out = [sent len, batch_size, emb_dim]

        # Mean over the non-padding positions only.
        # keep = [sent len, batch size, 1], 1.0 for positions that count.
        keep = (~pad_mask).t().unsqueeze(-1).to(trans_out.dtype)
        pooled = (trans_out * keep).sum(0) / keep.sum(0)
        final = self.fc(self.dropout(pooled))

        ## TO-DO ##
        # Implement other ways of obtaining a single probability value from
        # the sequence encoded by the Transformer.

        # 1) Take the representation of the last token as the single vector
        #    used for prediction.

        # last = trans_out[-1,:]
        # final = self.fc(self.dropout(last))


        # 2) HOMEWORK: build an fc layer that computes one scalar directly
        #    from all values of the Transformer output.
        # batch = trans_out.shape[1]
        # trans_re = trans_out.permute(1,0,2).reshape(batch,-1)                     # Permute, Reshape trans_out
        # final = self.fc(trans_re)

        return final



In [None]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_HEADS = 2  # embedding_dim must be divisible by n_heads
N_LAYERS = 1
DROPOUT = 0.5

# Vocabulary index of the field's padding token.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = TransformerNet(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)


In [None]:
def count_parameters(model):
    """Return the total element count of the parameters that have requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print('The model has {:,} trainable parameters'.format(count_parameters(model)))

# Load the pretrained embeddings into the embedding layer.
pretrained_embeddings = TEXT.vocab.vectors
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)
    # The copy replaces every row, including the padding row that
    # nn.Embedding(padding_idx=...) starts out as zeros. Reset it so padding
    # tokens contribute a zero vector again.
    model.embedding.weight[PAD_IDX].zero_()

The model has 2,566,929 trainable parameters


## 4. Train model

In [None]:
# Move the model to the target device before building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed from
# the parameters in their final location.
model = model.to(device)
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy computed directly on the raw model outputs (logits).
criterion = nn.BCEWithLogitsLoss().to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Compute the fraction of predictions in a batch that match the labels,
    e.g. 8 correct out of 10 gives 0.8, NOT 8.
    """
    # Squash the raw scores with a sigmoid, then round to the closest integer.
    hard_preds = torch.sigmoid(preds).round()
    # Cast the matches to float so the division below yields a ratio.
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`.

    Returns:
        (mean per-batch loss, mean per-batch accuracy) for the pass.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        # batch.text is indexed with [0] to pick out the token tensor.
        tokens = batch.text[0]
        logits = model(tokens).squeeze(1)
        # logits = model(batch.text).squeeze(1)   ## HOMEWORK: rewrite the prediction for the case where the input has been switched to fixed length.

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model over `iterator` without updating it.

    Returns:
        (mean per-batch loss, mean per-batch accuracy) for the pass.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            # batch.text is indexed with [0] to pick out the token tensor.
            tokens = batch.text[0]
            logits = model(tokens).squeeze(1)
            # logits = model(batch.text).squeeze(1)  ## HOMEWORK: rewrite the prediction for the case where the input has been switched to fixed length.

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


### *Do Training!*

In [None]:
N_EPOCHS = 5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Keep only the weights of the epoch with the best validation loss.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'trans-model.pt')

    # Accuracies are fractions; show them as percentages.
    train_pct = train_acc*100
    valid_pct = valid_acc*100

    print('Epoch: {:02}'.format(epoch+1))
    print('\tTrain Loss: {:.3f} | Train Acc: {:.2f}%'.format(train_loss, train_pct))
    print('\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%'.format(valid_loss, valid_pct))

Epoch: 01
	Train Loss: 0.514 | Train Acc: 72.17%
	 Val. Loss: 0.372 |  Val. Acc: 84.89%
Epoch: 02
	Train Loss: 0.317 | Train Acc: 86.75%
	 Val. Loss: 0.342 |  Val. Acc: 86.82%
Epoch: 03
	Train Loss: 0.236 | Train Acc: 90.83%
	 Val. Loss: 0.365 |  Val. Acc: 87.98%
Epoch: 04
	Train Loss: 0.177 | Train Acc: 93.23%
	 Val. Loss: 0.442 |  Val. Acc: 88.27%
Epoch: 05
	Train Loss: 0.128 | Train Acc: 95.30%
	 Val. Loss: 0.443 |  Val. Acc: 88.40%


In [None]:
# Restore the checkpoint written during training. map_location makes the load
# work even when the checkpoint was saved on a different device (e.g. saved on
# GPU, reloaded on a CPU-only runtime).
model.load_state_dict(torch.load('trans-model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print('Test Loss: {:.3f} | Test Acc: {:.2f}%'.format(test_loss, test_acc*100))

Test Loss: 0.349 | Test Acc: 86.75%


## 5. Test model
우리가 직접 예문을 작성해서 트레인된 모델에서 예문을 어떻게 평가하는지 확인합니다.



In [None]:
# Use spaCy as the tokenizer.
nlp = spacy.load('en')

# Check the result the trained model gives for a sentence typed in by the user.
def predict_sentiment(model, sentence):
    """Return the model's sigmoid score for a raw sentence.

    The tokens, their vocabulary indices and the tensor shape are printed
    along the way for inspection.

    Args:
        model: Trained model; it is called with a LongTensor of shape
            [sent len, 1] and must return a single value.
        sentence: Raw text to score.

    Returns:
        float: Sigmoid of the model output. Per the example cells, values
        near 0 indicate negative and values near 1 indicate positive.

    Raises:
        ValueError: If `sentence` produces no tokens.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # Tokenization
    if not tokenized:
        # An empty sequence cannot be scored; fail with a clear message.
        raise ValueError("sentence must contain at least one token")
    print(tokenized)
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]   # Map each token to its index in the vocab built above
    # Pads the rest when the user's sentence is shorter than MAX_LEN (and cuts it
    # at MAX_LEN); uses the field's own pad token rather than a hard-coded string:
    # indexed = [indexed[i] if i<len(indexed) else TEXT.vocab.stoi[TEXT.pad_token] for i in range(MAX_LEN) ]
    print(indexed)
    tensor = torch.LongTensor(indexed).to(device)   # Turn the indexed sequence into a torch tensor
    print(tensor.shape)
    tensor = tensor.unsqueeze(1)   # Add the batch dimension to the input tensor
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))  # Sigmoid turns the model output into a probability
    return prediction.item()  # Return the prediction as a Python float

In [None]:
predict_sentiment(model, "This film is terrible") # We can see that a very low probability comes out (negative).

['This', 'film', 'is', 'terrible']
[66, 24, 9, 447]
torch.Size([4])


0.0018361395923420787

In [None]:
predict_sentiment(model, "This film is great") # We can see that a very high probability comes out (positive).

['This', 'film', 'is', 'great']
[66, 24, 9, 103]
torch.Size([4])


0.9985785484313965