# Welcome to torch study without torchtext
## 1월 3주차 : Convolutional Sentiment Analysis
논문 디테일 구현해보기
- load pretrained word embedding
- unk token initialize with uniform distribution
- ada-delta optimizer 
- L2 weight norm

In [15]:
import re
import torch
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
import platform

In [16]:
torch.__version__

'1.7.1'

In [17]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

https://github.com/yoonkim/CNN_sentence 레파지토리 clone 한 경로 

In [18]:
if 'Windows' in platform.platform():
    path = 'C:/Users/long8v'
else:
    path = '/home/long8v'

# 

## 사용자 Dataset정의
- I/O 
- preprocess
- tokenizer
- Vocab 객체 만들기<br>
   = vocab_len<br>
   = stoi<br>
   = itos
- k-fold split

In [19]:
from collections import defaultdict

In [276]:
# 규민님 코드 
# tokenizer가 변경되면 Vocab도 변경되어야하니까 tokenzier 부분은 Dataset에서 하는게 맞지 않을까?
# Vocab에서 build vocab하는건 토큰화된 문장들이 들어있는 이중리스트가 되어야 나중에 편할 것 같음!
class Vocab:    
    def build_vocabs(self, sentence_list):
        from collections import defaultdict
        self.stoi_dict = defaultdict(lambda: 0)
        self.stoi_dict['<UNK>'] = 0
        self.stoi_dict['<PAD>'] = 1
        _index = 2
        for sentence in sentence_list:
            tokens_list = sentence
            for word in tokens_list:
                if word in self.stoi_dict:
                    pass
                else:
                    self.stoi_dict[word] = _index
                    _index += 1
        self.itos_dict = {v:k for k, v in self.stoi_dict.items()}
        
    def stoi(self, token_list):
#         if type(sentence) == str: # sentence 한 개 가 들어온 경우
        return [self.stoi_dict[word] for word in token_list]
#         elif type(sentence) == list: # sentence 여러 개가 리스트로 들어온 경우
#             return [self.stoi(i) for i in sentence]

    def itos(self, indices):
#         if type(indices[0]) == int : # sentence 한 개가 들어온 경우, 공백으로 join해서 문장으로 만들어줌
        return " ".join([self.itos_dict[index] for index in indices if self.itos_dict[index] != '<PAD>'])
#         elif type(indices) == list: # sentence 여러 개가 들어온 경우, 공백으로 join한 문장 리스트를 만들어줌
#             return [self.itos(i) for i in indices]

In [280]:
from sklearn.model_selection import KFold

In [294]:
# #added some parameters
# kf = KFold(n_splits = 2, shuffle = True, random_state = 2)
# result = list(kf.split(df))[0]
# print(result)

# train = df.iloc[result[0]]
# test =  df.iloc[result[1]]

In [432]:
class CNNDataset: # 굳이 Dataset 상속을 안해줘도 된다고 함
    def __init__(self, path, kfold=10):
        ## I/O 하기
        with open(f'{path}/CNN_sentence/rt-polarity.pos', 'r', encoding = "ISO-8859-1") as f:
            pos = f.readlines()
        with open(f'{path}/CNN_sentence/rt-polarity.neg', 'r', encoding = "ISO-8859-1") as f:
            neg = f.readlines()
        pos = [(p, 1) for p in pos]
        neg = [(n, 0) for n in neg]
        data = pos + neg
        zipped_data = list(zip(*data))
        
        # 전처리하는 과정 __getitem__에서 안 한 이유는 vocab 만들때 같은 전처리를 사용해야해서..!!
        self.text = zipped_data[0]
        self.text = [self.clean_str(sen) for sen in self.text]
        self.text = [[word for word in self.tokenizer(sen)] for sen in self.text]
        self.label = zipped_data[1]
        
        # vocab 만들기 -> class 안에 다른 class instance를 정의하는게 보편적인지는 잘 모르겠음
        self.vocab = Vocab()
        self.vocab.build_vocabs(self.text)    
        self.pretrained_embedding = self.get_pretrained_embeddings()

    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        sample_label = self.label[idx]
        sample_text = self.text[idx]
        sample_text = self.vocab.stoi(sample_text)
        return torch.Tensor(sample_text).long(), sample_label
    
    def tokenizer(self, sentence):
        return sentence.split()
    
    def get_pretrained_embeddings(self):
        from gensim.models import KeyedVectors
        w2v = KeyedVectors.load_word2vec_format(f'{path}/Downloads/GoogleNews-vectors-negative300.bin.gz', 
                                        binary=True)
        pretrained_embedding = []
        for word in self.vocab.stoi_dict:
            if word in w2v:
                pretrained_embedding.append(w2v[word])
            else: 
                pretrained_embedding.append(np.random.uniform(-0.25, 0.25, 300))
        return torch.from_numpy(np.array(pretrained_embedding))        
    
    def clean_str(self, string, TREC=False):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Every dataset is lower cased except for TREC
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
        string = re.sub(r"\'s", " \'s", string) 
        string = re.sub(r"\'ve", " \'ve", string) 
        string = re.sub(r"n\'t", " n\'t", string) 
        string = re.sub(r"\'re", " \'re", string) 
        string = re.sub(r"\'d", " \'d", string) 
        string = re.sub(r"\'ll", " \'ll", string) 
        string = re.sub(r",", " , ", string) 
        string = re.sub(r"!", " ! ", string) 
        string = re.sub(r"\(", " \( ", string) 
        string = re.sub(r"\)", " \) ", string) 
        string = re.sub(r"\?", " \? ", string) 
        string = re.sub(r"\s{2,}", " ", string)     
        return string.strip() if TREC else string.strip().lower()

In [433]:
string = 'asdf aslk'
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
string

'asdf aslk'

In [434]:
train_ds = CNNDataset(path)

In [435]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [436]:
for data, label in dataset:
    print(data)
    print(label)
    print(dataset.vocab.itos(data))
    break

[2, 3, 4, 5, 6, 7, 2, 8, 9, 10, 11, 12, 13, 14, 15, 10, 16, 6, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
1
the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal


In [437]:
pad_sequence([torch.Tensor([1,2]),torch.Tensor([3,4,5])], padding_value=0, )

tensor([[1., 3.],
        [2., 4.],
        [0., 5.]])

이쯤 되니까 torchtext를 그냥 사용하는 것도 나쁘지 않다는 생각이 들었음..<br>
min_df가 있으면 이제 또 defaultdict(int)해갖고 해야될텐데...

In [438]:
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    return xx_pad, yy

In [439]:
# def pad_collate(batch):
#     (x, y) = zip(*batch)
#     max_len = max([len(x) for xx in x])
#     x_padded = []
#     for xx in x:
#         if len(xx) < max_len:
#             xx += [0 for _ in range(max_len - len(xx))]
#         x_padded.append(xx)
#     return x_padded, y

In [442]:
train_dl = DataLoader(train_ds, batch_size=16, collate_fn=pad_collate, drop_last=False)

In [443]:
for x, y in train_dl:
    print(x)
    print(y)
    break

tensor([[  2,   3,   4,   5,   6,   7,   2,   8,   9,  10,  11,  12,  13,  14,
          15,  10,  16,   6,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
          27,  28,  29,  30,  31,  32,   0,   0,   0,   0],
        [  2,  33,  34,  35,  36,   2,  37,  36,   2,  38,  39,   4,  40,  41,
          14,  18,  42,  36,  43,  44,  45,  46,  47,  48,  49,  50,  51,  10,
          52,  53,  36,  54,  55,  55,  56,  10,  57,  58],
        [ 59,  60,  61,  62,  63,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [ 64,  65,  66,  67,   6,  68,   6,   2,  69,   6,  70,  71,  25,  72,
           4,  18,  73,  74,   6,  75,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [ 76,  77,  78,  79,  25,  80,  81,  82,  14,  10,  40,  83,  13,  84,
          85,  14,  86,  87,  88,  89,  67,  90, 

## Build the Model

CNN은 보통 이미지에서 많이 사용된다. 이미지는 보통 가로, 세로로 2 차원이다.(RGB 차원은 추후에 논의). 그에 반해 text는 1차원이다. 하지만 우리는 단어를 word embedding을 통해 차원을 늘린다. 그래서 우리가 단어를 2차원으로 보는 이유다. 

우리는 [ n x emb_dim ]인 filter를 사용하게 된다. 이것은 n개의 연속적인 단어를 커버하고, 우리의 너비는 emb_dim이 되게된다. 두개의 단어를 한번에 보는 필터는(=bi-grams) [ 2 x emb_dim ] 필터가 될 것이다.
필터는 이미지의 아래로 내려가면서 bi-gram을 커버하고 결과가 계산된다. 결과의 output vector는 이미지의 높이 - 필터의 높이 + 1 만큼 되게 된다.

이 예시는 하나의 필터가 어떻게 계산하는지를 보여준다. 그러나 우리의 모델은 이러한 필터를 여러개 사용하게 된다. 주요 아이디어는 각각의 필터가 다른 피쳐를 뽑는다는 것이다. 우리의 모델에서는 다른 크기의 필터를 쓸 것이다. 높이 3, 4, 5의 필터를 각각 100개씩 사용할 것이다. 이를 통해 tri-gram, 4-gram, 5-gram을 사용하는 효과를 가졌으면 좋겠다.

다음 단계는 pooling을 하는 과정이다. 이것은 각각의 단어 벡터에서 평균을 구한 FastText와 비슷하다. 그러나 우리는 대신 max value를 구할 것이다. 

In [472]:
class CNN(nn.Module):
    def __init__(self, pretrained_embedding, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
                
        self.static_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.nonstatic_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.static_embedding.from_pretrained(pretrained_embedding.clone().detach())
        self.nonstatic_embedding.from_pretrained(pretrained_embedding.clone().detach(), 
                                                freeze=False)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text = [batch size, sent len]
        ## static embedding 
        embedded = self.nonstatic_embedding(text)
#         print(f'|embedded_shape| {embedded.shape}')
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]            
        return self.fc(cat)

We can also implement the above model using 1-dimensional convolutional layers, where the embedding dimension is the "depth" of the filter and the number of tokens in the sentence is the width.

We'll run our tests in this notebook using the 2-dimensional convolutional model, but leave the implementation for the 1-dimensional model below for anyone interested. 

We create an instance of our `CNN` class. 

We can change `CNN` to `CNN1d` if we want to run the 1-dimensional convolutional model, noting that both models give almost identical results.

In [473]:
INPUT_DIM = len(train_ds.vocab.stoi_dict)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = train_ds.vocab.stoi_dict['<PAD>']
pretrained_vector = train_ds.pretrained_embedding

model = CNN(pretrained_vector, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, 
            FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

Checking the number of parameters in our model we can see it has about the same as the FastText model. 

Both the `CNN` and the `CNN1d` models have the exact same number of parameters.

In [474]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,874,001 trainable parameters


Next, we'll load the pre-trained embeddings

In [475]:
# pretrained_embeddings = TEXT.vocab.vectors

# model.embedding.weight.data.copy_(pretrained_embeddings)

Then zero the initial weights of the unknown and padding tokens.

## Train the Model

Training is the same as before. We initialize the optimizer, loss function (criterion) and place the model and criterion on the GPU (if available)

In [476]:
import torch.optim as optim
device = 'cpu'
optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

We implement the function to calculate accuracy...

In [477]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, max_pool1d torch.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

We define a function for training our model...

**Note**: as we are using dropout again, we must remember to use `model.train()` to ensure the dropout is "turned on" while training.

In [478]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()

        predictions = model(batch[0]).squeeze(1)
        
        loss = criterion(predictions, torch.Tensor(batch[1]))
        
        acc = binary_accuracy(predictions, torch.Tensor(batch[1]))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

We define a function for testing our model...

**Note**: again, as we are now using dropout, we must remember to use `model.eval()` to ensure the dropout is "turned off" while evaluating.

In [479]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch[0]).squeeze(1)

            loss = criterion(predictions, torch.Tensor(batch[1]))
            
            acc = binary_accuracy(predictions, torch.Tensor(batch[1]))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Let's define our function to tell us how long epochs take.

In [480]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

Finally, we train our model...

In [481]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_dl, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

NameError: name 'valid_iterator' is not defined

We get test results comparable to the previous 2 models!

In [None]:
model.load_state_dict(torch.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

## User Input

And again, as a sanity check we can check some input sentences

**Note**: As mentioned in the implementation details, the input sentence has to be at least as long as the largest filter height used. We modify our `predict_sentiment` function to also accept a minimum length argument. If the tokenized input sentence is less than `min_len` tokens, we append padding tokens (`<pad>`) to make it `min_len` tokens.

In [None]:
import spacy
tokenizer = lambda e: e.split()

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok for tok in tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

An example negative review...

In [None]:
predict_sentiment(model, "This film is terrible")

An example positive review...

In [None]:
predict_sentiment(model, "This film is great")