# Welcome to torch study without torchtext
## 1월 3주차 : Convolutional Sentiment Analysis
논문 디테일 구현해보기
- load pretrained word embedding (v)
- unk token initialize with uniform distribution (v)
- ada-delta optimizer (v)
- L2 weight norm (v)
- L2 weight constraint (v)
- multi-channel model (v)
- K-fold (v)

In [3]:
import re
import torch
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
import platform

In [4]:
torch.cuda.is_available() 
device = 'cuda: 0'

In [5]:
torch.__version__

'1.7.1'

In [6]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

https://github.com/yoonkim/CNN_sentence 레파지토리 clone 한 경로 

In [7]:
if 'Windows' in platform.platform():
    path = 'C:/Users/long8v'
else:
    path = '/home/long8v'

## 사용자 Dataset정의
- I/O 
- preprocess
- tokenizer
- Vocab 객체 만들기<br>
   = vocab_len<br>
   = stoi<br>
   = itos
- k-fold split

In [8]:
from collections import defaultdict

In [9]:
# 규민님 코드 
# tokenizer가 변경되면 Vocab도 변경되어야하니까 tokenzier 부분은 Dataset에서 하는게 맞지 않을까?
# Vocab에서 build vocab하는건 토큰화된 문장들이 들어있는 이중리스트가 되어야 나중에 편할 것 같음!
class Vocab:    
    def build_vocabs(self, sentence_list):
        from collections import defaultdict
        self.stoi_dict = defaultdict(lambda: 0) # 원래 <UNK>로 되어있었음
        self.stoi_dict['<UNK>'] = 0
        self.stoi_dict['<PAD>'] = 1
        _index = 2
        for sentence in sentence_list:
            tokens_list = sentence
            for word in tokens_list:
                if word in self.stoi_dict:
                    pass
                else:
                    self.stoi_dict[word] = _index
                    _index += 1
        self.itos_dict = {v:k for k, v in self.stoi_dict.items()}
        
    def stoi(self, token_list):
#         if type(sentence) == str: # sentence 한 개 가 들어온 경우
        return [self.stoi_dict[word] for word in token_list]
#         elif type(sentence) == list: # sentence 여러 개가 리스트로 들어온 경우
#             return [self.stoi(i) for i in sentence]

    def itos(self, indices):
#         if type(indices[0]) == int : # sentence 한 개가 들어온 경우, 공백으로 join해서 문장으로 만들어줌
        return " ".join([self.itos_dict[index] for index in indices if self.itos_dict[index] != '<PAD>'])
#         elif type(indices) == list: # sentence 여러 개가 들어온 경우, 공백으로 join한 문장 리스트를 만들어줌
#             return [self.itos(i) for i in indices]

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
## I/O 하기
with open(f'{path}/CNN_sentence/rt-polarity.pos', 'r', encoding = "ISO-8859-1") as f:
    pos = f.readlines()
with open(f'{path}/CNN_sentence/rt-polarity.neg', 'r', encoding = "ISO-8859-1") as f:
    neg = f.readlines()
pos = [(p, 1) for p in pos]
neg = [(n, 0) for n in neg]
data = pos + neg

In [12]:
# train, valid = train_test_split(data)

## OOV

In [13]:
from gensim.models import KeyedVectors
# 이거 load하는데 너무 오래걸려서 객체 가져온 다음에 train, valid할 때 재활용하는게 나을듯하다
w2v = KeyedVectors.load_word2vec_format(f'{path}/Downloads/GoogleNews-vectors-negative300.bin.gz', 
                                binary=True)

In [14]:
# vocab = Vocab()
# vocab.build_vocabs(text) 

In [15]:
class CNNDataset: # 굳이 Dataset 상속을 안해줘도 된다고 함
    def __init__(self, path, w2v):
        data = self.load_data(path)
        zipped_data = list(zip(*data))
        
        # 전처리하는 과정 __getitem__에서 안 한 이유는 vocab 만들때 같은 전처리를 사용해야해서..!!
        self.text = zipped_data[0]
        self.text = [self.clean_str(sen) for sen in self.text]
        self.text = [[word for word in self.tokenizer(sen)] for sen in self.text]
        self.label = zipped_data[1]
        
        # vocab 만들기 -> class 안에 다른 class instance를 정의하는게 보편적인지는 잘 모르겠음
        # ...이렇게 하면 문제점이 생기는게, train, valid, test 따로따로 build_vocab을 만들어서 안됨!!! 어떡하지
        self.vocab = Vocab()
        self.vocab.build_vocabs(self.text)    
        self.pretrained_embedding = self.get_pretrained_embeddings()
        self.w2v = w2v

    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        sample_label = self.label[idx]
        sample_text = self.text[idx]
        sample_text = self.vocab.stoi(sample_text)
        return torch.Tensor(sample_text).long(), sample_label
    
    def load_data(self, path):
        ## I/O 하기
        with open(f'{path}/CNN_sentence/rt-polarity.pos', 'r', encoding = "ISO-8859-1") as f:
            pos = f.readlines()
        with open(f'{path}/CNN_sentence/rt-polarity.neg', 'r', encoding = "ISO-8859-1") as f:
            neg = f.readlines()
        pos = [(p, 1) for p in pos]
        neg = [(n, 0) for n in neg]
        return pos + neg
    
    def tokenizer(self, sentence):
        return sentence.split()
    
    def get_pretrained_embeddings(self):
        pretrained_embedding = []
        for word in self.vocab.stoi_dict:
            if word in w2v:
                pretrained_embedding.append(w2v[word])
            else: 
                pretrained_embedding.append(np.random.uniform(-0.25, 0.25, 300))
        return torch.from_numpy(np.array(pretrained_embedding))        
    
    def clean_str(self, string, TREC=False):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Every dataset is lower cased except for TREC
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
        string = re.sub(r"\'s", " \'s", string) 
        string = re.sub(r"\'ve", " \'ve", string) 
        string = re.sub(r"n\'t", " n\'t", string) 
        string = re.sub(r"\'re", " \'re", string) 
        string = re.sub(r"\'d", " \'d", string) 
        string = re.sub(r"\'ll", " \'ll", string) 
        string = re.sub(r",", " , ", string) 
        string = re.sub(r"!", " ! ", string) 
        string = re.sub(r"\(", " \( ", string) 
        string = re.sub(r"\)", " \) ", string) 
        string = re.sub(r"\?", " \? ", string) 
        string = re.sub(r"\s{2,}", " ", string)     
        return string.strip() if TREC else string.strip().lower()

In [16]:
dataset = CNNDataset(path, w2v)

In [17]:
for data, label in dataset:
    print(data)
    print(label)
    print(dataset.vocab.itos(np.array(data)))
    break

tensor([ 2,  3,  4,  5,  6,  7,  2,  8,  9, 10, 11, 12, 13, 14, 15, 10, 16,  6,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])
1
the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal


# k-fold

In [18]:
from torch.utils.data.dataset import Subset

In [19]:
Subset(dataset, [1, 2,3,4])

<torch.utils.data.dataset.Subset at 0x7f13058ccfd0>

In [20]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split
from sklearn.model_selection import KFold

In [35]:
len(kf_list_index[0][0]), len(kf_list_index[0][1])

(9595, 1067)

In [66]:
kf = KFold(n_splits=10, shuffle=True)
kf_splitted = kf.split(dataset)
kf_list_index = list(kf_splitted)
kf_folded = [(Subset(dataset, train_idx), Subset(dataset, test_idx)) 
             for train_idx, test_idx in kf_list_index]

In [67]:
kf_list_index

[(array([    1,     2,     3, ..., 10657, 10660, 10661]),
  array([    0,    15,    18, ..., 10656, 10658, 10659])),
 (array([    0,     1,     2, ..., 10657, 10658, 10659]),
  array([    8,    12,    13, ..., 10640, 10660, 10661])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([   11,    51,    58, ..., 10641, 10647, 10657])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([    6,    16,    19, ..., 10635, 10646, 10648])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([   10,    17,    20, ..., 10617, 10628, 10637])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([   22,    41,    44, ..., 10650, 10652, 10654])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([    5,     7,    54, ..., 10627, 10644, 10651])),
 (array([    0,     1,     2, ..., 10659, 10660, 10661]),
  array([    4,    28,    31, ..., 10636, 10645, 10653])),
 (array([    0,     2,     3, ..., 10659, 10660, 10661]),
  arra

In [65]:
train_ds, valid_ds = kf_folded[0][0], kf_folded[0][1]

### 나중에 패키지화 하면 해봐야지 지금은 복잡해서 못하겠음

이쯤 되니까 torchtext를 그냥 사용하는 것도 나쁘지 않다는 생각이 들었음..<br>
min_df가 있으면 이제 또 defaultdict(int)해갖고 해야될텐데...

In [44]:
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    return xx_pad, yy

In [21]:
train_ds, valid_ds = random_split(dataset, [int(len(dataset)*0.9), len(dataset) - int(len(dataset)*0.9)])

In [45]:
bs = 50

In [46]:
train_dl = DataLoader(train_ds, batch_size=bs, collate_fn=pad_collate, drop_last=False)
valid_dl = DataLoader(valid_ds, batch_size=bs, collate_fn=pad_collate, drop_last=False)

## Build the Model

CNN은 보통 이미지에서 많이 사용된다. 이미지는 보통 가로, 세로로 2 차원이다.(RGB 차원은 추후에 논의). 그에 반해 text는 1차원이다. 하지만 우리는 단어를 word embedding을 통해 차원을 늘린다. 그래서 우리가 단어를 2차원으로 보는 이유다. 

우리는 [ n x emb_dim ]인 filter를 사용하게 된다. 이것은 n개의 연속적인 단어를 커버하고, 우리의 너비는 emb_dim이 되게된다. 두개의 단어를 한번에 보는 필터는(=bi-grams) [ 2 x emb_dim ] 필터가 될 것이다.
필터는 이미지의 아래로 내려가면서 bi-gram을 커버하고 결과가 계산된다. 결과의 output vector는 이미지의 높이 - 필터의 높이 + 1 만큼 되게 된다.

이 예시는 하나의 필터가 어떻게 계산하는지를 보여준다. 그러나 우리의 모델은 이러한 필터를 여러개 사용하게 된다. 주요 아이디어는 각각의 필터가 다른 피쳐를 뽑는다는 것이다. 우리의 모델에서는 다른 크기의 필터를 쓸 것이다. 높이 3, 4, 5의 필터를 각각 100개씩 사용할 것이다. 이를 통해 tri-gram, 4-gram, 5-gram을 사용하는 효과를 가졌으면 좋겠다.

다음 단계는 pooling을 하는 과정이다. 이것은 각각의 단어 벡터에서 평균을 구한 FastText와 비슷하다. 그러나 우리는 대신 max value를 구할 것이다. 

In [47]:
torch.Tensor([1,2,3,4]).add_(torch.Tensor([1,2,3,4]))

tensor([2., 4., 6., 8.])

## multi-channel model

In [48]:
class CNN(nn.Module):
    def __init__(self, pretrained_embedding, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        # we experiment with having two 'channels' of word vector
        # ... each filter is applied to calculate c_i
        # ... and the results are added to cacluate c_i
        self.static_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.nonstatic_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.static_embedding.from_pretrained(pretrained_embedding.clone().detach())
        self.nonstatic_embedding.from_pretrained(pretrained_embedding.clone().detach(), 
                                                 max_norm=3.0, freeze=False)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout()
#         self.softmax = nn.Softmax(dim=1)
        
    def forward(self, text):
        # text = [batch size, sent len]
        ## static embedding 
        embedded = self.nonstatic_embedding(text)
        embedded_static = self.static_embedding(text)
#         print(f'|embedded_shape| {embedded.shape}')
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        embedded_static = embedded_static.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) + F.relu(conv(embedded_static)).squeeze(3) 
                  for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 
        cat = torch.cat(pooled, dim = 1)
        #cat = [batch size, n_filters * len(filter_sizes)] 
        output = self.dropout(cat)
        output = self.fc(output)
        # ouput [batch size, output_dim]
        return output

We can also implement the above model using 1-dimensional convolutional layers, where the embedding dimension is the "depth" of the filter and the number of tokens in the sentence is the width.

We'll run our tests in this notebook using the 2-dimensional convolutional model, but leave the implementation for the 1-dimensional model below for anyone interested. 

We create an instance of our `CNN` class. 

We can change `CNN` to `CNN1d` if we want to run the 1-dimensional convolutional model, noting that both models give almost identical results.

In [49]:
INPUT_DIM = len(dataset.vocab.stoi_dict)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 2
DROPOUT = 0.5
PAD_IDX = dataset.vocab.stoi_dict['<PAD>']
pretrained_vector = dataset.pretrained_embedding

model = CNN(pretrained_vector, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, 
            FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

Checking the number of parameters in our model we can see it has about the same as the FastText model. 

Both the `CNN` and the `CNN1d` models have the exact same number of parameters.

In [50]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,621,102 trainable parameters


Next, we'll load the pre-trained embeddings

Then zero the initial weights of the unknown and padding tokens.

## Train the Model

Training is the same as before. We initialize the optimizer, loss function (criterion) and place the model and criterion on the GPU (if available)

In [51]:
for idx, a in enumerate([(1,2), (3,4)]):
    print(a)

(1, 2)
(3, 4)


In [52]:
import torch.optim as optim
device = 'cpu'
optimizer = optim.Adam(model.parameters())
optimizer = optim.Adadelta(model.parameters(), rho=0.95)
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

We implement the function to calculate accuracy...

In [53]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, max_pool1d torch.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [54]:
torch.argmax(torch.Tensor([[1,2,3,4],[1,2,3,4]]), dim=1)

tensor([3, 3])

In [55]:
def softmax_accuracy(preds, y):
    #round predictions to the closest integer
    argmax = torch.argmax(preds, dim=1)
    correct = (argmax == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

We define a function for training our model...

**Note**: as we are using dropout again, we must remember to use `model.train()` to ensure the dropout is "turned on" while training.

### `TORCH.CLAMP`
**torch.clamp(input, min, max, *, out=None)** → Tensor
Clamp all elements in input into the range [ min, max ] and return a resulting tensor:
\begin{cases} \text{min} & \text{if } x_i < \text{min} \\ x_i & \text{if } \text{min} \leq x_i \leq \text{max} \\ \text{max} & \text{if } x_i > \text{max} \end{cases}
	
 
If input is of type FloatTensor or DoubleTensor, args min and max must be real numbers, otherwise they should be integers.



In [56]:
for name, param in model.named_parameters():
    max_val = 3
    eps = 1e-12
    if 'fc.weight' in name:
        norm = torch.norm(param, 2, dim=0)
        desired = torch.clamp(norm, 0, max_val)
        param = param * (desired / (eps + norm))

In [57]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()

        predictions = model(batch[0])

        loss = criterion(predictions, torch.Tensor(batch[1]).long())
        
#         ## l2 weight norm https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
#         l2_lambda = 0.01
#         l2_reg = torch.tensor(0., requires_grad=False)
#         for param in model.parameters():
#             l2_reg.data.add_(torch.sqrt(torch.norm(param)))
#             loss.add_(l2_lambda * l2_reg)

        acc = softmax_accuracy(predictions, torch.Tensor(batch[1]).long())
        loss.backward()
        optimizer.step()

                
        ## max-norm 
        for name, param in model.named_parameters():
            max_val = 3
            eps = 1e-12
            if 'fc.weight' in name:
                norm = torch.norm(param, 2, dim=0)
                desired = torch.clamp(norm, 0, max_val)
                param.data *= (desired / (eps + norm))
        
    
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

We define a function for testing our model...

**Note**: again, as we are now using dropout, we must remember to use `model.eval()` to ensure the dropout is "turned off" while evaluating.

In [58]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            predictions = model(batch[0]).squeeze(1)

            loss = criterion(predictions, torch.Tensor(batch[1]).long())
            
            acc = softmax_accuracy(predictions, torch.Tensor(batch[1]).long())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

Let's define our function to tell us how long epochs take.

In [59]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

Finally, we train our model...

## CrossEntropyLoss

In [60]:
>>> loss = nn.CrossEntropyLoss()
>>> input = torch.randn(3, 5, requires_grad=True)
>>> target = torch.empty(3, dtype=torch.long).random_(5)
>>> output = loss(input, target)
>>> output.backward()

In [61]:
target

tensor([0, 2, 0])

In [62]:
input.shape, target.shape

(torch.Size([3, 5]), torch.Size([3]))

In [63]:
N_EPOCHS = 20

best_valid_accuracy = float(0)

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_dl, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dl, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_acc > best_valid_accuracy:
        best_valid_accuracy = valid_acc
        torch.save(model.state_dict(), f'{path}/torch_study/data/tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 37s
	Train Loss: 0.056 | Train Acc: 99.00%
	 Val. Loss: 12.209 |  Val. Acc: 0.00%


KeyboardInterrupt: 

We get test results comparable to the previous 2 models!

In [40]:
model.load_state_dict(torch.load(f'{path}/torch_study/data/tut4-model.pt'))

test_loss, test_acc = evaluate(model, valid_dl, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.876 | Test Acc: 65.14%


## User Input

And again, as a sanity check we can check some input sentences

**Note**: As mentioned in the implementation details, the input sentence has to be at least as long as the largest filter height used. We modify our `predict_sentiment` function to also accept a minimum length argument. If the tokenized input sentence is less than `min_len` tokens, we append padding tokens (`<pad>`) to make it `min_len` tokens.

In [41]:
tokenizer = lambda e: e.split()

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok for tok in tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [dataset.vocab.stoi_dict[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.argmax(model(tensor))
    return prediction.item()

An example negative review...

In [42]:
predict_sentiment(model, "This film is terrible")

1

An example positive review...

In [43]:
predict_sentiment(model, "This film is great")

1