# Data Download

In [1]:
from Korpora import Korpora
import pickle

In [2]:
# Korpora.fetch('namuwikitext')

In [3]:
# corpus = Korpora.load('namuwikitext')
# with open('kor.p', 'wb') as f:
#     pickle.dump(corpus, f)

In [4]:
with open('kor.p', 'rb') as f:
    corpus = pickle.load(f)

# data preprocess

In [5]:
import mecab
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim

import torchtext
import sys
sys.path.append('../source')
from txt_cleaner.clean.master import MasterCleaner
from txt_cleaner.utils import *
from torch8text.data import Vocab, Field

## field 1: mecab 사용 field

In [6]:
pos = mecab.MeCab()

def tokenize_pos(inp):
    if type(inp) == str:
        return pos.morphs(inp)
    if type(inp) == list:
        return [tokenize_pos(i) for i in inp]
# pos.morphs(['안녕하세요'])

In [7]:
tokenize_pos('안녕하세요'), tokenize_pos(['안녕하세요', '안녕?'])

(['안녕', '하', '세요'], [['안녕', '하', '세요'], ['안녕', '?']])

### cleaner
https://github.com/tndls9304/nlp_torch_study/tree/master/txt_cleaner

In [8]:
config = json_reader('../source/txt_cleaner/cleaner_config.json')

size 1 dictionary is read from ../source/txt_cleaner/cleaner_config.json


In [9]:
config['minimum_space_count'] = 2
config

{'minimum_space_count': 2}

In [10]:
cleaner = MasterCleaner(config)
cleaner.cleaning('안녕하세요? 반갑습니다! 행복하세요~**')

'안녕하세요? 반갑습니다! 행복하세요'

In [11]:
tokenize_pos(['안녕하세요', '안녕'])

[['안녕', '하', '세요'], ['안녕']]

In [12]:
mecab_field = Field(tokenize = tokenize_pos, 
                 preprocessing = cleaner.cleaning,
                    init_token = False,
                    eos_token = False
                )

In [13]:
train = [text for text in corpus.train.texts if cleaner.cleaning(text)]
mecab_field.build_vocab(train)
mecab_field.preprocess('안녕하세요 룰루랄라 ㅇㅇㄹ')

['안녕', '하', '세요', '룰루랄라']

## field 2:  chr-level field

In [14]:
cleaner.cleaning('아')

''

In [15]:
cleaner = MasterCleaner({'minimum_space_count':0})
chr_field = Field(tokenize = list, 
                 preprocessing = lambda e: cleaner.cleaning(e) if len(e) > 1 else e,
                  init_token = False,
                  eos_token = False,
                )

In [16]:
chr_field.build_vocab(train)

In [17]:
chr_field.process('안녕하세요')

[1]

# dataset, dataloader
## 헷갈리는 부분
bi-directional LSTM을 쓸건데 이게 다음 단어 예측하는 LM만 데이터를 구성하면 되나? 아니면 뒤에서부터 앞의 단어를 예측하는 LM도 구성해서 concat해야 하나? -> 일단 전자라고 생각하고 함

In [38]:
from collections import namedtuple  
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class ELMoDataset(Dataset):
    def __init__(self, src, mecab_field, chr_field):
        self.src = src
        self.mecab_field = mecab_field
        self.chr_field = chr_field
        self.named_tuple = namedtuple('data', ['src', 'trg', 'src_chr'])
        
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, idx):
        return self.named_tuple(self.getitem(idx), self.getitem(idx)[1:], self.getitem(idx, is_char=True))
    
    def getitem(self, idx, is_char=False):
        data = self.src[idx]
        tokenize_data = self.mecab_field.preprocess(data)
        if is_char:
            chrs = chr_field.preprocess(tokenize_data)
            pad_chrs = self.chr_field.pad_process(tokenize_data, max_len = 3)
            return pad_chrs
        return torch.Tensor(self.mecab_field.vocab.stoi(tokenize_data)).long()

In [39]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [40]:
ds = ELMoDataset(train, mecab_field, chr_field)

In [41]:
max_len = max([len(_) for _ in mecab_field.vocab.stoi_dict])

In [42]:
max_len

6

In [43]:
X  = [torch.tensor([[72,  0,  0,  0,  0]]), torch.tensor([[0, 0, 0, 0, 0]])]

In [44]:
torch.cat(X)

tensor([[72,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0]])

In [45]:
data = '안녕하세요 반갑습니ek edd'
token_data = mecab_field.preprocess(data)
print(token_data)
token_chr_data = chr_field.preprocess(token_data)
print(token_chr_data)
process_chr = chr_field.pad_process(token_chr_data, max_len = 3)
print(process_chr)

['안녕', '하', '세요', '반갑', '습니', 'ek', 'edd']
[['안', '녕'], ['하'], ['세', '요'], ['반', '갑'], ['습', '니'], [], []]
tensor([[ 72,   1,   0],
        [ 18,   0,   0],
        [  1,   1,   0],
        [176,   1,   0],
        [  1, 210,   0]])


In [46]:
process_chr.shape

torch.Size([5, 3])

# ds[1]

In [47]:
for _ in ds:
    print(_.src_chr)
    print(_.src)
    print(_.trg)
    break

tensor([[  1,   0,   0],
        [ 13,   0,   0],
        [  1,   0,   0],
        [ 13,   0,   0],
        [  1,   0,   0],
        [ 69,   0,   0],
        [115,   0,   0],
        [  1,   0,   0],
        [  1,   0,   0],
        [ 16,   0,   0],
        [  1,   0,   0],
        [ 13,   0,   0],
        [  1,   0,   0],
        [ 17,   0,   0],
        [  1,   0,   0],
        [  1,   0,   0],
        [  1,   0,   0],
        [ 49,   0,   0],
        [  6,   0,   0],
        [  9,   0,   0],
        [  1,   0,   0],
        [ 74,   0,   0],
        [  7,   0,   0],
        [122,   0,   0],
        [  1,   0,   0],
        [ 13,   0,   0],
        [  1,   0,   0],
        [ 24,   0,   0],
        [  1,   0,   0],
        [ 14,   0,   0],
        [  1,   0,   0],
        [ 49,   0,   0],
        [  1,   0,   0],
        [ 18,   0,   0],
        [  1,   0,   0],
        [  1,   0,   0],
        [ 25,   0,   0],
        [124,   0,   0],
        [  1,   0,   0],
        [  7,   0,   0],


In [48]:
def pad_collate(batch):
    (src, trg, src_chr) = zip(*batch)
    named_tuple = namedtuple('data', ['src', 'trg', 'src_chr'])
    src_pad = pad_sequence(src, batch_first=True, padding_value=0)
    trg_pad = pad_sequence(trg, batch_first=True, padding_value=0)
    src_chr_pad = pad_sequence(src_chr, batch_first=True, padding_value=0)
    return named_tuple(src_pad, trg_pad, src_chr_pad)

In [49]:
# def pack_pad_collate(batch):
#     (src, trg) = zip(*batch)
#     src_len = torch.Tensor([len(s) for s in src])
#     trg_len = torch.Tensor([len(t) for t in trg])
#     named_tuple = namedtuple('data', ['src', 'trg'])
#     src_pad = pad_sequence(src, batch_first=True, padding_value=0)
#     trg_pad = pad_sequence(trg, batch_first=True, padding_value=0)
#     src_pack = pack_padded_sequence(src_pad, lengths=src_len, batch_first=True, enforce_sorted=False)
#     trg_pack = pack_padded_sequence(trg_pad, lengths=trg_len, batch_first=True, enforce_sorted=False)
#     return named_tuple(src_pack, trg_pack)

In [50]:
dl = DataLoader(ds, batch_size = 16, collate_fn = pad_collate)
for _ in dl:
    print(_.src.data.shape)
    print(_.trg.data.shape)
    print(_.src_chr.data.shape)
    break

torch.Size([2, 243])
torch.Size([2, 242])
torch.Size([2, 243, 3])


In [51]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim)) 
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))

        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
        # torch.Size([2, 243, 5, 1024])
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))

        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
#         print(pooled_0.shape)
        cat = self.dropout(torch.cat((pooled_0, pooled_1), dim = -1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [52]:
VOCAB_SIZE = len(chr_field.vocab)
EMBEDDING_DIM = 1024
N_FILTERS = 1
FILTER_SIZES = (1, 2)
PAD_IDX = chr_field.vocab.stoi_dict['<PAD>']
SPECIAL_TOKENS = chr_field.vocab.special_tokens
SPECIAL_TOKENS_INDEX = chr_field.vocab.special_tokens_idx
CHR_DICT = chr_dict
OUTPUT_DIM = 1024
DROPOUT = 0.5

In [53]:
cnn = CNN(VOCAB_SIZE, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [54]:
class LSTM_LM(nn.Module):
    def __init__(self, input_dim, output_dim, hid_dim, n_layers, dropout, bidirectional):
        super().__init__()
        
        self.output_dim = output_dim
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.num_dircetions = 2 if bidirectional else 1
        
        self.lstm = nn.LSTM(input_dim, hid_dim, n_layers, dropout = dropout, bidirectional = bidirectional)
        
        self.fc_out = nn.Linear(hid_dim * self.num_dircetions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
    
    def init_hidden(self):
        # (num_layers * num_directions, batch, hidden_size)
        return torch.zeros(self.n_layers * self.num_dircetions, 1, self.hid_dim)
    
    def forward(self, input):
#         print(f'input shape : {input.shape}') # seqlen, batch, hid_dim(output_dim of cnn)
        output, (hidden, cell) = self.lstm(input)
#         print(f'output shape : {output.shape}') # ouput shape :(seq_len, batch, num_directions * hidden_size)  
        
        prediction = self.fc_out(output)
#         print(f'prediction shape {prediction.shape}') # seq len, batchsize, trg_dim
        return prediction, hidden, cell

In [55]:
OUTPUT_DIM

1024

In [56]:
INPUT_DIM = OUTPUT_DIM
PREDICT_DIM = len(mecab_field.vocab)
HID_DIM = 1024
N_LAYERS = 2
DROPOUT = 0.5
BIDIRECTIONAL = True
TRG_PAD_IDX = mecab_field.vocab.stoi_dict['<PAD>']

In [57]:
rnn = LSTM_LM(INPUT_DIM, PREDICT_DIM, HID_DIM, N_LAYERS, DROPOUT, BIDIRECTIONAL)

`<sos>`토큰이랑 `<eos>` 토큰은 어떻게 CNN처리 해야하지?  -> 일단 빼는걸로 처리

## 한글이랑 영어랑 다른점 : 영어는 3char이하인 단어가 별로 없는데 한글은 1~2개로 많이 끊겨서 conv연산 하기가 애매함 

In [58]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
import torch.optim as optim

optimizer = optim.Adam(cnn.parameters(), lr=0.0005)

In [59]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [60]:
cnn.apply(initialize_weights);
rnn.apply(initialize_weights);

In [65]:
epoch_loss = []
n_pass = 0
for epoch in range(1):
    optimizer.zero_grad()
    for src, trg, src_chr in dl:
        src_chr = src_chr.permute(1, 0, 2) # 토큰 별 캐릭터가 먼저 나오도록
        for idx, src_c in enumerate(src_chr):
            features = cnn(src_c.unsqueeze(1))
            output, hidden, cell = rnn(features.unsqueeze(1))
            try:
                loss = criterion(output.squeeze(1), trg[:, idx])
                loss.backward()
                optimizer.step()
                
            except:
                pass
    epoch_loss += [loss.item()]

In [62]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(epoch_loss)

# 해야 할 일 : 1) init hidden 2) CNN + RNN 감싸기
1) 배치별로 토큰 내에 있는 캐릭터 글자에 따라 CNN길이가 다른데 어떻게 처리하지? -> 패딩으로 처리함..근데 이게 맞는지 모르겠다