# Library

In [1]:
! pip install Korpora sentencepiece



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from Korpora import Korpora
import pandas as pd
from pprint import pprint
# from konlpy.tag import Mecab
from nltk.tokenize import word_tokenize as en_tokenizer
import sentencepiece as spm
import urllib.request
import csv
import numpy as np
from einops import rearrange, reduce



In [11]:
# Vocabulary size shared by the embedding and output layers: 32000 pieces plus 7,
# matching the `vocab_size + 7` used in the SentencePiece training cells, which
# declare [PAD], [UNK], [BOS], [EOS], [SEP], [CLS] and [MASK].
VOCAB_SIZE = 32000 + 7
# Fixed length that every encoded sentence is truncated or padded to.
SEQ_LEN = 200

# Load Data

In [4]:
# dataset = open_subtitles_dataset()
# Load the "open_subtitles" corpus through Korpora, using the current directory as root_dir.
corpus = Korpora.load("open_subtitles", root_dir='./')



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : TRAC (https://trac.edgewall.org/)
    Repository : http://opus.nlpl.eu/OpenSubtitles-v2018.php
    References :
        - P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora
          from Movie and TV Subtitles. In Proceedings of the 10th International Conference on
          Language Resources and Evaluation (LREC 2016)

    This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.

    [[ IMPORTANT ]]
    If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/
    to your website and to your reports and publications produced with the data!
    I promised this when I got the data from the providers of that website!

 

In [5]:
# Assemble the sentence pairs column-wise: one row per pair, with the
# columns named 'src' and 'trg'.
data = pd.DataFrame({'src': corpus.train.pairs, 'trg': corpus.train.texts})
# Persist the pairs so later cells can reload them without Korpora.
data.to_csv('data.txt', index=False)
data.head()

Unnamed: 0,src,trg
0,"Through the snow and sleet and hail, through t...","폭설이 내리고 우박, 진눈깨비가 퍼부어도 눈보라가 몰아쳐도 강풍이 불고 비바람이 휘..."
1,"ever faithful, ever true, nothing stops him, h...",우리의 한결같은 심부름꾼 황새 아저씨 가는 길을 그 누가 막으랴!
2,Look out for Mr Stork That persevering chap,황새 아저씨를 기다리세요
3,He'll come along and drop a bundle in your lap,찾아와 선물을 주실 거예요
4,You may be poor or rich It doesn't matter which,가난하든 부자이든 상관이 없답니다


In [6]:
# Reload the sentence pairs from the CSV file written by the previous cell.
# NOTE(review): read_csv's default NA parsing may turn empty fields into NaN —
# confirm both text columns contain only strings before joining/encoding them.
data = pd.read_csv('data.txt')
data.head()

Unnamed: 0,src,trg
0,"Through the snow and sleet and hail, through t...","폭설이 내리고 우박, 진눈깨비가 퍼부어도 눈보라가 몰아쳐도 강풍이 불고 비바람이 휘..."
1,"ever faithful, ever true, nothing stops him, h...",우리의 한결같은 심부름꾼 황새 아저씨 가는 길을 그 누가 막으랴!
2,Look out for Mr Stork That persevering chap,황새 아저씨를 기다리세요
3,He'll come along and drop a bundle in your lap,찾아와 선물을 주실 거예요
4,You may be poor or rich It doesn't matter which,가난하든 부자이든 상관이 없답니다


## Sentencepiece Train

In [7]:
# Dump each column to its own UTF-8 text file, one sentence per line.
for out_path, column in (('src.txt', 'src'), ('trg.txt', 'trg')):
    with open(out_path, mode='w', encoding='utf8') as out_file:
        out_file.write('\n'.join(data[column]))

In [9]:
# corpus = "src.txt"
# prefix = "src"
# vocab_size = 32000
# spm.SentencePieceTrainer.train(
#     f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" +
#     " --model_type=bpe" +
#     " --max_sentence_length=999999" +  # 문장 최대 길이
#     " --pad_id=0 --pad_piece=[PAD]" +  # pad (0)
#     " --unk_id=1 --unk_piece=[UNK]" +  # unknown (1)
#     " --bos_id=2 --bos_piece=[BOS]" +  # begin of sequence (2)
#     " --eos_id=3 --eos_piece=[EOS]" +  # end of sequence (3)
#     " --user_defined_symbols=[SEP],[CLS],[MASK]")  # 사용자 정의 토큰


In [10]:
# Load the source-side SentencePiece model and print, for a few sample
# strings, first the pieces and then the ids they are encoded to.
sp_src = spm.SentencePieceProcessor()
sp_src.Load('src.model')
lines = [
    "I didn't at all think of it this way.",
    "I have waited a long time for someone to film",
    "[PAD] [CLS] [BOS] [EOS] [SEP] [UNK] "
]
for line in lines:
    for encode in (sp_src.EncodeAsPieces, sp_src.EncodeAsIds):
        print(encode(line))


['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[14, 346, 31954, 31935, 178, 163, 232, 64, 58, 115, 343, 31944]
['▁I', '▁have', '▁waited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[14, 114, 5073, 10, 519, 280, 103, 731, 30, 3650]
['▁[', 'P', 'AD', ']', '▁', '[CLS]', '▁[', 'B', 'OS', ']', '▁[', 'E', 'OS', ']', '▁', '[SEP]', '▁[', 'UN', 'K', ']']
[361, 31980, 3429, 31992, 31933, 5, 361, 31974, 3377, 31992, 361, 31978, 3377, 31992, 31933, 4, 361, 2774, 31987, 31992]


In [12]:
# corpus = "trg.txt"
# prefix = "trg"
# vocab_size = 32000
# spm.SentencePieceTrainer.train(
#     f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" +
#     " --model_type=bpe" +
#     " --max_sentence_length=999999" +  # 문장 최대 길이
#     " --pad_id=0 --pad_piece=[PAD]" +  # pad (0)
#     " --unk_id=1 --unk_piece=[UNK]" +  # unknown (1)
#     " --bos_id=2 --bos_piece=[BOS]" +  # begin of sequence (2)
#     " --eos_id=3 --eos_piece=[EOS]" +  # end of sequence (3)
#     " --user_defined_symbols=[SEP],[CLS],[MASK]")  # 사용자 정의 토큰


In [13]:
# Load the target-side SentencePiece model and print, for a few sample
# strings, first the pieces and then the ids they are encoded to.
sp_trg = spm.SentencePieceProcessor()
sp_trg.Load('trg.model')
lines = [
    "알잘딱깔센 임마 그거 몰라?.",
    "하.. 존나 싫다...",
    "가족이 사람을 죽여서 면목이 없다거나 같이 살던 사람들이 살해당해서 책임을 느낀다거나 자신의 이런 저런 일로",
    "[PAD] [CLS] [BOS] [EOS] [SEP] [UNK] "
]
for line in lines:
    for encode in (sp_trg.EncodeAsPieces, sp_trg.EncodeAsIds):
        print(encode(line))


['▁알', '잘', '딱', '깔', '센', '▁임마', '▁그거', '▁몰라', '?', '.']
[28, 30693, 31173, 31412, 31227, 5378, 635, 467, 30557, 30547]
['▁하', '..', '▁존나', '▁싫다', '...']
[14, 15, 5522, 9095, 27]
['▁가족이', '▁사람을', '▁죽여서', '▁면', '목이', '▁없다', '거나', '▁같이', '▁살던', '▁사람들이', '▁살해당', '해서', '▁책임을', '▁느낀', '다거나', '▁자신의', '▁이런', '▁저런', '▁일로']
[2957, 970, 17264, 1568, 8939, 1288, 792, 364, 12102, 671, 8318, 200, 7016, 8210, 23194, 2029, 275, 2758, 3525]
['▁[', 'P', 'A', 'D', ']', '▁', '[CLS]', '▁[', 'B', 'O', 'S', ']', '▁[', 'EO', 'S', ']', '▁', '[SEP]', '▁[', 'UN', 'K', ']']
[699, 31049, 30846, 30963, 31062, 30545, 5, 699, 30955, 30947, 30856, 31062, 699, 6999, 30856, 31062, 30545, 4, 699, 17323, 31214, 31062]


## SRC Data (EN) Preprocessing

In [23]:
def en_encode(tmpstr: str) -> np.ndarray:
    """Encode a source sentence into a fixed-length array of token ids.

    The sentence is tokenized with ``sp_src``; the ids are truncated to
    ``SEQ_LEN`` entries and right-padded with ``sp_src.pad_id()`` so that the
    result always has exactly ``SEQ_LEN`` elements.

    Args:
        tmpstr: sentence to encode.

    Returns:
        1-D int64 array of length ``SEQ_LEN``.
    """
    # An explicit integer dtype keeps the result integral even when the
    # tokenizer returns no ids (np.array([]) would default to float64).
    token_ids = np.asarray(sp_src.EncodeAsIds(tmpstr), dtype=np.int64)[:SEQ_LEN]

    # Right-pad up to SEQ_LEN (a no-op when the sentence was truncated).
    pad_width = SEQ_LEN - len(token_ids)
    return np.pad(token_ids, (0, pad_width), 'constant',
                  constant_values=sp_src.pad_id())

In [33]:
# src_data refers to data['src'] itself (same object id, not a copy).
src_data = data['src']

# Encode every source sentence into a fixed-length id array.
src_list = [en_encode(sentence) for sentence in src_data]

src_list[:10]

[array([ 6823,    20,  4819,    80,  3222,    51,    80, 16146, 31952,
          649,    20, 31760,   301, 31952,   649,    20,  3867,    56,
        31952,   649,    20,  1475,    80,   649,    20,  3414, 31952,
          372,  3085, 31952,   372,  7886, 31952,   649,    20, 25790,
         6930,  6787, 31952,    80,    20, 10726, 10477,  3997, 31952,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [25]:
# Stack the per-sentence arrays into a single ndarray before handing them to
# torch: building a tensor directly from a list of ndarrays is extremely slow
# and triggers a UserWarning.
src_data = torch.tensor(np.array(src_list))
src_data.shape

  src_data = torch.tensor(src_list)


torch.Size([1269683, 200])

## TRG Data (KO) Preprocessing

In [43]:
def ko_encode(tmpstr: str) -> np.ndarray:
    """Encode a target sentence into a fixed-length array of token ids.

    The result has the layout ``[BOS] ids... [EOS] [PAD]...`` and always
    contains exactly ``SEQ_LEN`` elements. Sentences that are too long are
    truncated so that BOS and EOS still fit (``SEQ_LEN`` must be >= 2).

    Args:
        tmpstr: sentence to encode.

    Returns:
        1-D int64 array of length ``SEQ_LEN``.
    """
    # Reserve two slots for the BOS and EOS markers.
    body = list(sp_trg.EncodeAsIds(tmpstr))[:SEQ_LEN - 2]

    # An explicit integer dtype keeps the result integral for empty sentences.
    token_ids = np.array([sp_trg.bos_id(), *body, sp_trg.eos_id()],
                         dtype=np.int64)

    # Right-pad up to SEQ_LEN (a no-op when the sentence was truncated).
    return np.pad(token_ids, (0, SEQ_LEN - len(token_ids)),
                  'constant', constant_values=sp_trg.pad_id())


In [44]:
# trg_data refers to data['trg'] itself (same object id, not a copy).
trg_data = data['trg']

# Encode every target sentence into a fixed-length id array.
trg_list = [ko_encode(sentence) for sentence in trg_data]

trg_list[:10]


[array([    2,   721, 30905, 30546, 11101,    24, 30900, 30558,   130,
        30970, 31043,  2579, 28212,  1207,   490, 30600,  2024,  5312,
        29426,   548, 15377, 23537,   168, 11300, 30546,  3611, 30841,
        30551, 29426,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [45]:
# Stack the per-sentence arrays into a single ndarray before handing them to
# torch: building a tensor directly from a list of ndarrays is extremely slow.
trg_data = torch.tensor(np.array(trg_list))
trg_data.shape

torch.Size([1269683, 200])

# Transformer

## Mask Function

In [None]:
def makeMask(tensor, option: str, pad_id: int = 0) -> torch.Tensor:
    '''
    Build a boolean attention mask from a batch of token ids.

    Input
    - tensor
        shape (bs, seq_len), token ids

    Args
    - option
        If option is 'padding', function returns padding mask
        (key positions holding pad_id are masked out).
        If option is 'lookahead', function returns lookahead mask
        (padding mask combined with a causal lower-triangular mask).
    - pad_id
        id of the padding token (defaults to 0).

    Output
    - Tensor (bool)
        shape (bs, seq_len, seq_len)
        output[b, i, j] is True when query position i may attend to key
        position j and False when that position has to be masked out.

    Raises
    - ValueError
        if tensor is not 2-dimensional or option is not a known mask type.
    '''
    if tensor.dim() != 2:
        raise ValueError(
            f"expected a (bs, seq_len) tensor, got shape {tuple(tensor.shape)}")
    if option not in ('padding', 'lookahead'):
        raise ValueError(
            f"unknown mask option {option!r}; expected 'padding' or 'lookahead'")

    bs, seq_len = tensor.shape

    # Key-side padding mask, repeated for every query position.
    output = (tensor != pad_id).unsqueeze(1).expand(bs, seq_len, seq_len)

    if option == 'lookahead':
        # Query i may only look at keys j <= i.
        causal = torch.ones(seq_len, seq_len, device=tensor.device).tril().bool()
        output = output & causal

    return output


## Multihead Attention

In [13]:
class Multiheadattention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hidden_dim: model width (embedding_dim / d_model, 512 in the paper).
        num_head: number of attention heads (8 in the paper); must divide
            ``hidden_dim``.

    Raises:
        ValueError: if ``hidden_dim`` is not divisible by ``num_head``.
    """

    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        if hidden_dim % num_head != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by num_head ({num_head})")

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        # sqrt(d_k): keeps the dot products in a range where softmax has usable gradients
        self.scale = self.head_dim ** 0.5

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, srcQ, srcK, srcV, mask=None):
        """Attend from the queries to the keys/values.

        Args:
            srcQ: (bs, q_len, hidden_dim) query source.
            srcK: (bs, k_len, hidden_dim) key source.
            srcV: (bs, k_len, hidden_dim) value source.
            mask: optional mask broadcastable to (bs, num_head, q_len, k_len);
                a 3-D mask (bs, q_len, k_len) is shared across heads.
                Positions where the mask equals 0 / False are ignored.

        Returns:
            Tensor of shape (bs, q_len, hidden_dim).
        """
        ##### SCALED DOT PRODUCT ATTENTION ######

        Q = self.fcQ(srcQ)
        K = self.fcK(srcK)
        V = self.fcV(srcV)

        Q = rearrange(
            Q, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        K_T = rearrange(
            K, 'bs seq_len (num_head head_dim) -> bs num_head head_dim seq_len', num_head=self.num_head)
        V = rearrange(
            V, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)

        # attention_energy : (bs, num_head, q_len, k_len)
        attention_energy = torch.matmul(Q, K_T) / self.scale

        if mask is not None:
            if mask.dim() == 3:
                # add the head axis so one mask is broadcast over all heads
                mask = mask.unsqueeze(1)
            # masked_fill is out-of-place: the result has to be assigned back
            attention_energy = attention_energy.masked_fill(mask == 0, -1e10)

        attention_score = torch.softmax(attention_energy, dim=-1)

        result = torch.matmul(attention_score, V)
        # result : (bs, num_head, q_len, head_dim)

        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT
        result = rearrange(result, 'bs num_head seq_len head_dim -> bs seq_len (num_head head_dim)')
        # result : (bs, q_len, hidden_dim)

        # LINEAR
        return self.fcOut(result)
        


In [None]:
# # TEST CODE #
# bs = 32
# seq_len = 200
# hidden_dim = 128
# test_tensor = torch.randn((bs,seq_len,hidden_dim))
# print(test_tensor.shape)
# test_layer = Multiheadattention(hidden_dim=hidden_dim, num_head =2)
# print(test_layer(srcQ = test_tensor, srcK = test_tensor, srcV = test_tensor).shape)

## Positionwise Feedforward Network

In [None]:
class FFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    Args:
        hidden_dim: input and output width (512 in the paper).
        inner_dim: width of the hidden layer (2048 in the paper).
    """

    def __init__ (self, hidden_dim, inner_dim):
        super().__init__()

        # 512 in paper
        self.hidden_dim = hidden_dim
        # 2048 in paper
        self.inner_dim = inner_dim

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU()

    def forward(self, input):
        """Apply the network to the last dimension of ``input``.

        Args:
            input: tensor whose last dimension has size ``hidden_dim``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        output = self.fc1(input)
        # Apply the activation module; nn.ReLU(output) would only construct a new module.
        output = self.relu(output)
        output = self.fc2(output)

        return output


## Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers, each
    followed by a residual connection and layer normalization.

    Args:
        hidden_dim: model width.
        num_head: number of attention heads.
        inner_dim: hidden width of the feed-forward network.
    """

    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        # nn.LayerNorm requires the normalized shape; each sub-layer gets its
        # own normalization so the learned scale/shift are not shared.
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)

    def forward(self, input, mask = None):
        """Run the block.

        Args:
            input: (bs, seq_len, hidden_dim) tensor.
            mask: optional padding mask handed to the attention layer.

        Returns:
            Tensor of shape (bs, seq_len, hidden_dim).
        """
        # encoder self-attention (uses only the padding mask)
        attended = self.multiheadattention(srcQ=input, srcK=input, srcV=input, mask=mask)
        output = self.layerNorm1(input + attended)

        # position-wise feed-forward network
        transformed = self.ffn(output)
        output = self.layerNorm2(output + transformed)

        # output : (bs, seq_len, hidden_dim)
        return output



## Encoder Architecture

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token embedding + sinusoidal positional encoding
    followed by a stack of N encoder layers.

    Args:
        N: number of stacked encoder layers.
        hidden_dim: model width.
        num_head: number of attention heads.
        inner_dim: hidden width of the feed-forward networks.
    """

    def __init__ (self, N, hidden_dim, num_head, inner_dim):
        super().__init__()

        # N : number of times the encoder layer is repeated
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)

        # nn.ModuleList registers the layers as sub-modules, so their parameters
        # show up in .parameters()/.state_dict() and follow .to(device).
        self.enc_layers = nn.ModuleList(
            [EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

    @staticmethod
    def _positional_encoding(seq_len, hidden_dim, device):
        """Return the (seq_len, hidden_dim) sinusoidal position table."""
        position = torch.arange(seq_len, dtype=torch.float, device=device).unsqueeze(1)
        even_dims = torch.arange(0, hidden_dim, 2, dtype=torch.float, device=device)
        angle = position / torch.pow(10000.0, even_dims / hidden_dim)

        encoding = torch.zeros(seq_len, hidden_dim, device=device)
        encoding[:, 0::2] = torch.sin(angle)
        # for an odd hidden_dim there is one cosine column fewer than sine columns
        encoding[:, 1::2] = torch.cos(angle)[:, :hidden_dim // 2]
        return encoding

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: (bs, seq_len) tensor of token ids.

        Returns:
            Tensor of shape (bs, seq_len, hidden_dim).
        """
        mask = makeMask(input, option='padding')

        # embedding layer
        output = self.embedding(input)
        # output : (bs, seq_len, hidden_dim)

        # Positional encoding is added to the token embedding, not substituted for it.
        output = output + self._positional_encoding(
            input.size(1), self.hidden_dim, output.device)

        # N encoder layers
        for layer in self.enc_layers:
            output = layer(output, mask)

        # output : (bs, seq_len, hidden_dim)
        return output


## Decoder Layer

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention and
    a feed-forward network, each followed by a residual connection and layer
    normalization.

    Args:
        hidden_dim: model width.
        num_head: number of attention heads.
        inner_dim: hidden width of the feed-forward network.
    """

    def __init__(self, hidden_dim, num_head, inner_dim):
        # Must run before any sub-module is assigned to self.
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.multiheadattention1 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.multiheadattention2 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm3 = nn.LayerNorm(hidden_dim)

    def forward(self, input, enc_output, paddingMask, lookaheadMask):
        """Run the block.

        Args:
            input: (bs, seq_len, hidden_dim) decoder-side tensor.
            enc_output: encoder output used as keys/values of the second attention.
            paddingMask: mask for the encoder-decoder attention.
            lookaheadMask: mask for the decoder self-attention.

        Returns:
            Tensor of shape (bs, seq_len, hidden_dim).
        """
        # Residual sums are written out-of-place: an in-place `+=` would
        # overwrite tensors that autograd has saved for the backward pass.

        # first multiheadattention (masked self-attention)
        attended = self.multiheadattention1(input, input, input, lookaheadMask)
        output = self.layerNorm1(input + attended)

        # second multiheadattention (queries from the decoder, keys/values from the encoder)
        attended = self.multiheadattention2(output, enc_output, enc_output, paddingMask)
        output = self.layerNorm2(output + attended)

        # Feedforward Network
        transformed = self.ffn(output)
        output = self.layerNorm3(output + transformed)

        return output


## Decoder Architecture

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: token embedding + sinusoidal positional encoding,
    a stack of N decoder layers and a projection onto the vocabulary.

    Args:
        N: number of stacked decoder layers.
        hidden_dim: model width.
        num_head: number of attention heads.
        inner_dim: hidden width of the feed-forward networks.
    """

    def __init__ (self, N, hidden_dim, num_head, inner_dim):
        super().__init__()

        # N : number of times the decoder layer is repeated
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)

        # nn.ModuleList registers the layers as sub-modules, so their parameters
        # show up in .parameters()/.state_dict() and follow .to(device).
        self.dec_layers = nn.ModuleList(
            [DecoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.finalFc = nn.Linear(hidden_dim, VOCAB_SIZE)

    @staticmethod
    def _positional_encoding(seq_len, hidden_dim, device):
        """Return the (seq_len, hidden_dim) sinusoidal position table."""
        position = torch.arange(seq_len, dtype=torch.float, device=device).unsqueeze(1)
        even_dims = torch.arange(0, hidden_dim, 2, dtype=torch.float, device=device)
        angle = position / torch.pow(10000.0, even_dims / hidden_dim)

        encoding = torch.zeros(seq_len, hidden_dim, device=device)
        encoding[:, 0::2] = torch.sin(angle)
        # for an odd hidden_dim there is one cosine column fewer than sine columns
        encoding[:, 1::2] = torch.cos(angle)[:, :hidden_dim // 2]
        return encoding

    def forward(self, input, enc_src, enc_output):
        """Decode a batch of target token ids against the encoder output.

        Args:
            input: (bs, seq_len) target token ids.
            enc_src: (bs, seq_len) source token ids (used for the padding mask).
            enc_output: (bs, seq_len, hidden_dim) encoder output.

        Returns:
            Tensor of shape (bs, seq_len, VOCAB_SIZE) holding a probability
            distribution over the vocabulary for every position.
        """
        lookaheadMask = makeMask(input, option='lookahead')
        # Keep a single row of the source padding mask: it depends only on the
        # key (source) position, so (bs, 1, src_len) broadcasts over any number
        # of decoder positions.
        paddingMask = makeMask(enc_src, option='padding')[:, :1, :]

        # embedding layer
        output = self.embedding(input)

        # Positional encoding is added to the token embedding, not substituted for it.
        output = output + self._positional_encoding(
            input.size(1), self.hidden_dim, output.device)

        # N decoder layers
        for dec_layer in self.dec_layers:
            output = dec_layer(output, enc_output, paddingMask, lookaheadMask)

        # output : (bs, seq_len, hidden_dim)

        output = self.finalFc(output)
        # NOTE(review): probabilities are returned here; a loss that applies its own
        # softmax (e.g. nn.CrossEntropyLoss) expects the raw scores instead — confirm
        # when the training loop is written.
        output = torch.softmax(output, dim = -1)
        # output : (bs, seq_len, VOCAB_SIZE)

        return output


## Transformer Model

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer.

    Args:
        N: number of encoder layers and of decoder layers.
        hidden_dim: model width.
        num_head: number of attention heads.
        inner_dim: hidden width of the feed-forward networks.
    """

    def __init__(self, N = 6, hidden_dim = 512, num_head = 8, inner_dim = 2048):
        # Must run before any sub-module is assigned to self.
        super().__init__()

        self.encoder = Encoder(N, hidden_dim, num_head, inner_dim)
        self.decoder = Decoder(N, hidden_dim, num_head, inner_dim)

    def forward(self, enc_src, dec_src):
        """Run the encoder on ``enc_src`` and the decoder on ``dec_src``.

        Args:
            enc_src: (bs, seq_len) source token ids.
            dec_src: (bs, seq_len) target token ids fed to the decoder.

        Returns:
            The decoder output for ``dec_src``.
        """
        enc_output = self.encoder(enc_src)
        return self.decoder(dec_src, enc_src, enc_output)


# Model Train

# Inference