# Env

In [None]:
import os
import argparse
import collections
from datetime import datetime
import re
import json
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.auto import tqdm
from transformers import (
    T5TokenizerFast,
    AutoTokenizer
)
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Turn autograd gradient tracking on globally.
# NOTE(review): this comment previously read "Gradient False", which contradicts
# the True argument below — confirm whether gradients were meant to be disabled.
torch.set_grad_enabled(True)
# Repository root; the %cd cells below change into sub-directories of this path.
work_dir = '/home/ubuntu/nlp-practice'

In [None]:
%cd {work_dir}
!pwd

# 3.0 NMT Preprocessing & Tokenizer

In [None]:
%cd {work_dir}/src/nmt
!pwd

In [None]:
# Collect the .xlsx files under data/aihub_koen.
# os.listdir returns entries in arbitrary order, so the paths are sorted to keep
# positional access (e.g. fn_list[1] in the next cell) reproducible across runs.
data_dir = '../../data/aihub_koen'
fn_list = sorted(
    f'{data_dir}/{fn}' for fn in os.listdir(data_dir) if fn.endswith('.xlsx')
)
fn_list

In [None]:
pd.read_excel(fn_list[1])

In [None]:
!sh ./preprocess.sh

In [None]:
!sh ./tokenizer_train.sh

# 3.1 Language Model

In [None]:
%cd {work_dir}/src/lm
!pwd

## tutorial language model

### input & embedding

In [None]:
input_text = [
    "<s>나는 학생입니다.",
    "<s>나는 학교에 가는 것을 좋아합니다."
]
label_text = [
    "나는 학생입니다.</s>",
    "나는 학교에 가는 것을 좋아합니다.</s>"
]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained('../../data/aihub_koen_32k')

In [None]:
inputs = tokenizer(input_text,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt")
inputs

In [None]:
labels = tokenizer(label_text,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt")
labels

### model

In [None]:
n_layers = 2
embedding_dim = 3
hidden_dim = 4
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

In [None]:
# Token-id -> vector lookup table; pad_idx marks the padding row.
embedding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=pad_idx,
)

# Stacked unidirectional LSTM (bidirectional=False for a language model).
# batch_first=True: inputs are (batch_size, seq_len, input_size);
# with False they would be (seq_len, batch_size, input_size).
lstm = nn.LSTM(
    input_size=embedding_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
    bidirectional=False,
)

# The output layer is vocab_size wide since we are predicting vocabulary tokens.
fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

### forward

In [None]:
embed = embedding(inputs['input_ids'])
 # |embed| = (batch_size, seq_len_enc, embedding_dim)
embed

In [None]:
output, (hidden_l, cell_l) = lstm(embed)
# |output| = (batch_size, seq_len, hidden_dim)
# |hidden_l| = (n_layers, batch_size, hidden_dim)
# |cell_l| = (n_layers, batch_size, hidden_dim)
output, hidden_l, cell_l

### linear & softmax

In [None]:
hidden = output

In [None]:
logits = fc(hidden)
# |logits| = (batch_size, seq_len_dec, vocab_size)
logits.shape

In [None]:
prob = F.softmax(logits, dim=-1)
prob.shape

### loss

In [None]:
# Labels are padded to a common length; skip those pad positions so they do not
# contribute to the averaged loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
labels_id = labels['input_ids']
labels_id

In [None]:
loss = criterion(logits.view(-1, logits.size(-1)), labels_id.view(-1,))
loss

## train rnn lm

In [None]:
# run src/lm/train_rnn.sh
!sh train_rnn.sh "cchyun-rnn-lm"

## generate rnn lm

In [None]:
# run src/lm/generate_rnn.sh
!sh generate_rnn.sh "../../checkpoints/cchyun-rnn-lm-20240416-072303.pt" "지미카터"

# 3.2 Seq to Seq

In [None]:
%cd {work_dir}/src/nmt
!pwd

## tutorial seq2seq

### input & embedding

In [None]:
ko = [
    "나는 학생입니다.",
    "나는 학교에 가는 것을 좋아합니다."
]
en = [
    "<s>I am a student.",
    "<s>I love to go to school."
]
label_text = [
    "I am a student.</s>",
    "I love to go to school.</s>"
]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained('../../data/aihub_koen_32k')

In [None]:
enc_inputs = tokenizer(ko,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
enc_inputs

In [None]:
dec_inputs = tokenizer(en,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
dec_inputs

In [None]:
labels = tokenizer(label_text,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt")
labels

### model

In [None]:
n_layers = 2
hidden_dim = 4
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

In [None]:
# One embedding table used for both encoder and decoder inputs; its width is hidden_dim.
embedding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=hidden_dim,
    padding_idx=pad_idx,
)

# Bidirectional encoder: each direction gets hidden_dim // 2 units so the
# concatenated forward/backward output is hidden_dim wide.
# batch_first=True: inputs are (batch_size, seq_len, input_size);
# with False they would be (seq_len, batch_size, input_size).
encoder = nn.LSTM(
    input_size=hidden_dim,
    hidden_size=hidden_dim // 2,
    num_layers=n_layers,
    batch_first=True,
    dropout=0.1,
    bidirectional=True,
)

# Unidirectional decoder (it acts as a language model), full hidden_dim units.
decoder = nn.LSTM(
    input_size=hidden_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
    dropout=0.1,
    bidirectional=False,
)

# The output layer is vocab_size wide since we are predicting vocabulary tokens.
fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

### encoder

In [None]:
enc_embed = embedding(enc_inputs['input_ids'])
 # |enc_embed| = (batch_size, seq_len_enc, embedding_dim)
enc_embed

In [None]:
enc_out, (hidden_e, cell_e) = encoder(enc_embed)
# |enc_out| = (batch_size, seq_len_enc, hidden_dim)
# |hidden| = (n_layers * 2, batch_size, hidden_dim // 2)
# |cell| = (n_layers * 2, batch_size, hidden_dim // 2)
enc_out, hidden_e, cell_e

In [None]:
# Join the even-indexed rows with the odd-indexed rows of each encoder state on
# the feature axis, halving the first dimension and doubling the last one.
hidden, cell = (
    torch.cat([state[0::2], state[1::2]], dim=-1)
    for state in (hidden_e, cell_e)
)
# |hidden| = (n_layers, batch_size, hidden_dim)
# |cell| = (n_layers, batch_size, hidden_dim)
hidden, cell

### decoder

In [None]:
dec_embed = embedding(dec_inputs['input_ids'])
dec_embed

In [None]:
# Run the decoder over the embedded decoder inputs, starting from the merged encoder state.
dec_out, (hidden, cell) = decoder(dec_embed, (hidden, cell))
# |dec_out| = (batch_size, seq_len_dec, hidden_dim)
# |hidden| = (n_layers, batch_size, hidden_dim)
# |cell| = (n_layers, batch_size, hidden_dim)
# Show the decoder's own final states (previously the encoder's hidden_e / cell_e were displayed).
dec_out, hidden, cell

### linear & softmax

In [None]:
hidden = dec_out

In [None]:
logits = fc(hidden)
# |logits| = (batch_size, seq_len_dec, vocab_size)
logits.shape

In [None]:
prob = F.softmax(logits, dim=-1)
prob.shape

### loss

In [None]:
# Labels are padded to a common length; skip those pad positions so they do not
# contribute to the averaged loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
labels_id = labels['input_ids']
labels_id

In [None]:
loss = criterion(logits.view(-1, logits.size(-1)), labels_id.view(-1,))
loss

## train seq2seq

In [None]:
# run src/nmt/train_seq2seq.sh
!sh ./train_seq2seq.sh "cchyun-rnn-nmt"

## translate seq2seq

In [None]:
# run src/nmt/translate_seq2seq.sh
!sh ./translate_seq2seq.sh "../../checkpoints/cchyun-rnn-nmt-20240322-022551.pt"

## infer seq2seq

In [None]:
from seq2seq import Seq2SeqTranslator

In [None]:
device = torch.device("cpu")

model_fn = "../../checkpoints/cchyun-rnn-nmt-20240322-022551.pt"

# The checkpoint stores more than tensors: "config" is read via attribute access
# below, so it is presumably a pickled argparse.Namespace-like object. PyTorch 2.6+
# defaults to weights_only=True, which rejects such objects, so full unpickling is
# requested explicitly. This executes pickle code: only load checkpoints you trust.
data = torch.load(model_fn, map_location=device, weights_only=False)
train_config = data["config"]
label2idx = data["label2idx"]
idx2label = data["idx2label"]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained(train_config.tokenizer)
tokenizer.bos_token = "<s>"

In [None]:
model = Seq2SeqTranslator(
    vocab_size=tokenizer.vocab_size,
    hidden_dim=train_config.hidden_dim,
    n_layers=train_config.n_layers,
    dropout=train_config.dropout,
    pad_idx=tokenizer.pad_token_id,
)
model.load_state_dict(data["model"])
model.eval()
model.to(device)

In [None]:
# Interactive translation loop; an empty line ends the session.
while True:
    line = input("input> ")
    if not line:
        break

    x = tokenizer(
        line,
        truncation=True,
        max_length=train_config.max_length,
        return_tensors="np",
    )["input_ids"]

    # Autograd is enabled globally at the top of this notebook; disable it while
    # decoding so no graph is built in case generate() does not do so itself.
    with torch.no_grad():
        output_ids = model.generate(
            list(x[0]), 50, tokenizer.bos_token_id, tokenizer.eos_token_id
        )
    result = tokenizer.decode(output_ids)

    print(f"- ko: {line}\n- en: {result}\n")

# 3.3. Seq2Seq + Attention

In [None]:
%cd {work_dir}/src/nmt
!pwd

## tutorial attention

### input & labels

In [None]:
ko = [
    "나는 학생입니다.",
    "나는 학교에 가는 것을 좋아합니다."
]
en = [
    "<s>I am a student.",
    "<s>I love to go to school."
]
label_text = [
    "I am a student.</s>",
    "I love to go to school.</s>"
]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained('../../data/aihub_koen_32k')

In [None]:
enc_inputs = tokenizer(ko,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
enc_inputs

In [None]:
dec_inputs = tokenizer(en,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
dec_inputs

In [None]:
labels = tokenizer(label_text,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt")
labels

### model

In [None]:
n_layers = 2
hidden_dim = 4
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

In [None]:
# One embedding table used for both encoder and decoder inputs; its width is hidden_dim.
embedding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=hidden_dim,
    padding_idx=pad_idx,
)

# Bidirectional encoder: each direction gets hidden_dim // 2 units so the
# concatenated forward/backward output is hidden_dim wide.
# batch_first=True: inputs are (batch_size, seq_len, input_size);
# with False they would be (seq_len, batch_size, input_size).
encoder = nn.LSTM(
    input_size=hidden_dim,
    hidden_size=hidden_dim // 2,
    num_layers=n_layers,
    batch_first=True,
    dropout=0.1,
    bidirectional=True,
)

# Unidirectional decoder (it acts as a language model), full hidden_dim units.
decoder = nn.LSTM(
    input_size=hidden_dim,
    hidden_size=hidden_dim,
    num_layers=n_layers,
    batch_first=True,
    dropout=0.1,
    bidirectional=False,
)

# attn_w projects the decoder output into the query used for scoring;
# concat_w maps the [query ; context] concatenation back to hidden_dim.
attn_w = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
concat_w = nn.Linear(in_features=hidden_dim * 2, out_features=hidden_dim)

# The output layer is vocab_size wide since we are predicting vocabulary tokens.
fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

### encoder

In [None]:
enc_embed = embedding(enc_inputs['input_ids'])
 # |enc_embed| = (batch_size, seq_len_enc, embedding_dim)
enc_embed

In [None]:
enc_out, (hidden_e, cell_e) = encoder(enc_embed)
# |enc_out| = (batch_size, seq_len_enc, hidden_dim)
# |hidden| = (n_layers * 2, batch_size, hidden_dim // 2)
# |cell| = (n_layers * 2, batch_size, hidden_dim // 2)
enc_out, hidden_e, cell_e

In [None]:
# Join the even-indexed rows with the odd-indexed rows of each encoder state on
# the feature axis, halving the first dimension and doubling the last one.
hidden, cell = (
    torch.cat([state[0::2], state[1::2]], dim=-1)
    for state in (hidden_e, cell_e)
)
# |hidden| = (n_layers, batch_size, hidden_dim)
# |cell| = (n_layers, batch_size, hidden_dim)
hidden, cell

### decoder

In [None]:
dec_embed = embedding(dec_inputs['input_ids'])
dec_embed

In [None]:
# Run the decoder over the embedded decoder inputs, starting from the merged encoder state.
dec_out, (hidden, cell) = decoder(dec_embed, (hidden, cell))
# |dec_out| = (batch_size, seq_len_dec, hidden_dim)
# |hidden| = (n_layers, batch_size, hidden_dim)
# |cell| = (n_layers, batch_size, hidden_dim)
# Show the decoder's own final states (previously the encoder's hidden_e / cell_e were displayed).
dec_out, hidden, cell

### attention

In [None]:
Q = dec_out
K = enc_out
V = enc_out
attention_mask = enc_inputs['attention_mask']
# |Q| = (batch_size, Q_len, hidden_dim)
# |K| = (batch_size, K_len, hidden_dim)
# |V| = (batch_size, K_len, hidden_dim)
# |attention_mask| = (batch_size, K_len)

In [None]:
Q = attn_w(Q)
# |Q| = (batch_size, Q_len, hidden_dim)
Q

In [None]:
attn_score = torch.matmul(Q, K.transpose(-2, -1).contiguous())
# |attn_score| = (batch_size, Q_len, K_len)
attn_score

In [None]:
attention_mask = attention_mask.unsqueeze(1)
# |attention_mask| = (batch_size, 1, K_len)
attention_mask

In [None]:
attn_score -= (1 - attention_mask) * 1e9
# |attn_score| = (batch_size, Q_len, K_len)
attn_score

In [None]:
attn_prob = F.softmax(attn_score, dim=-1)
# |attn_prob| = (batch_size, Q_len, K_len)
attn_prob

In [None]:
attn_out = torch.matmul(attn_prob, V)
# |attn_out| = (batch_size, Q_len, hidden_dim)
attn_out

In [None]:
# Join the query with its attention context on the feature axis, project the
# result back to hidden_dim and squash it with tanh.
# NOTE(review): Q at this point is the attn_w-projected query (it was reassigned
# in an earlier cell), not the raw dec_out — confirm this matches seq2seq_attn.
combined = torch.cat((Q, attn_out), dim=-1)
# |combined| = (batch_size, Q_len, hidden_dim * 2)
hidden = F.tanh(concat_w(combined))
# |hidden| = (batch_size, Q_len, hidden_dim)

### linear & softmax

In [None]:
logits = fc(hidden)
# |logits| = (batch_size, seq_len_dec, vocab_size)
logits.shape

In [None]:
prob = F.softmax(logits, dim=-1)
prob.shape

### loss

In [None]:
# Labels are padded to a common length; skip those pad positions so they do not
# contribute to the averaged loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
labels_id = labels['input_ids']
labels_id

In [None]:
loss = criterion(logits.view(-1, logits.size(-1)), labels_id.view(-1,))
loss

## train seq2seq attention

In [None]:
# run src/nmt/train_seq2seq_attention.sh
!sh train_seq2seq_attention.sh "cchyun-attn-nmt"

## translate seq2seq attention

In [None]:
# run src/nmt/translate_seq2seq_attention.sh
!sh translate_seq2seq_attention.sh "../../checkpoints/cchyun-attn-nmt-20240328-050402.pt"

## infer seq2seq attention

In [None]:
from seq2seq_attn import Seq2SeqAttention

In [None]:
device = torch.device("cpu")

model_fn = "../../checkpoints/cchyun-attn-nmt-20240328-050402.pt"

# The checkpoint stores more than tensors: "config" is read via attribute access
# below, so it is presumably a pickled argparse.Namespace-like object. PyTorch 2.6+
# defaults to weights_only=True, which rejects such objects, so full unpickling is
# requested explicitly. This executes pickle code: only load checkpoints you trust.
data = torch.load(model_fn, map_location=device, weights_only=False)
train_config = data["config"]
label2idx = data["label2idx"]
idx2label = data["idx2label"]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained(train_config.tokenizer)
tokenizer.bos_token = "<s>"

In [None]:
model = Seq2SeqAttention(
    vocab_size=tokenizer.vocab_size,
    hidden_dim=train_config.hidden_dim,
    n_layers=train_config.n_layers,
    dropout=train_config.dropout,
    pad_idx=tokenizer.pad_token_id,
)
model.load_state_dict(data["model"])
model.eval()
model.to(device)

In [None]:
# Interactive translation loop; an empty line ends the session.
while True:
    line = input("input> ")
    if not line:
        break

    x = tokenizer(
        line,
        truncation=True,
        max_length=train_config.max_length,
        return_tensors="np",
    )["input_ids"]

    # Autograd is enabled globally at the top of this notebook; disable it while
    # decoding so no graph is built in case generate() does not do so itself.
    with torch.no_grad():
        output_ids = model.generate(
            list(x[0]), 50, tokenizer.bos_token_id, tokenizer.eos_token_id
        )
    result = tokenizer.decode(output_ids)

    print(f"- ko: {line}\n- en: {result}\n")

# 3.4. Transformer

In [None]:
%cd {work_dir}/src/transformer
!pwd

## tutorial transformer

### input & embedding

In [None]:
# Korean source sentences for the encoder.
ko = [
    "나는 학생입니다.",
    "나는 학교에 가는 것을 좋아합니다."
]
# Decoder inputs: English targets prefixed with the <s> start marker.
en = [
    "<s>I am a student.",
    "<s>I love to go to school."
]
# Expected outputs: English targets terminated with the </s> end marker
# (previously misspelled "<s/>", unlike the other sections of this notebook).
label = [
    "I am a student.</s>",
    "I love to go to school.</s>"
]

In [None]:
tokenizer = T5TokenizerFast.from_pretrained('../../data/aihub_koen_32k')

In [None]:
enc_inputs = tokenizer(ko,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
enc_inputs

In [None]:
dec_inputs = tokenizer(en,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
dec_inputs

In [None]:
enc_mask = enc_inputs['attention_mask'].unsqueeze(1)
enc_mask

In [None]:
# Look-ahead mask for decoder self-attention: entry (i, j) is 1 when j <= i,
# i.e. a position may attend to itself and to earlier positions only.
# NOTE(review): built from the sequence length alone, so padded decoder
# positions are not masked out here.
dec_len = dec_inputs['input_ids'].size(1)
dec_mask = torch.ones(dec_len, dec_len).tril().unsqueeze(0)
# |dec_mask| = (1, dec_len, dec_len)
dec_mask

In [None]:
hidden_dim = 4
vocab_size = tokenizer.vocab_size
pad_idx = tokenizer.pad_token_id

In [None]:
embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=pad_idx)
enc_hidden = embedding(enc_inputs['input_ids'])
dec_hidden = embedding(dec_inputs['input_ids'])
enc_hidden, dec_hidden

### scaled dot-product attention

In [None]:
def scale_dot_product_attention(Q, K, V, attention_mask):
    """Compute softmax(Q K^T / sqrt(d_k)) V with masked key positions suppressed.

    Args:
        Q: queries, (..., Q_len, d_k).
        K: keys, (..., K_len, d_k).
        V: values, (..., K_len, d_v).
        attention_mask: 1 for positions that may be attended to and 0 for
            positions to hide; must broadcast against (..., Q_len, K_len).

    Returns:
        The attention-weighted values, (..., Q_len, d_v).

    Side effects:
        Prints the attention probabilities so they can be inspected in the notebook.
    """
    # sqrt(d_k) as a 0-dim tensor matching K's dtype and device
    scale = torch.tensor(K.shape[-1], dtype=K.dtype, device=K.device).sqrt()

    scores = torch.matmul(Q, K.transpose(-2, -1).contiguous()).div(scale)
    # |scores| = (..., Q_len, K_len)
    # push masked positions towards -inf so softmax gives them ~0 probability
    scores.sub_((1 - attention_mask) * 1e9)

    attn_prob = F.softmax(scores, dim=-1)
    print(attn_prob)
    # |attn_prob| = (..., Q_len, K_len)
    return torch.matmul(attn_prob, V)

In [None]:
# encoder self attention
Q = enc_hidden
K = enc_hidden
V = enc_hidden
attention_mask = enc_mask

scale_dot_product_attention(Q, K, V, attention_mask)

In [None]:
# decoder self attention
Q = dec_hidden
K = dec_hidden
V = dec_hidden
attention_mask = dec_mask

scale_dot_product_attention(Q, K, V, attention_mask)

In [None]:
# cross attention
Q = dec_hidden
K = enc_hidden
V = enc_hidden
attention_mask = enc_mask

scale_dot_product_attention(Q, K, V, attention_mask)

### multi head attention

In [None]:
n_head = 2
d_head = hidden_dim // n_head

In [None]:
W_Q = torch.nn.Linear(hidden_dim, n_head * d_head)
W_K = torch.nn.Linear(hidden_dim, n_head * d_head)
W_V = torch.nn.Linear(hidden_dim, n_head * d_head)
W_O = torch.nn.Linear(n_head * d_head, hidden_dim)

In [None]:
def multi_head_product_attention(Q, K, V, attention_mask):
    """Multi-head attention built on scale_dot_product_attention.

    Uses the notebook-level projections W_Q, W_K, W_V, W_O and the head
    settings n_head / d_head.

    Args:
        Q: (batch_size, Q_len, hidden_dim)
        K: (batch_size, K_len, hidden_dim)
        V: (batch_size, K_len, hidden_dim)
        attention_mask: (batch_size, 1 or Q_len, K_len)

    Returns:
        Tensor of shape (batch_size, Q_len, hidden_dim).
    """

    def split_heads(x, proj):
        # (batch_size, len, hidden_dim) -> (batch_size, n_head, len, d_head)
        return proj(x).view(-1, x.size(1), n_head, d_head).transpose(1, 2).contiguous()

    Q_m = split_heads(Q, W_Q)
    K_m = split_heads(K, W_K)
    V_m = split_heads(V, W_V)

    # add a head axis so the same mask broadcasts over every head
    # |mask_m| = (batch_size, 1, 1 or Q_len, K_len)
    mask_m = attention_mask.unsqueeze(1)

    heads = scale_dot_product_attention(Q_m, K_m, V_m, mask_m)
    # |heads| = (batch_size, n_head, Q_len, d_head)

    # put the heads back side by side on the feature axis
    merged = heads.transpose(1, 2).contiguous().view(-1, Q.size(1), n_head * d_head)
    # |merged| = (batch_size, Q_len, n_head * d_head)

    # final output projection: (batch_size, Q_len, hidden_dim)
    return W_O(merged)

In [None]:
# encoder self attention
Q = enc_hidden
K = enc_hidden
V = enc_hidden
attention_mask = enc_mask

multi_head_product_attention(Q, K, V, attention_mask)

In [None]:
# decoder self attention
Q = dec_hidden
K = dec_hidden
V = dec_hidden
attention_mask = dec_mask

multi_head_product_attention(Q, K, V, attention_mask)

In [None]:
# cross attention: decoder states are the queries, encoder states the keys/values
Q = dec_hidden
K = enc_hidden
V = enc_hidden
attention_mask = enc_mask

# This cell belongs to the multi-head section, so call the multi-head function
# like the two cells above it (the single-head function was called here by mistake).
multi_head_product_attention(Q, K, V, attention_mask)

## train t5 nmt

In [None]:
# run src/transformer/train_transformer.sh
!sh train_transformer.sh "cchyun-t5-nmt"

## translate t5 nmt

In [None]:
# src/transformer/translate_transformer.sh
!sh translate_transformer.sh "../../checkpoints/cchyun-t5-nmt-20240322-114508/checkpoint-56330"

## infer t5 nmt

In [None]:
from transformers import (
    T5TokenizerFast,
    GenerationConfig,
    T5ForConditionalGeneration,
)

In [None]:
device = (
    torch.device("cpu")
)

model_fn = "../../checkpoints/cchyun-t5-nmt-20240322-114508/checkpoint-56330"

tokenizer = T5TokenizerFast.from_pretrained(model_fn)

model = T5ForConditionalGeneration.from_pretrained(model_fn)
model = model.to(device)
model.eval()

In [None]:
# Decoding settings passed to model.generate in the loop below:
# 8-beam search without sampling, up to 512 new tokens.
# NOTE(review): bos_token_id and decoder_start_token_id both come from
# tokenizer.bos_token_id. The other sections set tokenizer.bos_token = "<s>" by
# hand after loading, so confirm the tokenizer saved in this checkpoint actually
# defines a BOS token — otherwise both values would be None.
generation_config = GenerationConfig(
        max_new_tokens=512,
        early_stopping=True,
        do_sample=False,
        num_beams=8,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        decoder_start_token_id=tokenizer.bos_token_id,
        repetition_penalty=1.2,
        length_penalty=1.0,
    )

In [None]:
# Interactive translation loop for the T5 model; an empty line ends the session.
while True:
    print("input> ", end="")
    line = input()
    if not line:
        break

    # tokenize the source sentence and move the ids to the model's device
    encoded = tokenizer(line, truncation=True, max_length=512, return_tensors="pt")
    x = encoded["input_ids"].to(device)

    beam_output = model.generate(input_ids=x, generation_config=generation_config)
    # decode the first returned sequence, dropping special tokens
    result = tokenizer.decode(beam_output[0], skip_special_tokens=True)

    print(f"- ko: {line}\n- en: {result}\n")