# Env

In [None]:
import os
import argparse
import collections
from datetime import datetime
import re
import json
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.auto import tqdm
from transformers import (
    T5TokenizerFast,
    AutoTokenizer
)
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Global autograd switch.
# NOTE(review): this comment used to read "Gradient False", but the call below
# passes True (gradient tracking stays enabled). Confirm which one is intended
# for this notebook.
torch.set_grad_enabled(True)
# Work dir: root of the project checkout; the `%cd {work_dir}` cells resolve
# their paths against it.
work_dir = '/Users/cchyun/Workspace/nlp_ws/nlp-practice'

In [None]:
%cd {work_dir}
!pwd

# 3.0 NMT Preprocessing & Tokenizer

In [None]:
%cd {work_dir}/src/nmt
!pwd

In [None]:
# Collect the path of every .xlsx workbook in the aihub_koen data directory.
# os.listdir() returns entries in arbitrary order, so the result is sorted to
# make positional access (e.g. fn_list[1]) reproducible across runs/machines.
data_dir = '../../data/aihub_koen'
fn_list = sorted(
    f'{data_dir}/{fn}' for fn in os.listdir(data_dir) if fn.endswith('.xlsx')
)
# Bare expression so the notebook displays the collected paths.
fn_list

In [None]:
# Preview the second workbook in fn_list as a DataFrame (shown as cell output).
# Raises IndexError if fn_list holds fewer than two paths.
pd.read_excel(fn_list[1])

In [None]:
# run src/nmt/preprocess.sh

In [None]:
# run src/nmt/tokenizer_train.sh

# 3.1 Language Model

In [None]:
%cd {work_dir}/src/lm
!pwd

## train rnn lm

In [None]:
# run src/lm/train_rnn.sh
!sh train_rnn.sh "cchyun-rnn-lm"

## generate rnn lm

In [None]:
# run src/lm/generate_rnn.sh
!sh generate_rnn.sh "../../checkpoints/cchyun-rnn-lm-20240321-154420.pt" "지미카터"

# 3.2 Seq to Seq

In [None]:
%cd {work_dir}/src/nmt
!pwd

## train seq2seq

In [None]:
# run src/nmt/train_seq2seq.sh
!sh train_seq2seq.sh "cchyun-rnn-nmt"

## translate seq2seq

In [None]:
# run src/nmt/translate_seq2seq.sh
!sh translate_seq2seq.sh "../../checkpoints/cchyun-rnn-nmt-20240322-022551.pt"

## infer seq2seq

In [None]:
from seq2seq import Seq2SeqTranslator

In [None]:
# Inference in this notebook runs on the CPU.
device = torch.device("cpu")

model_fn = "../../checkpoints/cchyun-rnn-nmt-20240322-022551.pt"

# The checkpoint is a dict that bundles the weights with the training config
# and label maps. Other cells read attributes such as `train_config.tokenizer`,
# so the config is presumably a pickled Python object (confirm against the
# training script). PyTorch >= 2.6 defaults to weights_only=True, which refuses
# to unpickle such objects, so full unpickling is requested explicitly.
# SECURITY: full unpickling can execute arbitrary code; only load checkpoints
# that you produced yourself or otherwise trust.
data = torch.load(model_fn, map_location=device, weights_only=False)
train_config = data["config"]
label2idx = data["label2idx"]
idx2label = data["idx2label"]

In [None]:
# Load the tokenizer identified by the `tokenizer` field of the checkpoint's
# training config.
tokenizer = T5TokenizerFast.from_pretrained(train_config.tokenizer)
# Register "<s>" as the BOS token; `tokenizer.bos_token_id` is passed to
# `model.generate` in the interactive inference cell.
tokenizer.bos_token = "<s>"

In [None]:
# Rebuild the network from the hyper-parameters stored in the checkpoint's
# training config; vocabulary size and padding index come from the tokenizer.
model_kwargs = {
    "vocab_size": tokenizer.vocab_size,
    "hidden_dim": train_config.hidden_dim,
    "n_layers": train_config.n_layers,
    "dropout": train_config.dropout,
    "pad_idx": tokenizer.pad_token_id,
}
model = Seq2SeqTranslator(**model_kwargs)

# Restore the trained weights and switch to evaluation mode.
model.load_state_dict(data["model"])
model.eval()

# Last expression of the cell: moves the model to `device` and lets the
# notebook display the returned module.
model.to(device)

In [None]:
# Interactive translation loop: read one source sentence per iteration,
# translate it with the seq2seq model and print the result.
# The loop ends on an empty line, on end-of-input, or on a kernel interrupt.
while True:
    try:
        line = input("input> ")
    except (EOFError, KeyboardInterrupt):
        # No more input / user interrupted the prompt: leave the loop quietly.
        break
    if not line:
        break

    # Token ids as a NumPy array; index 0 selects the single input sentence.
    x = tokenizer(
        line,
        truncation=True,
        max_length=train_config.max_length,
        return_tensors="np",
    )["input_ids"]

    # Autograd is enabled globally in this notebook, so switch it off for
    # decoding: no gradients are needed at inference time.
    # The literal 50 is passed positionally; presumably the maximum decode
    # length -- confirm against Seq2SeqTranslator.generate.
    with torch.no_grad():
        output_ids = model.generate(
            list(x[0]), 50, tokenizer.bos_token_id, tokenizer.eos_token_id
        )
    result = tokenizer.decode(output_ids)

    print(f"- ko: {line}\n- en: {result}\n")

## train seq2seq attention

In [None]:
# run src/nmt/train_seq2seq_attention.sh
!sh train_seq2seq_attention.sh "cchyun-attn-nmt"

## translate seq2seq attention

In [None]:
# run src/nmt/translate_seq2seq_attention.sh
!sh translate_seq2seq_attention.sh "../../checkpoints/cchyun-attn-nmt-20240322-053434.pt"

## infer seq2seq attention

In [None]:
from seq2seq_attn import Seq2SeqAttention

In [None]:
# Inference in this notebook runs on the CPU.
device = torch.device("cpu")

model_fn = "../../checkpoints/cchyun-attn-nmt-20240322-053434.pt"

# The checkpoint is a dict that bundles the weights with the training config
# and label maps. Other cells read attributes such as `train_config.tokenizer`,
# so the config is presumably a pickled Python object (confirm against the
# training script). PyTorch >= 2.6 defaults to weights_only=True, which refuses
# to unpickle such objects, so full unpickling is requested explicitly.
# SECURITY: full unpickling can execute arbitrary code; only load checkpoints
# that you produced yourself or otherwise trust.
data = torch.load(model_fn, map_location=device, weights_only=False)
train_config = data["config"]
label2idx = data["label2idx"]
idx2label = data["idx2label"]

In [None]:
# Load the tokenizer identified by the `tokenizer` field of the checkpoint's
# training config.
tokenizer = T5TokenizerFast.from_pretrained(train_config.tokenizer)
# Register "<s>" as the BOS token; `tokenizer.bos_token_id` is passed to
# `model.generate` in the interactive inference cell.
tokenizer.bos_token = "<s>"

In [None]:
# Rebuild the attention network from the hyper-parameters stored in the
# checkpoint's training config; vocabulary size and padding index come from
# the tokenizer.
model_kwargs = {
    "vocab_size": tokenizer.vocab_size,
    "hidden_dim": train_config.hidden_dim,
    "n_layers": train_config.n_layers,
    "dropout": train_config.dropout,
    "pad_idx": tokenizer.pad_token_id,
}
model = Seq2SeqAttention(**model_kwargs)

# Restore the trained weights and switch to evaluation mode.
model.load_state_dict(data["model"])
model.eval()

# Last expression of the cell: moves the model to `device` and lets the
# notebook display the returned module.
model.to(device)

In [None]:
# Interactive translation loop: read one source sentence per iteration,
# translate it with the attention seq2seq model and print the result.
# The loop ends on an empty line, on end-of-input, or on a kernel interrupt.
while True:
    try:
        line = input("input> ")
    except (EOFError, KeyboardInterrupt):
        # No more input / user interrupted the prompt: leave the loop quietly.
        break
    if not line:
        break

    # Token ids as a NumPy array; index 0 selects the single input sentence.
    x = tokenizer(
        line,
        truncation=True,
        max_length=train_config.max_length,
        return_tensors="np",
    )["input_ids"]

    # Autograd is enabled globally in this notebook, so switch it off for
    # decoding: no gradients are needed at inference time.
    # The literal 50 is passed positionally; presumably the maximum decode
    # length -- confirm against Seq2SeqAttention.generate.
    with torch.no_grad():
        output_ids = model.generate(
            list(x[0]), 50, tokenizer.bos_token_id, tokenizer.eos_token_id
        )
    result = tokenizer.decode(output_ids)

    print(f"- ko: {line}\n- en: {result}\n")

# 3.3 Transformer

In [None]:
%cd {work_dir}/src/transformer
!pwd

## train t5 nmt

In [None]:
# run src/transformer/train_transformer.sh
!sh train_transformer.sh "cchyun-t5-nmt"

## translate t5 nmt

In [None]:
# run src/transformer/translate_transformer.sh
!sh translate_transformer.sh "../../checkpoints/cchyun-t5-nmt-20240322-114508/checkpoint-56330"

## infer t5 nmt

In [None]:
from transformers import (
    T5TokenizerFast,
    GenerationConfig,
    T5ForConditionalGeneration,
)

In [None]:
# Fine-tuned T5 checkpoint directory; tokenizer and weights are both read
# from it.
model_fn = "../../checkpoints/cchyun-t5-nmt-20240322-114508/checkpoint-56330"

# Inference in this notebook runs on the CPU.
device = torch.device("cpu")

tokenizer = T5TokenizerFast.from_pretrained(model_fn)

# Load the weights and place the model on the target device in one step.
model = T5ForConditionalGeneration.from_pretrained(model_fn).to(device)

# Switch to evaluation mode; as the last expression of the cell the returned
# module is also displayed by the notebook.
model.eval()

In [None]:
# Decoding settings handed to `model.generate` in the interactive cell.
generation_config = GenerationConfig(
    # Search strategy: deterministic beam search with 8 beams.
    do_sample=False,
    num_beams=8,
    early_stopping=True,
    # Output length and scoring adjustments.
    max_new_tokens=512,
    repetition_penalty=1.2,
    length_penalty=1.0,
    use_cache=True,
    # Special token ids, all taken from the loaded tokenizer.
    # NOTE(review): the decoder start token reuses the tokenizer's BOS id --
    # confirm the saved tokenizer actually defines a BOS token.
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    decoder_start_token_id=tokenizer.bos_token_id,
)

In [None]:
# Interactive translation loop: read one source sentence per iteration,
# translate it with the T5 model using `generation_config`, and print the
# result. The loop ends on an empty line, on end-of-input, or on a kernel
# interrupt.
while True:
    try:
        line = input("input> ")
    except (EOFError, KeyboardInterrupt):
        # No more input / user interrupted the prompt: leave the loop quietly.
        break
    if not line:
        break

    # Encode to a PyTorch tensor of token ids (truncated to 512 tokens) and
    # move it to the same device as the model.
    x = tokenizer(
        line,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )["input_ids"].to(device)

    beam_output = model.generate(
        input_ids=x,
        generation_config=generation_config,
    )
    # Decode the first returned sequence, dropping special tokens.
    result = tokenizer.decode(beam_output[0], skip_special_tokens=True)

    print(f"- ko: {line}\n- en: {result}\n")