In [None]:
! pip install sacrebleu
import collections, json, pickle, time
import numpy
import tensorflow as tf

from datasets import load_metric
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
# Seed every random number generator the notebook relies on so that a fresh
# "Restart & Run All" is repeatable.
# - NumPy's global generator drives the training-subset sampling in load_data.
# - TensorFlow's generator drives weight initialisation and dropout during
#   training; it is independent of NumPy's, so it needs its own seed.
numpy.random.seed(3)
tf.random.set_seed(3)

In [None]:
# Sanity check: print the local devices TensorFlow reports for this session.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
try: import wandb
except:
    ! pip install wandb
import wandb
wandb.login(key = "280aa3837eb27ece3c32ed8e27e3e233d0afdc9c")
wandb.init("Model Keras Project Deep learning")

# Config

In [None]:
# Central run configuration; every cell below reads its settings from here.
config = dict(
    # --- Run modes ---
    train_mode=False,          # gate for the training cell
    translate_mode=False,      # gate for the translate (+ BLEU) cell
    save_trans_pickle_file="/kaggle/working/translated.pickle",
    eval_mode=False,

    # --- Dataset settings ---
    train_file="/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/train/train.json",
    dev_file="/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/dev/dev.json",
    test_file="/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/test/test.json",
    small_train_data=20000,    # training-subset size used by load_data; 0 keeps the full set (original note: 20000)

    # --- Training settings ---
    epoch=2,                   # original note: 70
    batch_size=128,
    learning_rate=0.003,

    # --- Starting model: loaded to continue training or to run inference ---
    initial_model="/kaggle/input/model-weight-and-translated-sentences-project-dl/save_model.keras",

    # --- Destination file for the saved model ---
    save_model="/kaggle/working/save_model.keras",
)

In [None]:
# Echo the active configuration so the settings of this run appear in the cell output.
print(config)

# Các hàm phụ

In [None]:
def logits_to_text(logits, tokenizer):
    """Decode a 2-D array of per-step scores into a space-separated sentence.

    Args:
        logits: Array-like with one row per timestep and one column per
            vocabulary id; the highest-scoring column of each row is taken as
            that step's token id.
        tokenizer: Object exposing ``word_index`` (a word -> integer id
            mapping), e.g. a fitted Keras ``Tokenizer``.

    Returns:
        The decoded words joined by single spaces; id 0 is rendered as
        ``<PAD>``.

    Raises:
        KeyError: If a predicted id is neither 0 nor present in ``word_index``.
    """
    index_to_words = {index: word for word, index in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    # The file imports the package as ``numpy`` (no ``np`` alias exists), so
    # the module must be referenced by its full name here.
    predicted_ids = numpy.argmax(logits, 1)
    return ' '.join([index_to_words[prediction] for prediction in predicted_ids])

# 1. Dataset

In [None]:
def load_data(json_file, train):
    """Load parallel English/Vietnamese sentences from a JSON corpus file.

    The file must contain a top-level ``"data"`` list whose items look like
    ``{"translation": {"en": ..., "vi": ...}}``.

    Args:
        json_file: Path of the JSON file to read.
        train: When true and ``config["small_train_data"]`` is non-zero, a
            random subset of that many samples is drawn with NumPy's global
            generator instead of using the whole file.

    Returns:
        A pair ``(en_sentences, vi_sentences)`` of equally long lists holding
        the stripped, lower-cased sentences in matching order.
    """
    # Read as UTF-8 explicitly: the corpus contains Vietnamese text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)["data"]

    if train and config["small_train_data"] != 0:
        # NOTE(review): numpy.random.choice samples WITH replacement by
        # default, so the subset can contain duplicates. The call is left
        # unchanged on purpose: under the fixed seed this draw determines the
        # training sentences, and therefore the vocabularies fitted on them —
        # presumably the ones the saved initial model was trained with.
        # Confirm before switching to sampling without replacement.
        data = numpy.random.choice(a = data,
                                   size = config["small_train_data"])

    en_sentences = [sample["translation"]["en"].strip().lower() for sample in data]
    vi_sentences = [sample["translation"]["vi"].strip().lower() for sample in data]
    return en_sentences, vi_sentences

In [None]:
# Tokenization helper.
def tokenize(x):
    """Fit a new ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: The texts to fit on and to encode.

    Returns:
        A pair ``(sequences, tokenizer)``: the encoded form of ``x`` and the
        fitted tokenizer that produced it.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    encoded = fitted.texts_to_sequences(x)
    return encoded, fitted

# Padding helper.
def pad(x, length=None):
    """Pad the sequences in ``x`` at the end via ``pad_sequences``.

    Args:
        x: The sequences to pad.
        length: Forwarded as ``maxlen``; ``None`` leaves the target length to
            ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for ``padding='post'``.
    """
    padded = pad_sequences(x, padding='post', maxlen=length)
    return padded

# Preprocessing helper: combines the two helpers above — tokenize first, then pad.
def preprocess(x, y, max_x_length = None, max_y_length = None):
    """Tokenize and pad a parallel corpus.

    Args:
        x: Source-side texts.
        y: Target-side texts.
        max_x_length: Optional padded length for ``x`` (passed to ``pad``).
        max_y_length: Optional padded length for ``y`` (passed to ``pad``).

    Returns:
        ``(padded_x, padded_y, x_tokenizer, y_tokenizer)`` where ``padded_y``
        carries one extra trailing axis of size 1.
    """
    # Each side gets its own tokenizer.
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids, max_x_length)
    y_padded = pad(y_ids, max_y_length)

    # Append a trailing unit axis to the targets — presumably the label shape
    # expected by the sparse categorical cross-entropy loss; TODO confirm.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

# 2. Model 

In [None]:
def model_final(input_shape, output_sequence_length, english_vocab_size, vi_vocab_size):
    """Build and compile the bidirectional-GRU encoder/decoder model.

    Args:
        input_shape: Shape of the padded source array; only the entries from
            index 1 onward (the sequence length) are used.
        output_sequence_length: Number of timesteps the encoder output is
            repeated for before decoding.
        english_vocab_size: Input dimension of the embedding layer.
        vi_vocab_size: Width of the final softmax layer.

    Returns:
        The compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam with ``config["learning_rate"]``, accuracy metric).
    """
    source_length = input_shape[1]

    # Layers are listed in the order they are stacked.
    layers = [
        # Embedding
        Embedding(english_vocab_size, 128, input_length=source_length,
                  input_shape=input_shape[1:]),
        # Encoder: condense the source sequence into one vector ...
        Bidirectional(GRU(128)),
        # ... and repeat it once per output timestep.
        RepeatVector(output_sequence_length),
        # Decoder
        Bidirectional(GRU(128, return_sequences=True)),
        TimeDistributed(Dense(512, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(vi_vocab_size, activation='softmax')),
    ]

    model = Sequential(layers)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(config["learning_rate"]),
                  metrics=['accuracy'])
    return model

# 3. Hàm train và eval

In [None]:
def train(model, x, y):
    """Print the model summary, fit the model on ``(x, y)`` and return it.

    Batch size and epoch count come from ``config``; 20% of the data is held
    out for validation and metrics are reported through a W&B Keras callback.

    Args:
        model: The compiled model to train.
        x: Training inputs.
        y: Training targets.

    Returns:
        The same ``model`` object after fitting.
    """
    model.summary()

    fit_options = {
        "batch_size": config["batch_size"],
        "epochs": config["epoch"],
        "validation_split": 0.2,
        "callbacks": [wandb.keras.WandbCallback()],
    }
    model.fit(x, y, **fit_options)

    return model

In [None]:
def translate_the_set(en_sentences,
                      model,
                      en_tokenizer, vi_toknizer,
                      max_en_length,
                      batch_size=32
                      ):
    """Translate English sentences with ``model`` and pickle the results.

    Args:
        en_sentences: List of English sentences to translate.
        model: Model whose ``predict`` returns, per input row, one score
            vector over the Vietnamese vocabulary for every output timestep.
        en_tokenizer: Fitted source-side tokenizer; its
            ``texts_to_sequences`` is used so inference applies exactly the
            same normalisation as training (unknown words are dropped by it).
        vi_toknizer: Fitted target-side tokenizer providing ``word_index``.
            The misspelled parameter name is kept for backward compatibility.
        max_en_length: Length every encoded source sentence is padded to.
        batch_size: Number of sentences sent to ``model.predict`` per call.
            Larger values are faster but hold
            ``batch_size x timesteps x vocabulary`` scores in memory at once.

    Returns:
        The list of translated sentences (words joined by single spaces,
        padding positions omitted). The same list is also written to
        ``config["save_trans_pickle_file"]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the model predicts an id missing from the target
            vocabulary (i.e. model and tokenizer do not belong together).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Use the tokenizer that was passed in, not a notebook-level global.
    y_id_to_word = {index: word for word, index in vi_toknizer.word_index.items()}

    translated_sentences = list()
    for start in range(0, len(en_sentences), batch_size):
        batch = list(en_sentences[start:start + batch_size])

        sequences = en_tokenizer.texts_to_sequences(batch)
        padded = pad_sequences(sequences, maxlen = max_en_length, padding = "post")

        # One predict call per chunk instead of per sentence; verbose=0 keeps
        # the cell output from being flooded with progress bars.
        scores = model.predict(padded, batch_size = len(batch), verbose = 0)

        for token_ids in numpy.argmax(scores, axis = -1):
            # Id 0 is the padding slot: skip it rather than emit empty words.
            words = [y_id_to_word[token_id] for token_id in token_ids if token_id != 0]
            translated_sentences.append(' '.join(words))

    with open(config["save_trans_pickle_file"], "wb") as f:
        pickle.dump(translated_sentences, f)
    return translated_sentences

In [None]:
def eval_bleu(translated_sentences, vi_sentences):
    """Compute the SacreBLEU score of translations against their references.

    Args:
        translated_sentences: list[str] of sentences produced by the model.
        vi_sentences: list[str] of reference (label) sentences, in the same
            order. Pairs are formed with ``zip``, so surplus items on either
            side are ignored.

    Returns:
        The result object returned by the metric's ``compute()``; its
        ``"score"`` entry is printed together with the elapsed time.
    """
    started = time.time()
    scorer = load_metric("sacrebleu")

    pairs = zip(translated_sentences, vi_sentences)
    for hypothesis, target in pairs:
        # Each prediction is scored against a single-element reference list.
        scorer.add(prediction = hypothesis, reference = [target])

    result = scorer.compute()
    finished = time.time()
    elapsed = finished - started
    print(f"Đã tính bleu score xong!\nTime = {elapsed} ")
    print(f"Bleu score = {result['score']}")

    return result

# Run

In [None]:
### Create and prepare the data
# The training file is loaded with train=True (so load_data may subsample it
# according to config["small_train_data"]); the test file is loaded in full.
en_sentences_train, vi_sentences_train = load_data(config["train_file"], train = True)
en_sentences_test, vi_sentences_test = load_data(config["test_file"], train = False)

In [None]:
# Tokenize and pad the training sentences; the two tokenizers returned here are
# reused later for model sizing and for translating the test set.
preproc_en_sentences, preproc_vi_sentences, en_tokenizer, vi_tokenizer = preprocess(en_sentences_train, vi_sentences_train)
# Second axis of each padded array = padded sequence length of that side.
max_en_length, max_vi_length = preproc_en_sentences.shape[1], preproc_vi_sentences.shape[1]

In [None]:
### Create the model
# Build a fresh network sized from the preprocessed training data when no
# starting model is configured; otherwise load the configured model file.
if config["initial_model"] is None:
    model = model_final(preproc_en_sentences.shape, preproc_vi_sentences.shape[1],
                       len(en_tokenizer.word_index)+1, len(vi_tokenizer.word_index)+1)
else:
    model = tf.keras.models.load_model(config["initial_model"])

In [None]:
### Train
# Training only runs when config["train_mode"] is enabled; train() fits the
# model in place, so its return value is not needed here.
if config["train_mode"]:
    train(model, preproc_en_sentences, preproc_vi_sentences)

In [None]:
### Translate
# Translate the test set and score it with BLEU when config["translate_mode"]
# is enabled.
# NOTE(review): config["eval_mode"] is not consulted in the visible cells —
# BLEU is computed whenever translate_mode is on; confirm whether evaluation
# was meant to be gated separately.
if config["translate_mode"]:
    translated_sentences = translate_the_set(en_sentences_test,
                      model,
                      en_tokenizer,
                      vi_tokenizer,
                      max_en_length
                      )
    result = eval_bleu(translated_sentences, vi_sentences_test)

# SAVE

In [None]:
# Write the current model to config["save_model"]. This cell is unconditional,
# so it also runs when train_mode is off and the model was only loaded.
tf.keras.models.save_model(model, config["save_model"])
print("Đã save model!")