# IMPORTS

In [None]:
import json
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import  AutoTokenizer, AutoModel
from tokenizers import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, MultiHeadAttention, LayerNormalization, Add, Dense, Dropout

import tensorflow as tf
import numpy as np
import random
nltk.download('punkt')

# DATA PROCESSING

## Tokenizer

In [None]:
# Locations of the pre-tokenized parallel corpus files. A later cell parses each
# line of these files as whitespace-separated integer token ids.
vi_sentences_path = "/kaggle/input/berttokenize/Bert/tokenize_vi.txt" # change at will
en_sentences_path = "/kaggle/input/berttokenize/Bert/tokenize_en.txt" # change at will
# Pretrained Hugging Face tokenizers used in this notebook for English and
# Vietnamese text respectively.
# NOTE(review): presumably these are the tokenizers that produced the id files
# above — confirm.
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_vi = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [None]:
# Read and print the `vocab_size` attribute of each tokenizer.
# NOTE(review): `vocab_size` may not count tokens added on top of the base
# vocabulary — compare with len(tokenizer) if every id must fit an embedding
# table.
vi_vocab_size = tokenizer_vi.vocab_size
en_vocab_size = tokenizer_en.vocab_size

print(f"Vietnamese Vocabulary Size: {vi_vocab_size}")
print(f"English Vocabulary Size: {en_vocab_size}")

In [None]:
# Sanity check: encode an English sentence to ids and decode it back.
input_text = "This is an English sentence"

# return_tensors="pt" requests a tensor result; it is indexed with [0] below,
# so the ids sit in the first row.
input_ids = tokenizer_en.encode(input_text, return_tensors="pt")

print(input_ids)

# Decode the first row, dropping special tokens from the output text.
decoded_text = tokenizer_en.decode(input_ids[0], skip_special_tokens=True)

print("Decoded Text:", decoded_text)

In [None]:
# Sanity check: encode a Vietnamese sentence to ids and decode it back.
input_text = "Với bài toán dịch Anh - Việt, việc kiểm tra cách mà tokenizer mã hóa câu tiếng Anh và tái mã hóa lại câu tiếng Việt là rất quan trọng. Dưới đây là hướng dẫn cụ thể"

# Tokenize the sentence (no return_tensors argument here; the result is passed
# to decode() as-is below)
input_ids = tokenizer_vi.encode(input_text)

print(input_ids)

# Decode the ids, dropping special tokens from the output text.
decoded_text = tokenizer_vi.decode(input_ids, skip_special_tokens=True)

print("Decoded Text:", decoded_text)

## Split data

In [None]:
def count_sentences(file_path):
    """Return the number of lines in the UTF-8 text file at ``file_path``.

    Every line counts as one sentence, including blank lines. The file is
    streamed line by line, so a large corpus is never held in memory just to
    be counted.

    Args:
        file_path: Path of the text file to count.

    Returns:
        The number of lines in the file (0 for an empty file).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterating the file object yields the same lines as readlines()
        # without building the full list.
        return sum(1 for _ in file)

# Count the lines (one sentence per line) in both tokenized corpus files so the
# two printed totals can be compared.

num_sentences_vi = count_sentences(vi_sentences_path)
num_sentences_en = count_sentences(en_sentences_path)

print(f"Number of sentences in tokenized vietnamese: {num_sentences_vi}")
print(f"Number of sentences in tokenized english: {num_sentences_en}")

In [None]:
# Retrieve the vocabulary mapping of both tokenizers
vi_vocab = tokenizer_vi.get_vocab()
en_vocab = tokenizer_en.get_vocab()


def _print_first_tokens(title, vocab, limit=20):
    """Print ``title`` followed by the first ``limit`` tokens of ``vocab``.

    Tokens are numbered from 1 and listed in the mapping's own iteration
    order.
    NOTE(review): that order is not necessarily token-id order — confirm
    before reading anything into which tokens come first.

    Args:
        title: Header line printed before the tokens.
        vocab: Mapping whose keys are the token strings.
        limit: Maximum number of tokens to print.
    """
    from itertools import islice  # local import keeps this cell self-contained

    print(title)
    # islice stops after `limit` keys instead of copying the whole vocabulary
    # into a list first.
    for position, token in enumerate(islice(vocab, limit), start=1):
        print(f"{position}. {token}")


# Print the first 20 tokens from the English vocabulary
_print_first_tokens("First 20 tokens in the English vocabulary:", en_vocab)

# Print the first 20 tokens from the Vietnamese vocabulary
_print_first_tokens("\nFirst 20 tokens in the Vietnamese vocabulary:", vi_vocab)


In [None]:
# Parse a file of pre-tokenized sentences into lists of token ids
def read_tokenized_sentences(file_path):
    """Load the UTF-8 file at ``file_path`` as one list of ints per line.

    Each line is stripped and split on whitespace, and every piece is
    converted with ``int``. A blank line produces an empty list, so the
    result has exactly one entry per line of the file.
    """
    sentences = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            token_ids = [int(piece) for piece in raw_line.strip().split()]
            sentences.append(token_ids)
    return sentences

# Load tokenized sentences: one list of integer token ids per line of each file.
# Later cells pair the two lists by index, so entry i of tokenized_en is treated
# as the translation partner of entry i of tokenized_vi.
tokenized_en = read_tokenized_sentences(en_sentences_path)
tokenized_vi = read_tokenized_sentences(vi_sentences_path)

In [None]:
# Take a random, index-aligned subset of the data (2% by default)
def sample_data(english_sentences, vietnamese_sentences, sample_ratio=0.02):
    """Return a random subset of sentence pairs, keeping the pairing intact.

    The same randomly chosen indices are applied to both lists, so element
    ``k`` of the first returned list still corresponds to element ``k`` of
    the second.

    Args:
        english_sentences: Sequence of English sentences.
        vietnamese_sentences: Sequence of Vietnamese sentences, aligned by
            index with ``english_sentences``.
        sample_ratio: Fraction of the pairs to keep; the sample size is
            ``int(sample_ratio * len(english_sentences))``.

    Returns:
        A tuple ``(sampled_en, sampled_vi)`` of two lists of equal length.

    Raises:
        ValueError: If the two inputs differ in length, since index-based
            pairing would then silently mis-align or fail part-way through.
    """
    dataset_size = len(english_sentences)
    if len(vietnamese_sentences) != dataset_size:
        raise ValueError(
            "english_sentences and vietnamese_sentences must have the same "
            f"length, got {dataset_size} and {len(vietnamese_sentences)}"
        )

    sample_size = int(sample_ratio * dataset_size)
    # Sampling without replacement guarantees no pair is picked twice.
    indices = np.random.choice(dataset_size, sample_size, replace=False)

    sampled_en = [english_sentences[i] for i in indices]
    sampled_vi = [vietnamese_sentences[i] for i in indices]

    return sampled_en, sampled_vi

# Keep a random 2% of the sentence pairs for the rest of the notebook.
sampled_en, sampled_vi = sample_data(tokenized_en, tokenized_vi,sample_ratio=0.02)

## Create Dataset

In [None]:
def create_tf_dataset(english_sentences, vietnamese_sentences, train_split=0.9):
    """Randomly split sentence pairs into training and validation datasets.

    The indices are shuffled once; the first ``int(train_split * n)`` of them
    form the training set and the rest form the validation set. Each set is
    returned as a ``tf.data.Dataset`` of ``(english, vietnamese)`` int32
    tensor pairs.

    NOTE(review): each subset is packed into a single ``tf.constant``, which
    presumably requires every sentence in a list to have the same length
    (already padded) — confirm against the corpus files.
    """
    total = len(english_sentences)
    order = np.arange(total)
    np.random.shuffle(order)
    cut = int(train_split * total)

    def build_subset(chosen):
        # Gather the selected rows of both languages in the same order so the
        # pairs stay aligned.
        en_rows = []
        vi_rows = []
        for idx in chosen:
            en_rows.append(english_sentences[idx])
            vi_rows.append(vietnamese_sentences[idx])
        pair = (
            tf.constant(en_rows, dtype=tf.int32),
            tf.constant(vi_rows, dtype=tf.int32),
        )
        return tf.data.Dataset.from_tensor_slices(pair)

    return build_subset(order[:cut]), build_subset(order[cut:])
    
# Split the sampled pairs: 90% for training, the remainder for validation.
train_data, val_data = create_tf_dataset(sampled_en, sampled_vi, train_split=0.9)

In [None]:
def prepare_data_dynamic_parallel(dataset, batch_size=None, shuffle_buffer=10000):
    """Shuffle, batch and reshape a paired dataset for teacher-forced training.

    Each batch ``(english, vietnamese)`` is mapped to
    ``((english, targ_in), targ_out)``, where ``targ_in`` is the Vietnamese
    batch without its last position and ``targ_out`` is the same batch without
    its first position. All three tensors are cast to int64.

    Args:
        dataset: ``tf.data.Dataset`` yielding ``(english, vietnamese)`` pairs.
        batch_size: Number of pairs per batch. When ``None`` (the default),
            the notebook-level ``BATCH_SIZE`` is looked up at call time, which
            is what the original implementation always did.
        shuffle_buffer: Size of the shuffle buffer.

    Returns:
        The transformed ``tf.data.Dataset``. The final batch may be smaller
        than ``batch_size`` because ``drop_remainder=False``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def map_func(english, vietnamese):
        # Split Vietnamese target into input (targ_in) and output (targ_out):
        # the output is the input shifted left by one position.
        targ_in = vietnamese[:, :-1]
        targ_out = vietnamese[:, 1:]
        return (tf.cast(english, tf.int64), tf.cast(targ_in, tf.int64)), tf.cast(targ_out, tf.int64)

    return (
        dataset.shuffle(shuffle_buffer)
        .batch(batch_size, drop_remainder=False)
        .map(map_func, num_parallel_calls=tf.data.AUTOTUNE)  # Parallel mapping
    )

In [None]:
# Prepare the datasets
# BATCH_SIZE is read as a global inside prepare_data_dynamic_parallel, so it
# must be defined before the two calls below.
BATCH_SIZE = 32
train_dataset = prepare_data_dynamic_parallel(train_data)
val_dataset = prepare_data_dynamic_parallel(val_data)

In [None]:
# Pull a single batch from the training dataset and print the shapes of its
# three tensors: encoder input, decoder input and decoder target.
for (en_batch, targ_in), targ_out in train_dataset.take(1):
    print("English Batch Shape:", en_batch.shape)
    print("Vietnamese Input Batch Shape:", targ_in.shape)
    print("Vietnamese Output Batch Shape:", targ_out.shape)

In [None]:
# Check the dtype: as the last expression of a notebook cell, the dataset
# object is displayed; its repr presumably lists element shapes and dtypes —
# TODO confirm.
train_dataset

# MODEL

In [None]:
# import wandb

In [None]:
# wandb.login()

In [None]:
# wandb.init(project="translation-model", name="en-to-vi-translation")

In [None]:
# Import
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, MultiHeadAttention, LayerNormalization, Add, Dense, Dropout


In [None]:
# Model parameters
# VOCAB_SIZE sizes both embedding tables and the output layer of the model
# built below.
# NOTE(review): it is hard-coded rather than derived from the tokenizers —
# confirm it exceeds the largest token id either tokenizer can emit.
VOCAB_SIZE = 64000  
# UNITS is used as the embedding width, the LSTM width and the attention key_dim.
UNITS = 256
# NOTE(review): MAX_LENGTH is not referenced by the other cells visible here
# (greedy_decode has its own max_length default) — confirm whether it is needed.
MAX_LENGTH = 50

In [None]:
# ENCODER LAYER
# Source token ids -> embedding -> dropout -> bidirectional LSTM (full output
# sequence) -> Dense projection to UNITS features per position.

encoder_input = tf.keras.Input(shape=(None,), dtype=tf.int64)
# mask_zero=False: the embedding does not produce a padding mask.
embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=UNITS, mask_zero=False)(encoder_input)
encoder_embedding_dropout = Dropout(0.2)(embedding) 
rnn_output = Bidirectional(LSTM(units=UNITS, return_sequences=True))(encoder_embedding_dropout)
encoder_output = Dense(UNITS)(rnn_output)  

# CROSS-ATTENTION LAYER
# Target token ids -> embedding -> dropout -> LSTM, whose output sequence is
# the attention query over the encoder output.

decoder_input = tf.keras.Input(shape=(None,), dtype=tf.int64)
decoder_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=UNITS, mask_zero=False)(decoder_input)
decoder_embedding_dropout = Dropout(0.2)(decoder_embedding)
# return_state=True makes the layer return several values; only element [0]
# is used below and the remaining returned values are discarded.
pre_attention_rnn = LSTM(units=UNITS, return_sequences=True, return_state=True)(decoder_embedding_dropout)
attn_output = MultiHeadAttention(key_dim=UNITS, num_heads=4)(query=pre_attention_rnn[0], value=encoder_output)
# Residual connection around the attention block, followed by layer normalization.
attn_output = Add()([pre_attention_rnn[0], attn_output])
attn_output = LayerNormalization()(attn_output)

# DECODER LAYER
post_attention_rnn = LSTM(units=UNITS, return_sequences=True)(attn_output)
# Despite the variable name, this layer applies softmax, so `logits` holds
# probabilities over VOCAB_SIZE tokens for every target position.
logits = Dense(VOCAB_SIZE, activation='softmax')(post_attention_rnn)

# FINAL MODEL
# Inputs are [source ids, shifted target ids]; the output is the per-position
# token distribution.
model = tf.keras.Model(inputs=[encoder_input, decoder_input], outputs=logits)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def compile_and_train(model, epochs=40, steps_per_epoch=3200,
                      validation_steps=350, patience=4,
                      train_ds=None, val_ds=None):
    """Compile ``model`` and train it with early stopping on validation loss.

    The model is compiled with the Adam optimizer, sparse categorical
    cross-entropy loss and an accuracy metric. The training dataset is
    repeated indefinitely, so the length of an epoch is set by
    ``steps_per_epoch`` alone.

    Args:
        model: Keras model to compile and fit.
        epochs: Maximum number of epochs.
        steps_per_epoch: Training batches drawn per epoch.
        validation_steps: Validation batches evaluated after each epoch.
        patience: Epochs without ``val_loss`` improvement before stopping;
            the best weights are restored when training stops early.
        train_ds: Batched training dataset. Defaults to the notebook-level
            ``train_dataset``, looked up at call time.
        val_ds: Batched validation dataset. Defaults to the notebook-level
            ``val_dataset``, looked up at call time.

    Returns:
        A tuple ``(model, history)`` with the trained model and the object
        returned by ``model.fit``.
    """
    if train_ds is None:
        train_ds = train_dataset
    if val_ds is None:
        val_ds = val_dataset

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True
    )

    model.compile(optimizer="adam", loss='sparse_categorical_crossentropy', metrics=["accuracy"])

    # NOTE(review): the validation dataset is not repeated; confirm it yields at
    # least `validation_steps` batches, otherwise lower the value or pass None.
    history = model.fit(
        train_ds.repeat(),
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_ds,
        validation_steps=validation_steps,
        callbacks=[early_stopping]
    )

    return model, history

# TRAINING

In [None]:
# Training: compile and fit the model with the default settings of
# compile_and_train; keeps both the trained model and the fit history.
trained_translator, history = compile_and_train(model)

In [None]:
# Save the trained model to the Kaggle working directory.
trained_translator.save('/kaggle/working/best_model.keras')

In [None]:
# Reload the saved model from disk; this rebinds the notebook-level `model` name.
model = load_model('/kaggle/working/best_model.keras')

# INFERENCE

In [None]:
def greedy_decode(input_sequence, model, tokenizer_target, max_length=50):
    """Translate one tokenized source sentence by greedy (arg-max) decoding.

    Decoding starts from the target tokenizer's CLS token. At every step the
    model is run on the source ids and the target ids produced so far, and
    the highest-scoring token at the last position is appended. Decoding
    stops when the target tokenizer's SEP token is produced or after
    ``max_length`` steps.

    Args:
        input_sequence: List of source-language token ids for one sentence.
        model: Object with a Keras-style ``predict([encoder_ids, decoder_ids],
            verbose=0)`` method returning scores indexed as
            ``[batch, position, token]``.
        tokenizer_target: Target-language tokenizer; supplies
            ``cls_token_id``, ``sep_token_id`` and ``decode``.
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded target sentence, with special tokens skipped.
    """
    # Take the special-token ids from the tokenizer that was passed in, so the
    # function does not silently depend on a notebook-level tokenizer.
    start_token = tokenizer_target.cls_token_id
    end_token = tokenizer_target.sep_token_id

    # Batch of one source sentence.
    encoder_ids = np.asarray([input_sequence], dtype=np.int64)
    target_sequence = [start_token]

    for _ in range(max_length):
        decoder_ids = np.asarray([target_sequence], dtype=np.int64)

        predictions = model.predict([encoder_ids, decoder_ids], verbose=0)

        # Only the last position matters: it scores the next token. int()
        # keeps the id list made of plain Python ints for the tokenizer.
        next_token = int(np.argmax(predictions[0, -1, :]))

        target_sequence.append(next_token)

        if next_token == end_token:
            break

    # Drop the leading start token before decoding.
    return tokenizer_target.decode(target_sequence[1:], skip_special_tokens=True)

In [None]:
# Translate one example sentence end to end.
en_sentence = "I go to school"

# Encode the English sentence to token ids, including the tokenizer's special tokens.
input_tokens = tokenizer_en.encode(en_sentence, add_special_tokens=True)

# Greedy decoding with the reloaded model; the Vietnamese tokenizer turns the
# generated ids back into text.
translated_sentence = greedy_decode(input_tokens, model, tokenizer_vi)

print("Input Sentence:", en_sentence)
print("Translated Sentence:", translated_sentence)