In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/phomt-dl-2023-1/PhoMT_json/detokenization/test/test.json
/kaggle/input/phomt-dl-2023-1/PhoMT_json/detokenization/train/train.json
/kaggle/input/phomt-dl-2023-1/PhoMT_json/detokenization/dev/dev.json
/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/test/test.json
/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/train/train.json
/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/dev/dev.json


In [2]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from keras.callbacks import EarlyStopping



In [3]:
try:
    import wandb
except:
    ! pip install wandb
import wandb

wandb.login(
    key = "cf3d72434022f1be5f7f85f08bff3743edefdf49"
)
wandb.init("Machine_translation_DL_2023.1")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlengocbinh09062003[0m ([33mhust_dsai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
config = {
    "train_data_file": "/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/train/train.json",
    "dev_data_file": "/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/dev/dev.json",
    "test_data_file": "/kaggle/input/phomt-dl-2023-1/PhoMT_json/tokenization/test/test.json",
    "small_train_data": 20000,
}

In [5]:
def load_data(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
    data_20k = {}
    for i in range(config["small_train_data"]):
        tmp = data["data"][i]["translation"]
        data_20k[tmp['en'].lower()] = tmp['vi'].lower()
    english_sentences = list(data_20k.keys())
    vietnamese_sentences = list(data_20k.values())
    return english_sentences, vietnamese_sentences

english_sentences, vietnamese_sentences = load_data(config["train_data_file"])

In [6]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    # TODO: Implement
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

In [7]:
def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # TODO: Implement
    return pad_sequences(x, maxlen=length, padding='post')

In [8]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_vietnamese_sentences, english_tokenizer, vietnamese_tokenizer = preprocess(english_sentences, vietnamese_sentences)

In [9]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [10]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, vietnamese_vocab_size):
    # Hyperparameters
    learning_rate = 0.005
    
    # TODO: Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(GRU(256, return_sequences=True))    
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(vietnamese_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [11]:
def train_model(preproc_english_sentences, preproc_vietnamese_sentences, english_tokenizer, vietnamese_tokenizer, epochs):
    max_english_sequence_length = preproc_english_sentences.shape[1]
    max_vietnamese_sequence_length = preproc_vietnamese_sentences.shape[1]
    english_vocab_size = len(english_tokenizer.word_index) + 1
    vietnamese_vocab_size = len(vietnamese_tokenizer.word_index) + 1

    # Build and compile the model
    tmp_x = pad(preproc_english_sentences, max_vietnamese_sequence_length)
    tmp_x = tmp_x.reshape((-1, preproc_vietnamese_sentences.shape[-2]))
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# TODO: Train the neural network
    model = embed_model(
        tmp_x.shape,
        preproc_vietnamese_sentences.shape[1],
        english_vocab_size,
        vietnamese_vocab_size)

    model.summary()

    model.fit(tmp_x, 
             preproc_vietnamese_sentences, 
             batch_size=512, 
             epochs=epochs, 
             validation_split=0.2,
             callbacks=[wandb.keras.WandbCallback(), early_stopping]
             )
    return model

In [12]:
model = train_model(preproc_english_sentences, preproc_vietnamese_sentences, english_tokenizer, vietnamese_tokenizer, epochs=50)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 169, 256)          4374016   
                                                                 
 gru (GRU)                   (None, 169, 256)          394752    
                                                                 
 time_distributed (TimeDist  (None, 169, 1024)         263168    
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 169, 1024)         0         
                                                                 
 time_distributed_1 (TimeDi  (None, 169, 6841)         7012025   
 stributed)                                                      
                                                                 
Total params: 12043961 (45.94 MB)
Trainable params: 1204



Epoch 1/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 2/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 3/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 4/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 5/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.7s


Epoch 6/50
Epoch 7/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 8/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.5s


Epoch 9/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.5s


Epoch 10/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.5s


Epoch 11/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.5s


Epoch 12/50
Epoch 13/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 14/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 15/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.5s


Epoch 16/50
Epoch 17/50

  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20231219_212515-b9wmchvl/files/model-best)... Done. 0.6s


Epoch 18/50
Epoch 19/50
Epoch 20/50


In [13]:
def load_test_data(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
        
    data_20k = {}
    max_length = 169
    
    for i in range(19152):
        tmp = data["data"][i]["translation"]
        data_20k[tmp['en'].lower()] = tmp['vi'].lower()
        filtered_data = {key: value for key, value in data_20k.items() if len(value) <= max_length or len(key) <= max_length}
    english_sentences = list(data_20k.keys())
    vietnamese_sentences = list(data_20k.values())
    
    return english_sentences, vietnamese_sentences

In [14]:
test_en_sen, test_vi_sen = load_test_data(config["test_data_file"])


In [15]:
print(test_vi_sen[0])

anh albert barnett và chị susan barnett , thuộc hội thánh west ở tuscaloosa , alabama


In [16]:
test_preproc_en, test_preproc_vi, test_en_token, test_vi_token = preprocess(test_en_sen, test_vi_sen)

In [17]:
def translate1(test_preproc_en, test_preproc_vi, model, test_vi_token):
    translated_vi_sen = list()
    tmp_x = pad(test_preproc_en, test_preproc_vi.shape[1])
    tmp_x = tmp_x.reshape(-1, test_preproc_vi.shape[-2])
    for i in range(19111):
        translated = logits_to_text(model.predict(tmp_x[i : i+1])[0], test_vi_token)
        translated = translated.replace("<PAD>", "")
        translated_vi_sen.append(translated)
    return translated_vi_sen

In [19]:
tmp_x1 = pad(preproc_english_sentences, preproc_vietnamese_sentences.shape[1])
tmp_x1 = tmp_x1.reshape((-1, preproc_vietnamese_sentences.shape[-2]))

print(tmp_x1[:1][0])
print(logits_to_text(model.predict(tmp_x1[:1])[0], vietnamese_tokenizer))


[   10  1479    23     5 10000     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [20]:
tmp_x = pad(test_preproc_en,  test_preproc_vi.shape[1])
tmp_x = tmp_x.reshape(-1,  test_preproc_vi.shape[-2])
#predictions = model.predict(tmp_x)
#print(predictions[0])
#print("-------------")
print(tmp_x[:1][0])
print(logits_to_text(model.predict(tmp_x[:1])[0], test_vi_token))
print(test_en_sen[0])
print(test_vi_sen[0])


[  517  4447  5206     3    60   312  1128  7898  5206    35     1   553
 11120     7 11121 11122     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [None]:
translated = translate1(test_preproc_en, test_preproc_vi, model, test_vi_token)



In [25]:
!pip install sacrebleu
from datasets import load_metric
import time

def eval_bleu(translated_sentences,
              vi_sentences
              ):
    t1 = time.time()
    bleu_metric = load_metric("sacrebleu")

    for translated_sen, reference in zip(translated_sentences, vi_sentences):
        bleu_metric.add(prediction = translated_sen, reference = [reference])
    
    result = bleu_metric.compute()
    t2 = time.time()
    print(f"Đã tính bleu score xong!\nTime = {t2 - t1} ")
    print(f"Bleu score = {result['score']}")

    return result

Collecting sacrebleu
  Obtaining dependency information for sacrebleu from https://files.pythonhosted.org/packages/de/ea/025db0a39337b63d4728a900d262c39c3029b0fe76a9876ce6297b1aa6a0/sacrebleu-2.4.0-py3-none-any.whl.metadata
  Downloading sacrebleu-2.4.0-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Obtaining dependency information for portalocker from https://files.pythonhosted.org/packages/17/9e/87671efcca80ba6203811540ed1f9c0462c1609d2281d7b7f53cef05da3d/portalocker-2.8.2-py3-none-any.whl.metadata
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portal

In [26]:
result = eval_bleu(translated, test_vi_sen)
print(result)

Đã tính bleu score xong!
Time = 7.255246877670288 
Bleu score = 0.014285364268786916
{'score': 0.014285364268786916, 'counts': [23478, 269, 2, 0], 'totals': [312973, 293866, 274777, 255744], 'precisions': [7.501605569809536, 0.09153832018675179, 0.0007278629579622748, 0.00019550800800800802], 'bp': 0.8079760468644368, 'sys_len': 312973, 'ref_len': 379706}


In [27]:
tf.keras.models.save_model(model, "/kaggle/working/save_model.pth")