In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers

tf.__version__

'2.10.1'

In [None]:
# Enable mixed-precision training for every Keras layer created after this cell.
from tensorflow.keras import mixed_precision

# Install "mixed_float16" as the global dtype policy.
mixed_precision.set_global_policy("mixed_float16")

# Echo the active policy; it should read "mixed_float16" on a compatible GPU.
mixed_precision.global_policy()

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3090, compute capability 8.6


<Policy "mixed_float16">

## NMT model to translate English sentences into Vietnamese

In [None]:
## Getting Datasets
# !curl -O -J https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en
# !curl -O -J https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.vi

# !curl -O -J https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/vocab.en
# !curl -O -J https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/vocab.vi

In [None]:
# Maximum number of sentence pairs to load from train.en / train.vi.
# The reader cells below stop once this many lines have been collected.
max_sentences = 50000

### Getting English Sentences

In [None]:
# Load the English side of the corpus: skip the first 50 lines of the file,
# then keep the next `max_sentences` lines.
en_sentences = []

with open('./train.en', 'r', encoding='utf-8') as file:
    for line_number, raw_line in enumerate(file):
        if line_number >= 50:
            # Stop reading as soon as the requested window has been consumed.
            if line_number == max_sentences + 50:
                break
            # Remove leading and trailing whitespace (including the newline).
            en_sentences.append(raw_line.strip())

### Getting Vietnamese Sentences

In [None]:
# Load the Vietnamese side of the corpus with the same window as the English
# reader (skip 50 lines, keep the next `max_sentences`) so the pairs stay aligned.
vi_sentences = []

with open('./train.vi', 'r', encoding='utf-8') as file:
    for line_number, raw_line in enumerate(file):
        if line_number >= 50:
            # Stop reading as soon as the requested window has been consumed.
            if line_number == max_sentences + 50:
                break
            # Remove leading and trailing whitespace (including the newline).
            vi_sentences.append(raw_line.strip())

### Inserting start and end tags around each sentence

In [None]:
# Wrap every sentence in the start (<s>) and end (</s>) markers.
# Note: this rebinds the same names, so re-running the cell wraps the
# sentences a second time.
en_sentences = [f"<s> {sentence.strip()} </s>" for sentence in en_sentences]

vi_sentences = [f"<s> {sentence.strip()} </s>" for sentence in vi_sentences]

### Getting samples of sentences

In [None]:
# Show the first two aligned sentence pairs, each followed by blank lines.
for english_sample, vietnamese_sample in zip(en_sentences[:2], vi_sentences[:2]):
    print(english_sample, "->", vietnamese_sample, "\n\n", sep="\n")

<s> In each one of those assessments that we write , we always tag on a summary , and the summary is written for a non-scientific audience . </s>
->
<s> Trong mỗi bản đánh giá chúng tôi viết , chúng tôi luôn đính kèm một bản tóm lược , được viết cho những độc giả không chuyên về khoa học . </s>



<s> And we hand that summary to journalists and policy makers , in order to make headlines like these . </s>
->
<s> Chúng tôi đưa bản tóm lược cho các nhà báo và nhà chính sách để có được những dòng tít như thế này . </s>





### Splitting Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sentence pairs for validation + test.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same length statistics, vocabulary and metrics).
(train_en_sentences, valid_test_en_sentences,
 train_vi_sentences, valid_test_vi_sentences) = train_test_split(
    en_sentences,
    vi_sentences,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Split the held-out pairs in half: one half for validation, one for testing.
# random_state is fixed so the validation/test membership is reproducible.
(valid_en_sentences, test_en_sentences,
 valid_vi_sentences, test_vi_sentences) = train_test_split(
    valid_test_en_sentences,
    valid_test_vi_sentences,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Report how many sentence pairs ended up in each split.
for split_name, split_sentences in (
    ("Train", train_en_sentences),
    ("Valid", valid_en_sentences),
    ("Test", test_en_sentences),
):
    print(f"Shape of {split_name} set: {len(split_sentences)}")

Shape of Train set: 40000
Shape of Valid set: 5000
Shape of Test set: 5000


### Getting statistics on how long the sentences are

In [None]:
print("Lengths of English Sentences:")
# Token count per training sentence (split on single spaces, tags included).
en_token_counts = pd.Series([len(sentence.split(" ")) for sentence in train_en_sentences])
en_token_counts.describe(percentiles=[0.5, 0.75, 0.95])

Lengths of English Sentences:


count    40000.000000
mean        22.005050
std         14.303296
min          3.000000
50%         18.000000
75%         28.000000
95%         48.000000
max        630.000000
dtype: float64

In [None]:
# Fixed encoder sequence length; passed as output_sequence_length to the
# English TextVectorization layer (longer sentences are truncated, shorter padded).
# Presumably chosen as the 95th-percentile length shown above plus one - TODO confirm.
n_en_seq = 49

In [None]:
print("Lengths of Vietnamese Sentences:")
# Token count per training sentence (split on single spaces, tags included).
vi_token_counts = pd.Series([len(sentence.split(" ")) for sentence in train_vi_sentences])
vi_token_counts.describe(percentiles=[0.5, 0.75, 0.95])

Lengths of Vietnamese Sentences:


count    40000.000000
mean        26.611050
std         18.136954
min          3.000000
50%         22.000000
75%         33.000000
95%         60.000000
max        852.000000
dtype: float64

In [None]:
# Decoder sequence length including both tags; the Vietnamese TextVectorization
# layer uses n_vi_seq - 1 because decoder inputs and labels each drop one tag.
# Presumably chosen as the 95th-percentile length shown above plus one - TODO confirm.
n_vi_seq = 61

### Finding the number of unique vocabulary entries

In [None]:
print("English Vocab:")

# Read one vocabulary entry per line, dropping the first line of the file
# (the unk token) and stripping surrounding whitespace from the rest.
with open("./vocab.en", "r", encoding='utf-8') as file:
    en_vocab = [entry.strip() for position, entry in enumerate(file) if position > 0]

n_en_vocab = len(en_vocab)
print(f"Samples of English words: {en_vocab[:10]}")
print(f"Size of Engish vocab {n_en_vocab}")

English Vocab:
Samples of English words: ['<s>', '</s>', 'Rachel', ':', 'The', 'science', 'behind', 'a', 'climate', 'headline']
Size of Engish vocab 17190


In [None]:
print("Vietnamese Vocab:")

# Read one vocabulary entry per line, dropping the first line of the file
# (the unk token) and stripping surrounding whitespace from the rest.
with open("./vocab.vi", "r", encoding='utf-8') as file:
    vi_vocab = [entry.strip() for position, entry in enumerate(file) if position > 0]

n_vi_vocab = len(vi_vocab)
print(f"Samples of Vietnamese words: {vi_vocab[:10]}")
print(f"Size of Vietnamese vocab {n_vi_vocab}")

Vietnamese Vocab:
Samples of Vietnamese words: ['<s>', '</s>', 'Khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về']
Size of Vietnamese vocab 7708


### Adapting the TextVectorization layers

In [None]:
# English tokenizer: no text standardization, whitespace splitting, and a
# fixed output length of n_en_seq token ids per sentence.
encoder_text_vectorizer = layers.TextVectorization(
    max_tokens=n_en_vocab,
    standardize=None,
    split='whitespace',
    output_sequence_length=n_en_seq,
    name="encoder_text_vectorizer_layer",
)

# Build the layer's vocabulary from the training sentences
# (adapt is given a NumPy array of strings).
en_training_corpus = np.array(train_en_sentences)
encoder_text_vectorizer.adapt(en_training_corpus)

In [None]:
example = "<s> This is a cat </s>"

# From here on n_en_vocab is the size of the adapted layer's vocabulary
# rather than the size of vocab.en.
en_layer_vocabulary = encoder_text_vectorizer.get_vocabulary()
n_en_vocab = len(en_layer_vocabulary)

example_token_ids = encoder_text_vectorizer([example])
print(f"Tokenized Form of \"{example}\":\n\n{example_token_ids}\n")
print(f"Samples from layer: {en_layer_vocabulary[:20]}")

Tokenized Form of "<s> This is a cat </s>":

[[   3   90   14    9 3905    4    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]]

Samples from layer: ['', '[UNK]', ',', '<s>', '</s>', '.', 'the', 'to', 'of', 'a', 'and', 'that', 'I', 'in', 'is', 'you', 'it', '&apos;s', 'we', 'And']


In [None]:
# Vietnamese tokenizer: no text standardization, whitespace splitting, and a
# fixed output length of n_vi_seq - 1 token ids per sentence.
decoder_text_vectorizer = layers.TextVectorization(
    max_tokens=n_vi_vocab,
    standardize=None,
    split='whitespace',
    output_sequence_length=n_vi_seq - 1,
    name="decoder_text_vectorizer_layer",
)

# Build the layer's vocabulary from the training sentences
# (adapt is given a NumPy array of strings).
vi_training_corpus = np.array(train_vi_sentences)
decoder_text_vectorizer.adapt(vi_training_corpus)

In [None]:
example = "<s> Chúng tôi đưa bản tóm </s>"

# From here on n_vi_vocab is the size of the adapted layer's vocabulary
# rather than the size of vocab.vi.
vi_layer_vocabulary = decoder_text_vectorizer.get_vocabulary()
n_vi_vocab = len(vi_layer_vocabulary)

example_token_ids = decoder_text_vectorizer([example])
print(f"Tokenized Form of \"{example}\":\n\n{example_token_ids}\n")
print(f"Samples from layer: {vi_layer_vocabulary[:20]}")

Tokenized Form of "<s> Chúng tôi đưa bản tóm </s>":

[[   2   74    7  259  130 1421    3    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]

Samples from layer: ['', '[UNK]', '<s>', '</s>', ',', '.', 'là', 'tôi', 'một', 'có', 'và', 'những', 'chúng', 'của', 'ta', 'không', 'bạn', 'đó', 'người', 'trong']


### Creating a seq2seq model

In [None]:
## Encoder
# Raw English strings -> token ids -> embeddings -> three stacked GRUs.
encoder_input = layers.Input(shape=(1,), dtype=tf.string, name="encoder_input")

x = encoder_text_vectorizer(encoder_input)
# mask_zero=True makes the padding id (0) invisible to the recurrent layers.
x = layers.Embedding(input_dim=n_en_vocab, output_dim=512, mask_zero=True, name="encoder_embedding")(x)
x = layers.GRU(256, return_sequences=True, name="encoder_gru_1")(x)
x = layers.GRU(256, return_sequences=True, name="encoder_gru_2")(x)

# The final state of the last encoder GRU summarises the source sentence and
# seeds the decoder below.
encoder_gru_last_layer, encoder_gru_last_state = layers.GRU(
    256, return_sequences=True, return_state=True, name="encoder_gru_last"
)(x)

encoder_model = tf.keras.models.Model(inputs=encoder_input, outputs=encoder_gru_last_layer, name="encoder_model")

## Decoder
# Raw Vietnamese strings (teacher-forcing input) -> token ids -> embeddings
# -> three stacked GRUs -> per-position distribution over the vocabulary.
decoder_input = layers.Input(shape=(1,), dtype=tf.string, name="decoder_input")

x = decoder_text_vectorizer(decoder_input)
x = layers.Embedding(input_dim=n_vi_vocab, output_dim=512, mask_zero=True, name="decoder_embedding")(x)
# Only the first decoder GRU is initialised from the encoder state; the other
# two start from their default (zero) state.
x = layers.GRU(256, return_sequences=True, name="decoder_gru_1")(x, initial_state=encoder_gru_last_state)
x = layers.GRU(256, return_sequences=True, name="decoder_gru_2")(x)
x = layers.GRU(256, return_sequences=True, name="decoder_gru_last")(x)
x = layers.Dropout(0.5)(x)

# The global policy is mixed_float16, so without an override this softmax
# would be computed and returned in float16. The output layer is pinned to
# float32 for numerically stable probabilities and loss values.
# The name is set explicitly because the inference cell fetches this layer
# with get_layer("dense"); an auto-generated name would change on re-runs.
decoder_out = layers.Dense(n_vi_vocab, activation='softmax', dtype='float32', name="dense")(x)


seq2seq = tf.keras.models.Model(inputs=[encoder_model.inputs, decoder_input], outputs=decoder_out)
seq2seq.compile(optimizer='adam', loss=tf.keras.losses.sparse_categorical_crossentropy, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer structure and parameter counts of the training model.
seq2seq.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 encoder_text_vectorizer_layer   (None, 49)          0           ['encoder_input[0][0]']          
 (TextVectorization)                                                                              
                                                                                                  
 encoder_embedding (Embedding)  (None, 49, 512)      8801280     ['encoder_text_vectorizer_layer[0
                                                                 ][0]']                           
                                                                                              

### Preparing data for model

In [None]:
def prepare_data(X, y, tensor=False, batch_size=128):
    """Build teacher-forcing inputs and labels for the seq2seq model.

    Args:
        X: sequence of source (English) sentences, each already wrapped in
            the start/end tags.
        y: sequence of target (Vietnamese) sentences, aligned with ``X`` and
            wrapped in the same tags.
        tensor: when True, return a batched ``tf.data.Dataset`` instead of
            NumPy arrays.
        batch_size: batch size of the dataset; only used when ``tensor`` is True.

    Returns:
        If ``tensor`` is False: ``((encoder_input, decoder_input), decoder_labels)``
        where the inputs are NumPy string arrays and the labels are the token
        ids produced by ``decoder_text_vectorizer``.
        If ``tensor`` is True: ``(dataset, None)`` where ``dataset`` yields
        ``((encoder_input, decoder_input), decoder_labels)`` batches.
    """
    encoder_input = np.array(X)

    # Decoder input drops the last space-separated token of each target
    # sentence; the labels drop the first one, so position t of the input is
    # paired with token t + 1 as its label.
    decoder_input = np.array([" ".join(sentence.split(" ")[:-1]) for sentence in y])

    decoder_labels = [" ".join(sentence.split(" ")[1:]) for sentence in y]
    decoder_labels = decoder_text_vectorizer(decoder_labels).numpy()

    if tensor:
        dataset = tf.data.Dataset.from_tensor_slices(
            ((encoder_input, decoder_input), decoder_labels)
        )
        # cache() comes before prefetch(): prefetch must be the last step so
        # it keeps overlapping input delivery with training on every epoch.
        dataset = dataset.batch(batch_size=batch_size).cache().prefetch(tf.data.AUTOTUNE)

        return dataset, None

    return (encoder_input, decoder_input), decoder_labels

In [None]:
# Requested mini-batch size (sentence pairs per training step).
batch_size = 512

In [None]:
# Build the batched tf.data pipelines. batch_size is passed explicitly;
# otherwise prepare_data falls back to its default of 128 and the value
# configured above is never applied to the data.
# With tensor=True the second return value is always None (labels live inside
# the dataset), so *_labels are placeholders only.
train_features, train_labels = prepare_data(train_en_sentences, train_vi_sentences, tensor=True, batch_size=batch_size)
valid_features, valid_labels = prepare_data(valid_en_sentences, valid_vi_sentences, tensor=True, batch_size=batch_size)
test_features, test_labels = prepare_data(test_en_sentences, test_vi_sentences, tensor=True, batch_size=batch_size)

### Training the model

In [None]:
epochs = 100

# Stop once val_loss (the default monitor) has not improved for 5 epochs and
# roll the model back to its best weights.
early_callback = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

# The LR schedule needs a shorter patience than early stopping. With equal
# patience the learning rate is lowered on the same epoch training stops, so
# the reduced rate is never used.
learning_rate_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001, min_lr=0.000001)

# train_features / valid_features are already-batched tf.data.Datasets, so no
# batch_size / validation_batch_size is passed to fit here.
history = seq2seq.fit(train_features,
                      epochs=epochs,
                      validation_data=valid_features,
                      callbacks=[early_callback, learning_rate_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 34: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.


In [None]:
# Score the trained model on the held-out test pipeline; the result is
# [loss, accuracy], matching the loss and metric given to compile().
seq2seq.evaluate(test_features)



[1.6858359575271606, 0.30438748002052307]

In [None]:
# Persist the trained model under ./saved_models/seq2seq/ (relative to the
# notebook's working directory).
seq2seq.save("./saved_models/seq2seq/")



INFO:tensorflow:Assets written to: ./saved_models/seq2seq/assets


INFO:tensorflow:Assets written to: ./saved_models/seq2seq/assets


### Inference Model

In [None]:
## Encoder
# Re-wire the trained encoder layers so that the encoder also exposes the
# final GRU state, which the decoder needs as its starting point.
encoder_input = layers.Input(shape=(1,), dtype=tf.string, name="encoder_input")

x = seq2seq.get_layer("encoder_text_vectorizer_layer")(encoder_input)
x = seq2seq.get_layer("encoder_embedding")(x)
x = seq2seq.get_layer("encoder_gru_1")(x)
x = seq2seq.get_layer("encoder_gru_2")(x)

encoder_gru_last_layer, encoder_gru_last_state = seq2seq.get_layer("encoder_gru_last")(x)


encoder_model = tf.keras.models.Model(inputs=encoder_input, outputs=[encoder_gru_last_layer, encoder_gru_last_state], name="encoder_model")


## Decoder
# Stand-alone decoder: takes the decoder text plus an explicit initial state
# and returns the vocabulary distribution together with the output sequence
# of the last GRU.
decoder_input = layers.Input(shape=(1,), dtype=tf.string, name="decoder_input")
decoder_encoder_state = layers.Input(shape=(256,))

x = seq2seq.get_layer("decoder_text_vectorizer_layer")(decoder_input)
x = seq2seq.get_layer("decoder_embedding")(x)

# The GRU layers are rebuilt (trained weights are copied in further down)
# rather than reused from seq2seq.
gru_1 = layers.GRU(256, return_sequences=True, name="decoder_gru_1", dtype='float')
x = gru_1(x, initial_state=decoder_encoder_state)

# In the trained seq2seq graph only decoder_gru_1 is seeded with the encoder
# state; decoder_gru_2 starts from zeros. It must do the same here, otherwise
# the copied weights are evaluated under conditions they were never trained for.
gru_2 = layers.GRU(256, return_sequences=True, name="decoder_gru_2", dtype='float')
x = gru_2(x)

decoder_gru_last_layer = layers.GRU(256, return_sequences=True, name="decoder_gru_last")
gru_out = decoder_gru_last_layer(x)

decoder_out = seq2seq.get_layer("dense")(gru_out)


inference_model = tf.keras.models.Model(inputs=[decoder_input, decoder_encoder_state], outputs=[decoder_out, gru_out])
inference_model.compile()

# Copy the trained decoder weights into the rebuilt GRU layers.
# NOTE(review): only decoder_gru_1 accepts a carried state. decoder_gru_2 and
# decoder_gru_last restart from zeros on every call, so feeding this model one
# token at a time cannot reproduce the recurrent state of the full graph.
gru_1.set_weights(seq2seq.get_layer("decoder_gru_1").get_weights())
gru_2.set_weights(seq2seq.get_layer("decoder_gru_2").get_weights())
decoder_gru_last_layer.set_weights(seq2seq.get_layer("decoder_gru_last").get_weights())

In [None]:
# Print the layer-by-layer structure and parameter counts of the decoder-side inference model.
inference_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 decoder_text_vectorizer_layer   (None, 60)          0           ['decoder_input[0][0]']          
 (TextVectorization)                                                                              
                                                                                                  
 decoder_embedding (Embedding)  (None, 60, 512)      3946496     ['decoder_text_vectorizer_layer[1
                                                                 ][0]']                           
                                                                                            

In [None]:
# inference_model.save("/saved_models/inference_model/")

In [None]:
word_ids_to_word = {key:value for key, value in enumerate(decoder_text_vectorizer.get_vocabulary())}

In [None]:
def generate_translation(english_text):
    """Greedily translate one English sentence into Vietnamese.

    Decoding runs through the trained ``seq2seq`` graph itself: at every step
    the tokens produced so far are fed back as the decoder input and the
    prediction at the last filled position becomes the next token. Every
    recurrent layer therefore sees exactly the state it had during
    teacher-forced training. The previous step-by-step loop fed the output of
    the last decoder GRU back in as the initial state of the first one and
    dropped all other layer states.

    Args:
        english_text: a one-element list holding the source sentence wrapped
            in ``<s> ... </s>``. A bare string is accepted as well.

    Returns:
        The predicted sentence as a single space-joined string, starting with
        ``<s>`` and ending with ``</s>`` unless the length limit is reached first.
    """
    if isinstance(english_text, str):
        english_text = [english_text]
    source = tf.constant(english_text)

    # The decoder emits a fixed number of positions (the decoder vectorizer's
    # output length); no more tokens than that can be predicted.
    max_target_len = seq2seq.output_shape[1]

    predicted_sentence = ['<s>']

    for step in range(max_target_len):
        decoder_prefix = tf.constant([" ".join(predicted_sentence)])

        # Direct call instead of .predict(): much cheaper for a single sample
        # inside a loop. training=False keeps dropout disabled.
        probabilities = seq2seq((source, decoder_prefix), training=False)

        # Position `step` holds the distribution for the token following the
        # prefix. Id 0 is the padding token (empty string); it is excluded
        # because it would vanish when the prefix is re-tokenized and shift
        # every later position.
        step_probabilities = probabilities[0, step].numpy()
        next_id = int(np.argmax(step_probabilities[1:])) + 1
        next_word = word_ids_to_word[next_id]

        predicted_sentence.append(next_word)

        if next_word == '</s>':
            break

    return " ".join(predicted_sentence)

In [None]:
# Example source sentence, wrapped in the same <s> ... </s> tags used for the training data.
text = '<s> welcome everyone </s>'

# generate_translation is called with a one-element list of sentences.
generate_translation([text])

'<s> Xin thể Schwarzchild lại tôi </s>'