# Import thư viện, tiền xử lí dữ liệu

In [6]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import io
import string
import re
from sklearn.model_selection import train_test_split

print(tf.__version__)

# dùng eager mode trong tensor    
tf.enable_eager_execution()

def preprocess(sentence):
    sent = sentence.lower()
    sent = sent.strip()
    sent = re.sub("'"," ",sent)
    sent = re.sub("\s+"," ",sent)
    sent = ''.join([char for char in sent if char not in exclude])
    sent = "<start> " + sent + " <end>"
    return sent

# load data
en_filename = "./dataset/train.en.txt"
vi_filename = "./dataset/train.vi.txt"

raw_en_lines = open(en_filename, encoding='utf-8').read().strip().split("\n")
raw_vi_lines = open(vi_filename, encoding='utf-8').read().strip().split("\n")
exclude = list(string.punctuation) + list(string.digits)

en_lines = []
vi_lines = []
min_len, max_len = 10, 14

# với mục đích demo, mình sẽ không xử lí những câu quá dài,
# loại bỏ những câu có độ dài không nằm trong khoảng (min_len, max_len)
for eline, vline in zip(raw_en_lines, raw_vi_lines):
    eline = preprocess(eline)
    vline = preprocess(vline)
    if min_len < len(eline.split(" ")) < max_len and min_len < len(vline.split(" ")) < max_len:
        en_lines.append(eline)
        vi_lines.append(vline)

1.15.0-dev20190803


In [7]:
print(en_lines)

['<start> rachel pike  the science behind a climate headline <end>', '<start> they wrote almost a thousand pages on the topic  <end>', '<start> christopher decharms  a look inside the brain in real time <end>', '<start> i aposm going to tell you about that technology  <end>', '<start> but i want to leave with you the big question  <end>', '<start> and they themselves had not known they cared  <end>', '<start> i was past  when my father died  <end>', '<start> but within the reels lie purpose and meaning  <end>', '<start> let me take you on a little tour  <end>', '<start> i aposm not going to minimize those concerns  <end>', '<start> geert chatrou  a whistleblower you haven apost heard <end>', '<start> so  okay  i aposm going to america  <end>', '<start> well actually  you are going to whistle along  <end>', '<start> geert chatrou  thank you  thank you  <end>', '<start> so he aposs mario  he aposs our son  <end>', '<start> but in the end everything seemed to be under control  <end>', '<s

In [9]:
print(vi_lines)

['<start> khoa học đằng sau một tiêu đề về khí hậu <end>', '<start> họ viết gần  trang về chủ đề này  <end>', '<start> christopher decharms quét não bộ theo thời gian thực <end>', '<start> tôi sẽ cho bạn biết về công nghệ đó  <end>', '<start> nhưng tôi có một câu hỏi lớn dành cho bạn  <end>', '<start> và chúng không tự biết rằng chúng quan tâm  <end>', '<start> tôi đã hơn  khi bố tôi mất  <end>', '<start> nhưng trong những cuộn phim là mục đích và ý nghĩa <end>', '<start> hãy để tôi đưa các bạn thăm quan một chút  <end>', '<start> tôi sẽ không coi thường những quan ngại này  <end>', '<start> một người huýt gió bạn chưa từng biết đến <end>', '<start> đựoc thôi  tôi chuẩn bị được đi mỹ  <end>', '<start> thật ra  bạn sẽ huýt gió cùng tôi <end>', '<start> geert chatrou  cám ơn  cám ơn  <end>', '<start> đây là mario  con trai chúng tôi  <end>', '<start> nhưng cuối cùng thì mọi việc dường như đã ổn  <end>', '<start> nên bé sinh ra được đủ cân nặng  <end>', '<start> đó thực sự là quãng thời g

# Xử lí ngôn ngữ với class Language

In [10]:
class Language():
    def __init__(self, lines):
        self.lines = lines
        self.word2id = {}
        self.id2word = {}
        self.vocab = set()
        self.max_len = 0
        self.min_len = 0
        self.vocab_size = 0
        self.init_language_params()
        
    def init_language_params(self):
        for line in self.lines:
            self.vocab.update(line.split(" "))
        self.word2id['<pad>'] = 0
        for id, word in enumerate(self.vocab):
            self.word2id[word] = id + 1
        for word, id in self.word2id.items():
            self.id2word[id] = word
        self.max_len = max([len(line.split(" ")) for line in self.lines])
        self.min_len = min([len(line.split(" ")) for line in self.lines])
        self.vocab_size = len(self.vocab) + 1
    
    def sentence_to_vector(self, sent):
        return np.array([self.word2id[word] for word in sent.split(" ")])
    
    def vector_to_sentence(self, vector):
        return " ".join([self.id2word[id] for id in vector])
        

# Tạo dữ liệu cho train, validate:

In [11]:
class Encode(tf.keras.Model):
    def __init__(self, embedding_size, vocab_size, hidden_units):
        super(Encode, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.hidden_units = hidden_units
    
    def call(self, x, hidden_state):
        x = self.Embedding(x)
        outputs, last_state = self.GRU(x, hidden_state)
        return outputs, last_state
    
    def init_hidden_state(self, batch_size):
        return tf.zeros([batch_size, self.hidden_units])

# Attention

In [12]:
class Attention(tf.keras.Model):
    def __init__(self, hidden_units):
        super(Attention, self).__init__()
        self.W_out_encode = tf.keras.layers.Dense(hidden_unit)
        self.W_state = tf.keras.layers.Dense(hidden_unit)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, encode_outs, pre_state):
        pre_state = tf.expand_dims(pre_state, axis=1)
        pre_state = self.W_state(pre_state)
        encode_outs = self.W_out_encode(encode_outs)
        score = self.V(
            tf.nn.tanh(
                pre_state + encode_outs_outs
            )
        )
        
        score = tf.nn.softmax(score, axis=1)
        context_vector = score*encode_outs_outs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, score
    

# Decode

In [18]:
class Decode(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_units):
        super(Decode, self).__init__()
        self.hidden_units = hidden_units
        self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
        self.Attention = Attention(hidden_units)
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.Fc = tf.keras.layers.Dense(vocab_size)
            
    def call(self, x, encode_outs, pre_state):
        x = tf.expand_dims(x, axis=1)
        x = self.Embedding(x)
        context_vector, attention_weight = self.Attention(encode_outs, pre_state)
        context_vector = tf.expand_dims(context_vector, axis=1)
        gru_inp = tf.concat([x, context_vector], axis=-1)
        out_gru, state = self.GRU(gru_inp)
        out_gru = tf.reshape(out_gru, (-1, out_gru.shape[2]))
        return self.Fc(out_gru), state

# Loss function

In [19]:
def loss_function(real, pred):
    mask = 1-np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)*mask
    return tf.reduce_mean(loss_)


# Tiến hành train model

In [20]:
EPOCHS = 20
optimizer = tf.train.AdamOptimizer()
encoder = Encode(embedding_size, vocab_size=inp_lang.vocab_size, hidden_units=hidden_unit)
decoder = Decode(vocab_size=tar_lang.vocab_size, embedding_size=embedding_size, hidden_units=hidden_unit)
    
for epoch in range(EPOCHS):
    total_loss = 0
    for batch_id, (x, y) in enumerate(dataset.take(N_BATCH)):
        loss = 0
        with tf.GradientTape() as tape:
            first_state = encoder.init_hidden_state(batch_size=BATCH_SIZE)
            encode_outs, last_state = encoder(x, first_state)
            decode_state = last_state
            decode_input = [tar_lang.word2id["<start>"]]*BATCH_SIZE
            
            for i in range(1, y.shape[1]):
                decode_out, decode_state = decoder(
                        decode_input, encode_outs, decode_state
                )
                loss += loss_function(y[:, i], decode_out)
                decode_input = y[:, i]
                
            train_vars = encoder.trainable_variables
                        + decoder.trainable_variables
            grads = tape.gradient(loss, train_vars)
            optimizer.apply_gradients(zip(grads, train_vars))
        total_loss += loss
    print(total_loss.numpy())

IndentationError: unexpected indent (<ipython-input-20-6a954caf0812>, line 24)

# Test model

In [21]:
def translate(inputs):
    result = ''
    hidden = encoder.init_hidden_state(batch_size=1)
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden  
    dec_input = [tar_lang.word2id['<start>']]
    
    for t in range(tar_lang.max_len):
        predictions, dec_hidden = decoder(dec_input, enc_out, dec_hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += tar_lang.id2word[predicted_id] + ' '
        dec_input = [predicted_id]
    return result
  
for inp, tar in dataset.take(N_BATCH):
    print(translate(inp[1:2,:]))
    break

W0810 14:57:33.858738  2200 module_wrapper.py:136] From C:\Users\nguyen.pv162992\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow_core\python\util\module_wrapper.py:163: The name tf.estimator.inputs is deprecated. Please use tf.compat.v1.estimator.inputs instead.



NameError: name 'dataset' is not defined