In [8]:
import tensorflow as tf
import numpy as np
import warnings
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
warnings.filterwarnings('ignore')

# 英法翻译的实现

In [11]:
#读取数据
with open('./data/small_vocab_en', 'r') as f:
    english_data = f.read()
f.close()
with open('./data/small_vocab_fr', 'r') as f:
    french_data = f.read()
f.close()

In [16]:
#简单分析下
english_sentences = english_data.split('\n')
french_sentences = french_data.split('\n')
print('the num of english sentences is {}'.format(len(english_sentences)))
print('the num of french sentences is {}'.format(len(french_sentences)))
english_word_max_number = max([len(sentence.split()) for sentence in english_sentences])
french_word_max_number = max([len(sentence.split()) for sentence in french_sentences])
print('the max word number of english sentences is {}'.format(english_word_max_number))
print('the max word number of french sentences is {}'.format(french_word_max_number))

the num of english sentences is 137861
the num of french sentences is 137861
the max word number of english sentences is 17
the max word number of french sentences is 23


# 特殊字符
    <PAD>：由于翻译问题的特殊性，我们的句子长度往往是不一致的，而在RNN处理batch数据时，我们需要保证batch中的句子长度一致，此时需要通过<PAD>对长度不足的句子进行补全；
    <UNK>：Unknown字符，用来处理模型未见过的生僻单词；
    <GO>：翻译句子时，用来告诉句子开始进行翻译。仅在target中使用；
    <EOS>：翻译句子时，用来告诉句子结束翻译。仅在target中使用

In [18]:
#构建语料库，将English_data 和 french_data 转换成数字格式
english_vocab = list(set(english_data.lower().split()))
french_vocab = list(set(french_data.lower().split()))
print('the size of english vocab is {}'.format(len(english_vocab)))
print('the size of french vocab is {}'.format(len(french_vocab)))
source_special_words = ['<PAD>', '<UNK>']
target_special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
#英语是source， 法语为target
source_int_to_word = {idx:word for idx, word in enumerate(english_vocab + source_special_words)}
source_word_to_int = {word:idx for idx, word in source_int_to_word.items()}

target_int_to_word = {idx:word for idx, word in enumerate(french_vocab + target_special_words)}
target_word_to_int = {word:idx for idx, word in target_int_to_word.items()}


the size of english vocab is 227
the size of french vocab is 354


In [24]:
#接下来进行数据格式的转换，将其转换成数字格式
def text_to_int(sentence, map_dict, max_length=20, is_target=True):
    '''
    sentence:输入的是一个句子，
    map_dict:是存储word 和 int 转换的字典
    max_length：应为LSTM需要处理定长数据，给定一个最大长度，未到达这个长度的就进行补<PAD>操作
    is_target:用来确定是source还是target
    '''
    sentence_with_int = []
    if not is_target:
        for word in sentence.lower().split():
            sentence_with_int.append(map_dict.get(word, map_dict['<UNK>']))
    else:
        for word in sentence.lower().split():
            sentence_with_int.append(map_dict.get(word, map_dict['<UNK>']))
        sentence_with_int.append(map_dict['<EOS>'])
    if len(sentence_with_int) > max_length:
        return sentence_with_int[:max_length]
    else:
        return sentence_with_int + [map_dict['<PAD>']] * (max_length - len(sentence_with_int))
        

In [25]:
source_text_to_int = []
for sentence in english_data.split('\n'):
    source_text_to_int.append(text_to_int(sentence, source_word_to_int, max_length=20, is_target=False))
target_text_to_int = []
for sentence in french_data.split('\n'):
    target_text_to_int.append(text_to_int(sentence, target_word_to_int, max_length=25, is_target=True))

In [26]:
len(source_text_to_int)
len(target_text_to_int)
source_text_to_int[:10]
target_text_to_int[:10]

137861

137861

[[161,
  189,
  218,
  59,
  83,
  220,
  20,
  166,
  156,
  73,
  218,
  202,
  163,
  210,
  160,
  227,
  227,
  227,
  227,
  227],
 [13,
  126,
  11,
  218,
  207,
  19,
  220,
  103,
  166,
  156,
  73,
  218,
  207,
  53,
  163,
  92,
  160,
  227,
  227,
  227],
 [148,
  218,
  207,
  83,
  220,
  221,
  166,
  156,
  73,
  218,
  207,
  62,
  163,
  58,
  160,
  227,
  227,
  227,
  227,
  227],
 [13,
  126,
  11,
  218,
  59,
  206,
  220,
  58,
  166,
  156,
  73,
  218,
  175,
  163,
  3,
  160,
  227,
  227,
  227,
  227],
 [135,
  49,
  51,
  35,
  218,
  13,
  90,
  166,
  29,
  180,
  49,
  51,
  218,
  13,
  208,
  160,
  227,
  227,
  227,
  227],
 [52,
  21,
  35,
  218,
  13,
  25,
  166,
  29,
  180,
  21,
  218,
  13,
  90,
  160,
  227,
  227,
  227,
  227,
  227,
  227],
 [142,
  218,
  119,
  220,
  32,
  166,
  29,
  73,
  218,
  207,
  19,
  163,
  103,
  160,
  227,
  227,
  227,
  227,
  227,
  227],
 [161,
  189,
  218,
  183,
  220,
  14,
  166,
  156,
 

[[327,
  248,
  206,
  292,
  238,
  16,
  265,
  37,
  233,
  299,
  115,
  206,
  22,
  167,
  27,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354],
 [52,
  11,
  206,
  337,
  105,
  167,
  264,
  233,
  299,
  115,
  249,
  91,
  167,
  298,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354],
 [219,
  206,
  337,
  238,
  167,
  134,
  233,
  299,
  115,
  206,
  337,
  31,
  167,
  273,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354],
 [52,
  11,
  206,
  292,
  118,
  167,
  273,
  233,
  299,
  115,
  311,
  105,
  167,
  208,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354],
 [287,
  284,
  140,
  99,
  206,
  328,
  150,
  233,
  149,
  236,
  284,
  140,
  206,
  253,
  221,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,
  354,
  354],
 [26,
  99,
  258,
  206,
  204,
  233,
  149,
  236,
  258,
  206,
  328,
  150,
  57,
  357,
  354,
  354,
  354,
  354,
  354,
  354,

In [27]:
#构建模型输入
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.placeholder(tf.int32, (None,), name='max_target_sequence_length')
    return inputs, targets, learning_rate, source_sequence_length, target_sequence_length, max_target_sequence_length

In [28]:
#构造encoder
def encoder(input_data, rnn_size, layer_nums, 
           source_sequence_length, source_vocab_size, encoder_embedding_dim=100):
    encoder_embeded = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoder_embedding_dim)
    
    def get_lstm_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2018))
    multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(layer_nums)])
    
    encoder_output, encoder_state = tf.nn.dynamic_rnn(multi_cell, encoder_embeded, 
                                                      source_sequence_length, dtype=tf.float32)
    return encoder_output, encoder_state

In [39]:
def decoder_layer_input(target_input, target_vocab_to_int, batch_size):
    ending = tf.strided_slice(target_input, [0,0], [batch_size, -1], [1,1])
    decoder_input = tf.concat([tf.fill([batch_size,1], target_vocab_to_int['<GO>']), ending], 1)
    return decoder_input

In [56]:
def decoder_train(encoding_states, decoder_cell, decoder_embeded,
                 target_sequence_length, max_target_sequence_length, output_layer):
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embeded, 
                                                        sequence_length=target_sequence_length,time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=training_helper, 
                                                       initial_state=encoding_states, output_layer=output_layer)
    training_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder, 
                                                impute_finished=True, maximum_iterations=max_target_sequence_length)
    return training_decoder_outputs

def decoder_predict(encoding_states, decoder_cell, decoder_embeded, start_id, end_id, batch_size, 
                    target_sequence_length, max_target_sequence_length, output_layer):
    start_tokens = tf.tile(tf.constant([start_id], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=decoder_embeded, 
                                                                start_tokens=start_tokens, end_token=end_id)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=inference_helper, 
                                                        initial_state=encoding_states, output_layer=output_layer)
    inference_decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder, 
                                                                        impute_finished=True, 
                                                                        maximum_iterations=max_target_sequence_length)
    return inference_decoder_outputs

In [57]:
def decoder(decoder_input, encoder_states, rnn_size, layer_nums,
            target_sequence_length, max_target_sequence_length, 
            target_vocab_size, decoder_embedding_dim, target_vocab_to_int, batch_size):
    #embedding
    target_embedding = tf.Variable(tf.random_uniform([target_vocab_size, decoder_embedding_dim]))
    decoder_embeded = tf.nn.embedding_lookup(target_embedding, decoder_input)
    
    #lstm
    def get_lstm_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2018))
    multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(layer_nums)])
    
    #输出每一个词语的概率值
    output_layer = tf.layers.Dense(target_vocab_size)
    
    with tf.variable_scope('decoder'):
        training_logits = decoder_train(encoder_states, multi_cell, 
                                                 decoder_embeded, target_sequence_length, 
                                                 max_target_sequence_length, output_layer)

    with tf.variable_scope('decoder', reuse=True):
        inference_logits = decoder_predict(encoder_states, multi_cell, target_embedding, 
                                                    target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'], 
                                                    batch_size, target_sequence_length, max_target_sequence_length,
                                                    output_layer)
    return training_logits, inference_logits
    

In [58]:
def seq2seq_model(input_data, target_data, rnn_size, layer_num, 
                 source_sequence_length, target_sequence_length, 
                 source_vocab_size, target_vocab_size, 
                 target_vocab_to_int, batch_size,
                 encoder_embedding_dim, decoder_embedding_dim, max_target_sequence_length):
    encoder_output, encoder_states = encoder(input_data, rnn_size, layer_nums, 
                                             source_sequence_length, source_vocab_size, encoder_embedding_dim)
    
    decoder_input = decoder_layer_input(target_data, target_vocab_to_int, batch_size)
    
    training_decoder_outputs, inference_decoder_outputs = decoder(decoder_input, encoder_states, rnn_size, layer_nums, 
                                                                 target_sequence_length, max_target_sequence_length,
                                                                 target_vocab_size, decoder_embedding_dim, 
                                                                 target_vocab_to_int, batch_size)
    return training_decoder_outputs, inference_decoder_outputs

In [59]:
epochs = 10
batch_size = 128
rnn_size = 128
layer_nums = 1
encoder_embedding_dim = 100
decoder_embedding_dim = 100
learning_rate = 0.001
display_step = 50


In [74]:
#构建图
train_graph = tf.Graph()
with train_graph.as_default():
    inputs, targets, lr, source_sequence_length, target_sequence_length, _ = get_inputs()
    
    max_target_sequence_length = 25
    
    training_decoder_outputs, inference_decoder_outputs = seq2seq_model(inputs, targets, rnn_size, layer_nums,
                                                                source_sequence_length, target_sequence_length,
                                                                len(source_word_to_int), len(target_word_to_int),
                                                                target_word_to_int, batch_size, 
                                                                encoder_embedding_dim, decoder_embedding_dim,
                                                                max_target_sequence_length)
    training_logits = tf.identity(training_decoder_outputs.rnn_output, name='logits')
    inference_logits = tf.identity(inference_decoder_outputs.sample_id, name='predictions')
    
    sequence_mask = tf.sequence_mask(target_sequence_length, 
                                     max_target_sequence_length, dtype=tf.float32, name='masks')
    
    with tf.name_scope('optimization'):
        loss = tf.contrib.seq2seq.sequence_loss(training_logits, targets, sequence_mask)
        
        optimizer = tf.train.AdamOptimizer(lr)
        
        gradients = optimizer.compute_gradients(loss)
        clipped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        training_op = optimizer.apply_gradients(clipped_gradients)
        

In [75]:
def get_batches(sources, targets, batch_size):
    for batch_i in range(len(sources)//batch_size):
        start = batch_i * batch_size
        source_batch = sources[start:start+batch_size]
        target_batch = targets[start:start+batch_size]
        
        target_lengths = []
        for target in target_batch:
            target_lengths.append(len(target))
        
        source_lengths = []
        for source in source_batch:
            source_lengths.append(len(source))
        
        yield source_batch, target_batch, source_lengths, target_lengths

In [83]:
train_source = source_text_to_int[batch_size:]
train_target = target_text_to_int[batch_size:]

valid_source = source_text_to_int[:batch_size]
valid_target = target_text_to_int[:batch_size]
(valid_source_batch, valid_target_batch, valid_source_lengths, valid_target_lengths) = next(get_batches(
    valid_source, valid_target, batch_size))

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        for batch_i, (source_batch, target_batch, source_lengths, target_lengths) in enumerate(
            get_batches(train_source, train_target, batch_size)):
            _, train_loss = sess.run([training_op, loss], feed_dict={
                inputs:source_batch,
                targets:target_batch,
                lr:learning_rate,
                source_sequence_length:source_lengths,
                target_sequence_length:target_lengths
            })
            if (batch_i+1) % display_step == 0:
                valid_loss = sess.run([loss], feed_dict={
                    inputs:valid_source_batch,
                    targets:valid_target_batch,
                    lr:learning_rate,
                    source_sequence_length:valid_source_lengths,
                    target_sequence_length:valid_target_lengths
                })
                print('Epoch {} Batch {} / {} - train loss : {} valid loss {}'.format(
                    epoch, batch_i+1, len(train_source) // batch_size, train_loss, valid_loss))
    saver = tf.train.Saver()
    saver.save(sess, 'model/checkpoints')

Epoch 0 Batch 50 / 1076 - train loss : 2.485210657119751 valid loss [2.404355]
Epoch 0 Batch 100 / 1076 - train loss : 2.124098777770996 valid loss [2.0634453]
Epoch 0 Batch 150 / 1076 - train loss : 1.7109979391098022 valid loss [1.6572142]
Epoch 0 Batch 200 / 1076 - train loss : 1.3446513414382935 valid loss [1.3093833]
Epoch 0 Batch 250 / 1076 - train loss : 1.0928196907043457 valid loss [1.077041]
Epoch 0 Batch 300 / 1076 - train loss : 0.9008116126060486 valid loss [0.9304392]
Epoch 0 Batch 350 / 1076 - train loss : 0.8312863707542419 valid loss [0.8304022]
Epoch 0 Batch 400 / 1076 - train loss : 0.7913984656333923 valid loss [0.7604654]
Epoch 0 Batch 450 / 1076 - train loss : 0.7378848195075989 valid loss [0.7096643]
Epoch 0 Batch 500 / 1076 - train loss : 0.6618960499763489 valid loss [0.67250735]
Epoch 0 Batch 550 / 1076 - train loss : 0.6739674210548401 valid loss [0.6434116]
Epoch 0 Batch 600 / 1076 - train loss : 0.6489595174789429 valid loss [0.6128733]
Epoch 0 Batch 650 / 

Epoch 4 Batch 800 / 1076 - train loss : 0.04452865943312645 valid loss [0.031018548]
Epoch 4 Batch 850 / 1076 - train loss : 0.03118615224957466 valid loss [0.030008055]
Epoch 4 Batch 900 / 1076 - train loss : 0.033441413193941116 valid loss [0.027534194]
Epoch 4 Batch 950 / 1076 - train loss : 0.026513107120990753 valid loss [0.027599463]
Epoch 4 Batch 1000 / 1076 - train loss : 0.035255495458841324 valid loss [0.028992085]
Epoch 4 Batch 1050 / 1076 - train loss : 0.03372083231806755 valid loss [0.026985746]
Epoch 5 Batch 50 / 1076 - train loss : 0.03709790110588074 valid loss [0.025753718]
Epoch 5 Batch 100 / 1076 - train loss : 0.02853628620505333 valid loss [0.02494411]
Epoch 5 Batch 150 / 1076 - train loss : 0.029996275901794434 valid loss [0.02625951]
Epoch 5 Batch 200 / 1076 - train loss : 0.025507405400276184 valid loss [0.024797846]
Epoch 5 Batch 250 / 1076 - train loss : 0.030301712453365326 valid loss [0.023755211]
Epoch 5 Batch 300 / 1076 - train loss : 0.02936396561563015 

Epoch 9 Batch 350 / 1076 - train loss : 0.01210806518793106 valid loss [0.012726715]
Epoch 9 Batch 400 / 1076 - train loss : 0.012467622756958008 valid loss [0.012413729]
Epoch 9 Batch 450 / 1076 - train loss : 0.011604207567870617 valid loss [0.012991531]
Epoch 9 Batch 500 / 1076 - train loss : 0.012234819121658802 valid loss [0.014304447]
Epoch 9 Batch 550 / 1076 - train loss : 0.01606171578168869 valid loss [0.0139122475]
Epoch 9 Batch 600 / 1076 - train loss : 0.01661423221230507 valid loss [0.013314065]
Epoch 9 Batch 650 / 1076 - train loss : 0.010615145787596703 valid loss [0.011320461]
Epoch 9 Batch 700 / 1076 - train loss : 0.00906676147133112 valid loss [0.0116267875]
Epoch 9 Batch 750 / 1076 - train loss : 0.0097084054723382 valid loss [0.013924808]
Epoch 9 Batch 800 / 1076 - train loss : 0.014812331646680832 valid loss [0.011212528]
Epoch 9 Batch 850 / 1076 - train loss : 0.009415068663656712 valid loss [0.012150572]
Epoch 9 Batch 900 / 1076 - train loss : 0.0140612982213497

'model/checkpoints'

In [96]:
cd ..

/Users/lvzcl/Desktop/ml_dl/rnn/seq2seq_translation


In [86]:
def sequence_to_seq(sentence, source_vocab_to_int):
    unk_idx = source_vocab_to_int["<UNK>"]
    word_idx = [source_vocab_to_int.get(word, unk_idx) for word in sentence.lower().split()]
    return word_idx

In [104]:
translate_sentence_text = 'i dislike grapefruit , lemons , and peaches .'
target_text = valid_target[0]
translate_sentence = sequence_to_seq(translate_sentence_text, source_word_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph = loaded_graph) as sess:
    loader = tf.train.import_meta_graph('model/checkpoints.meta')
    loader.restore(sess, tf.train.latest_checkpoint('model/'))
    
    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    
    translate_logits = sess.run(logits, feed_dict={
        input_data:[translate_sentence] * batch_size,
        target_sequence_length:[len(translate_sentence)*2]*batch_size,
        source_sequence_length:[len(translate_sentence)*2]*batch_size
    })[0]

print('【Input】')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_word[i] for i in translate_sentence]))

print('\n【Prediction】')
print('  Word Ids:      {}'.format([i for i in translate_logits]))
print('  French Words: {}'.format([target_int_to_word[i] for i in translate_logits]))

print("\n【Full Sentence】")
print(" ".join([target_int_to_word[i] for i in translate_logits]))

INFO:tensorflow:Restoring parameters from model/checkpoints
【Input】
  Word Ids:      [98, 68, 43, 166, 24, 166, 156, 141, 160]
  English Words: ['i', 'dislike', 'grapefruit', ',', 'lemons', ',', 'and', 'peaches', '.']

【Prediction】
  Word Ids:      [239, 54, 133, 233, 299, 52, 352, 57, 357]
  French Words: ['vous', "n'aimez", 'pamplemousses', ',', 'et', 'les', 'citrons', '.', '<EOS>']

【Full Sentence】
vous n'aimez pamplemousses , et les citrons . <EOS>
