In [31]:
import numpy as np
import tensorflow as tf
import tqdm

In [2]:
with open('./data/small_vocab_en', 'r', encoding='utf-8') as f:
    source_text = f.read()
with open('./data/small_vocab_fr', 'r', encoding='utf-8') as f:
    target_text = f.read()

In [12]:
view_sentence_range = (0,10)
print('Data stats')
print('Rough the number of unique words: {}'.format(len({word:None for word in source_text.split(' ')})))

print('-'*10 + 'English text' + '-'*10)
sentences = source_text.split('\n')
word_counts = [len(sentence.split(' ')) for sentence in sentences]
print('number of sentences is: {}'.format(len(sentences)))
print('Average number of word in sentence is: {}'.format(np.average(word_counts)))
print('Max number of word in sentence is {}'.format(np.max(word_counts)))

print('-'*10 + 'French text' + '-'*10)
sentences = target_text.split('\n')
words_counts = [len(sentence) for sentence in sentences]
print('number of sentences is {}'.format(len(sentences)))
print('Average number of word in sentence is {}'.format(np.average(word_counts)))
print('Max number of word in sentence is {}'.format(np.max(word_counts)))

print('English sentences from {} to {}'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0] : view_sentence_range[1]]))

print('French sentences from {} to {}'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0] : view_sentence_range[1]]))



Data stats
Rough the number of unique words: 852
----------English text----------
number of sentences is: 137861
Average number of word in sentence is: 13.225589543090503
Max number of word in sentence is 17
----------French text----------
number of sentences is 137861
Average number of word in sentence is 13.225589543090503
Max number of word in sentence is 17
English sentences from 0 to 10
new jersey is sometimes quiet during autumn , and it is snowy in april .
the united states is usually chilly during july , and it is usually freezing in november .
california is usually quiet during march , and it is usually hot in june .
the united states is sometimes mild during june , and it is cold in september .
your least liked fruit is the grape , but my least liked is the apple .
his favorite fruit is the orange , but my favorite is the grape .
paris is relaxing during december , but it is usually chilly in july .
new jersey is busy during spring , and it is never hot in march .
our least l

In [26]:
#之后需要构造映射
#特殊字符
source_codes = ['<PAD>', '<UNK>']
target_codes = ['<GO>', '<PAD>', '<UNK>', '<EOS>']


In [21]:
source_vocab = list(set(source_text.lower().split()))
target_vocab = list(set(target_text.lower().split()))

In [22]:
print('number of words in source vocab : {}'.format(len(source_vocab)))
print('number of words in target vocab : {}'.format(len(target_vocab)))

number of words in source vocab : 227
number of words in target vocab : 354


In [44]:
#构造字典
source_vocab_to_int = {word : idx for idx, word in enumerate(source_codes + source_vocab)}
source_int_to_vocab = {idx : word for idx, word in enumerate(source_codes + source_vocab)}

target_vocab_to_int = {word : idx for idx, word in enumerate(target_codes + target_vocab)}
target_int_to_vocab = {idx : word for idx, word in enumerate(target_codes + target_vocab)}

In [45]:
print('The size of English Map is: {}'.format(len(source_vocab_to_int)))
print('The size of French Map is : {}'.format(len(target_vocab_to_int)))

The size of English Map is: 229
The size of French Map is : 358


In [62]:
def text_to_int(sentence, map_dict, max_length=20, is_target=False):
    text_to_idx = []
    
    unk_to_idx = map_dict.get('<UNK>')
    pad_to_idx = map_dict.get('<PAD>')
    eos_to_idx = map_dict.get('<EOS>')
    
    #如果是输入文本，直接将句子中的单词的idx加入list，如果不是，则在后面添加EOS
    if not is_target:
        for word in sentence.lower().split():
            text_to_idx.append(map_dict.get(word, unk_to_idx))
    else:
        for word in sentence.lower().split():
            text_to_idx.append(map_dict.get(word, unk_to_idx))
        text_to_idx.append(eos_to_idx)
    #超出长度需要截断，长度不够，进行填充
    if len(text_to_idx) > max_length:
        text_to_idx = text_to_idx[:max_length]
    else:
        text_to_idx = text_to_idx + [pad_to_idx] * (max_length - len(text_to_idx))
    return text_to_idx

In [63]:
source_text_to_int = []
for sentence in tqdm.tqdm(source_text.split('\n')):
    source_text_to_int.append(text_to_int(sentence, source_vocab_to_int, 20))

100%|██████████| 137861/137861 [00:01<00:00, 114051.30it/s]


In [64]:
target_text_to_int = []
for sentence in tqdm.tqdm(target_text.split('\n')):
    target_text_to_int.append(text_to_int(sentence, target_vocab_to_int, 25, is_target=True))

100%|██████████| 137861/137861 [00:01<00:00, 100096.71it/s]


In [65]:
random_index= 77
print('-'*5 + 'English' + '-'*5)
print(source_text.split('\n')[random_index])
print(source_text_to_int[random_index])

print('-'*5 + 'French' + '-'*5)
print(target_text.split('\n')[random_index])
print(target_text_to_int[random_index])

-----English-----
the united states is never beautiful during march , and it is usually relaxing in summer .
[170, 135, 63, 108, 62, 49, 33, 18, 145, 15, 12, 108, 168, 80, 165, 159, 160, 0, 0, 0]
-----French-----
les états-unis est jamais belle en mars , et il est relaxant habituellement en été .
[306, 85, 91, 210, 47, 151, 28, 156, 197, 177, 91, 51, 201, 151, 147, 66, 3, 1, 1, 1, 1, 1, 1, 1, 1]


In [112]:
def get_inputs():
    #获取输入参数，分别为inputs, targets, learning_rate, max_target_length, source_sentence_length, target_sentence_length
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    source_sequence_length = tf.placeholder(tf.int32, [None,], name='source_sequence_length')
    target_sequence_length = tf.placeholder(tf.int32, [None,], name='target_sequence_length')
    max_target_length = tf.placeholder(tf.int32, [None,], name='max_target_length')
    
    return inputs, targets, learning_rate, source_sequence_length, target_sequence_length, max_target_length


In [113]:
def encoder_layer(inputs, rnn_size, num_layers, source_sequence_length, source_vocab_size,encoder_embedding_size=100):
    encoder_embed = tf.contrib.layers.embed_sequence(inputs, source_vocab_size, encoder_embedding_size)
    
    def get_lstm_cell(rnn_size):
        return tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
    lstms = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(lstms, encoder_embed, source_sequence_length, dtype=tf.float32)
    return encoder_output, encoder_state

In [114]:
def decoder_layer_input(target_data, target_vocab_to_int, batch_size):
    #删除最后一个字符，在第一个字符位置添加'<GO>'
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1,1])
    decoder_inputs = tf.concat([tf.fill([batch_size, 1], target_vocab_to_int["<GO>"]), ending], 1)
    return decoder_inputs

In [115]:
def decoder_layer_train(encoder_states, decoder_cell, decoder_embed, 
                        target_sequence_length, max_target_length, output_layer):
    """
    Decoder端的训练
    
    @param encoder_states: Encoder端编码得到的Context Vector
    @param decoder_cell: Decoder端
    @param decoder_embed: Decoder端词向量嵌入后的输入
    @param target_sequence_len: 法语文本的长度
    @param max_target_sequence_len: 法语文本的最大长度
    @param output_layer: 输出层
    """
    training_helper = tf.contrib.seq2seq.TrainingHelper(decoder_embed, target_sequence_length, time_major=False)
    
    training_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, training_helper, encoder_states, output_layer)
    
    training_outputs, _,  _ = tf.contrib.seq2seq.dynamic_decode(
        training_decoder, impute_finished=True, maximum_iterations=max_target_length)
    
    return training_outputs

In [116]:
def decoder_layer_infer(encoder_states, decoder_cell, decoder_embed, 
                        start_id, end_id, max_target_length, output_layer, batch_size):
    """
    Decoder端的预测/推断
    
    @param encoder_states: Encoder端编码得到的Context Vector
    @param decoder_cell: Decoder端
    @param decoder_embed: Decoder端词向量嵌入后的输入
    @param start_id: 句子起始单词的token id， 即"<GO>"的编码
    @param end_id: 句子结束的token id，即"<EOS>"的编码
    @param max_target_sequence_len: 法语文本的最大长度
    @param output_layer: 输出层
    @batch_size: batch size
    """
    start_tokens = tf.tile(tf.constant([start_id],dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embed, start_tokens, end_id)
    
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, inference_helper, encoder_states, output_layer)
    
    inference_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder, impute_finished=True, maximum_iterations=max_target_length)
    
    return inference_outputs

In [117]:
def decoder_layer(encoder_states, decoder_inputs, rnn_size, num_layers,
                 target_sequence_length, max_target_length, target_vocab_to_int,
                  target_vocab_size, decoder_embedding_size, batch_size):
    decoder_embedding = tf.Variable(tf.random_uniform([target_vocab_size, decoder_embedding_size]))
    decoder_embed = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs)
    
    def get_lstm(rnn_size):
        return tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=456))
    
    decoder_cell = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size) for _ in range(num_layers)])
    
    output_layer = tf.layers.Dense(target_vocab_size)
    
    with tf.variable_scope('decoder'):
        training_logits = decoder_layer_train(encoder_states, decoder_cell, 
                                              decoder_embed, target_sequence_length, max_target_length, output_layer)
        
    with tf.variable_scope('decoder', reuse=True):
        inference_logits = decoder_layer_infer(encoder_states, decoder_cell, decoder_embedding,
                                               target_vocab_to_int['<GO>'], target_vocab_to_int['<EOS>'],
                                               max_target_length, output_layer, batch_size)
    return training_logits, inference_logits

In [118]:
def seq2seq_model(input_data, target_data, batch_size, 
                  source_sequence_length, target_sequence_length, max_target_length,
                  source_vocab_size, target_vocab_size, encoder_embedding_size, decoder_embedding_size, 
                  rnn_size, num_layers, target_vocab_to_int):
    encoder_output, encoder_state = encoder_layer(inputs, rnn_size, num_layers, source_sequence_length, 
                                                 source_vocab_size, encoder_embedding_size)
    decoder_inputs = decoder_layer_input(target_data, target_vocab_to_int, batch_size)
    
    training_logits, inference_logits = decoder_layer(encoder_state, decoder_inputs, rnn_size, num_layers,
                                                     target_sequence_length, max_target_length, target_vocab_to_int,
                                                     target_vocab_size, encoder_embedding_size, batch_size)
    
    return training_logits, inference_logits

In [119]:
epochs = 10
batch_size = 128

rnn_size = 128
num_layers = 1
encoder_embedding_size = 100
decoder_embedding_size = 100
lr = 0.001
display_step = 50

In [120]:
train_graph = tf.Graph()

with tf.Session(graph=train_graph) as sess:
    
    inputs, targets, learning_rate, source_sequence_length, target_sequence_length, max_target_length = get_inputs()
    max_target_length = 25 
    training_logits, inference_logits = seq2seq_model(inputs, targets, batch_size, source_sequence_length,
                                                      target_sequence_length, max_target_length,
                                                      len(source_vocab_to_int), len(target_vocab_to_int), 
                                                      encoder_embedding_size, decoder_embedding_size, rnn_size,
                                                      num_layers, target_vocab_to_int)
    
    training_logits = tf.identity(training_logits.rnn_output, name='logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_length, dtype=tf.float32, name='mask')
    
    with tf.name_scope('optimization'):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(cost)
        clipped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
        training_op = optimizer.apply_gradients(clipped_gradients)

In [121]:
def get_batch(sources, targets, batch_size):
    for batch_i in range(0, len(sources) // batch_size):
        start_i = batch_i * batch_size
        
        source_batch = sources[start_i : start_i + batch_size]
        target_batch = targets[start_i : start_i + batch_size]
        
        target_length = []
        for target in target_batch:
            target_length.append(len(target))
            
        source_length = []
        for source in source_batch:
            source_length.append(len(source))
        yield source_batch, target_batch, source_length, target_length

In [126]:
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(epochs):
        for batch_i ,(source_batch, target_batch, source_length, target_length) in enumerate(
            get_batch(source_text_to_int, target_text_to_int, batch_size)):
            
            _, loss = sess.run([training_op, cost], feed_dict={
                inputs:source_batch,
                targets:target_batch, 
                learning_rate:lr,
                source_sequence_length: source_length,
                target_sequence_length: target_length,
            })
            
            if batch_i % display_step == 0 and batch_i > 0:


                batch_train_logits = sess.run(
                    inference_logits,
                    {inputs: source_batch,
                     source_sequence_length: source_length,
                     target_sequence_length: target_length})

                print('Epoch {:>3} Batch {:>4}/{} - Loss: {:>6.4f}'
                      .format(epoch_i, batch_i, len(source_text_to_int) // batch_size, loss))
    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, "checkpoints/dev")
    print('Model Trained and Saved')

Epoch   0 Batch   50/1077 - Loss: 2.5459
Epoch   0 Batch  100/1077 - Loss: 2.1668
Epoch   0 Batch  150/1077 - Loss: 1.7511
Epoch   0 Batch  200/1077 - Loss: 1.3374
Epoch   0 Batch  250/1077 - Loss: 1.0660
Epoch   0 Batch  300/1077 - Loss: 0.8806
Epoch   0 Batch  350/1077 - Loss: 0.8053
Epoch   0 Batch  400/1077 - Loss: 0.7636
Epoch   0 Batch  450/1077 - Loss: 0.7136
Epoch   0 Batch  500/1077 - Loss: 0.6349
Epoch   0 Batch  550/1077 - Loss: 0.6558
Epoch   0 Batch  600/1077 - Loss: 0.6260
Epoch   0 Batch  650/1077 - Loss: 0.5812
Epoch   0 Batch  700/1077 - Loss: 0.5408
Epoch   0 Batch  750/1077 - Loss: 0.5455
Epoch   0 Batch  800/1077 - Loss: 0.5466
Epoch   0 Batch  850/1077 - Loss: 0.4790
Epoch   0 Batch  900/1077 - Loss: 0.4970
Epoch   0 Batch  950/1077 - Loss: 0.4538
Epoch   0 Batch 1000/1077 - Loss: 0.4640
Epoch   0 Batch 1050/1077 - Loss: 0.4632
Epoch   1 Batch   50/1077 - Loss: 0.4417
Epoch   1 Batch  100/1077 - Loss: 0.4287
Epoch   1 Batch  150/1077 - Loss: 0.3953
Epoch   1 Batch 

KeyboardInterrupt: 

In [124]:
def sentence_to_seq(sentence, source_vocab_to_int):
    """
    将句子转化为数字编码
    """
    unk_idx = source_vocab_to_int["<UNK>"]
    word_idx = [source_vocab_to_int.get(word, unk_idx) for word in sentence.lower().split()]
    
    return word_idx

In [None]:
translate_sentence_text = input("请输入句子：")

In [None]:
translate_sentence = sentence_to_seq(translate_sentence_text, source_vocab_to_int)

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph('checkpoints/dev.meta')
    loader.restore(sess, tf.train.latest_checkpoint('./checkpoints'))

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_len:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_len:0')

    translate_logits = sess.run(logits, {input_data: [translate_sentence]*batch_size,
                                         target_sequence_length: [len(translate_sentence)*2]*batch_size,
                                         source_sequence_length: [len(translate_sentence)]*batch_size})[0]

print('【Input】')
print('  Word Ids:      {}'.format([i for i in translate_sentence]))
print('  English Words: {}'.format([source_int_to_vocab[i] for i in translate_sentence]))

print('\n【Prediction】')
print('  Word Ids:      {}'.format([i for i in translate_logits]))
print('  French Words: {}'.format([target_int_to_vocab[i] for i in translate_logits]))

print("\n【Full Sentence】")
print(" ".join([target_int_to_vocab[i] for i in translate_logits]))