In [76]:
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow.python.layers.core import Dense


## 本次实现的seq2seq模型是给一个单词，让他按照单词顺序输出，也就是让其学习单词排序

In [77]:


with open('./data/letters_source.txt', 'r') as f:
    source_data = f.read()
with open('./data/letters_target.txt', 'r') as f: 
    target_data = f.read()

In [78]:
source_data.split('\n')[:10]

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq']

In [79]:
target_data.split('\n')[:10]

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

## 从上面可以看出来，数据都是给处理好的

In [80]:
def extract_character_vocab(data):
    special_word = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    
    set_words = list(set([word for line in data.split('\n') for word in line]))
    
    int_to_vocab = {i:v for i, v in enumerate(set_words + special_word)}
    vocab_to_int = {v:i for i, v in int_to_vocab.items()}
    
    return int_to_vocab, vocab_to_int

In [81]:
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

In [82]:
#对字母进行转换
source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) 
               for letter in line] for line in source_data.split('\n')]
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) 
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data.split('\n')]

In [83]:
source_int[:10]

[[1, 11, 8, 23, 23],
 [18, 20, 25],
 [4, 1, 12, 6, 0],
 [1, 23, 17],
 [21, 9, 8, 4],
 [19, 3, 3, 8, 7],
 [22, 3, 24, 20, 0, 20, 16],
 [18, 11, 20, 17],
 [10, 6, 4, 15, 13],
 [21, 7, 5, 4, 23]]

In [118]:
def get_input():
    inputs = tf.placeholder(tf.int32, [None, None] , name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length')
    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_length = tf.reduce_max(target_sequence_length, name='max_target_length')
    return inputs, targets, learning_rate, target_sequence_length, max_target_length, source_sequence_length

In [119]:
def get_encoder_layer(inputs, rnn_size, num_layers, source_sequence_length, source_vocab_size, encoding_embedding_size):
    encoder_embed_input = tf.contrib.layers.embed_sequence(inputs, source_vocab_size, encoding_embedding_size)
    
    def get_lstm(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size) for _ in range(num_layers)])
    
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input, 
                                                      sequence_length=source_sequence_length, dtype=tf.float32)
    return encoder_output, encoder_state

In [120]:
def process_decoder_input(data, vocab_to_int, batch_size):
    ending = tf.strided_slice(data, [0,0], [batch_size, -1], )
    decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    
    return decoder_input

In [121]:
def decoder_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size, target_sequence_length, 
                  max_target_length, encoder_state, decoder_input):

    '''
    构造Decoder层
    
    参数：
    - target_letter_to_int: target数据的映射表
    - decoding_embedding_size: embed向量大小
    - num_layers: 堆叠的RNN单元数量
    - rnn_size: RNN单元的隐层结点数量
    - target_sequence_length: target数据序列长度
    - max_target_sequence_length: target数据序列最大长度
    - encoder_state: encoder端编码的状态向量
    - decoder_input: decoder端输入
    '''
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    #构造decoder中的RNN单元
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])
    
    output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    
    with tf.variable_scope('decode'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input, 
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell, training_helper, encoder_state, output_layer)
        
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True, 
                                                                   maximum_iterations= max_target_length)
    
    with tf.variable_scope('decode', reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], 
                              name='start_token')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, 
                                                                  target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell, predicting_helper, encoder_state, output_layer)
        
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder, impute_finished=True, 
                                                                        maximum_iterations= max_target_length)
        
    return training_decoder_output, predicting_decoder_output
    

In [122]:
def seq2seq(inputs, targets, lr, rnn_size, num_layers, source_sequence_length, target_sequence_length, source_vocab_size, 
            target_vocab_size, encoding_embedding_size, decode_embedding_size, max_target_length):
    
    _, encoder_state = get_encoder_layer(inputs, rnn_size, num_layers, 
                                         source_sequence_length, source_vocab_size, encoding_embedding_size)
    decoder_input = process_decoder_input(targets, target_letter_to_int, batch_size)
    
    training_decoder_output, predicting_decoder_output = decoder_layer(target_letter_to_int, decoding_embedding_size, 
                                                                       num_layers, rnn_size, target_sequence_length, 
                                                                       max_target_length, encoder_state, decoder_input)
    return training_decoder_output, predicting_decoder_output

In [123]:
epoches = 60
batch_size = 128
rnn_size = 50
num_layers =2
learning_rate = 0.001
encoding_embedding_size = 15
decoding_embedding_size = 15

In [125]:
train_graph = tf.Graph()

with train_graph.as_default():
    input_data, target_data, lr, target_sequence_length, max_target_length, source_sequence_length = get_input()
    
    training_decoder_output, predicting_decoder_output = seq2seq(input_data, target_data, lr, rnn_size, num_layers, 
                                                                source_sequence_length, target_sequence_length, 
                                                                len(source_letter_to_int), len(target_letter_to_int), 
                                                                encoding_embedding_size, 
                                                                decoding_embedding_size, max_target_length)
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='prediction')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_length, dtype= tf.float32, name='masks')
    
    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, target_data, masks)
        optimizer = tf.train.AdamOptimizer(lr)
        
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

In [126]:
def pad_sentence_batch(sentence_batch, pad_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int]*(max_sentence - len(sentence)) for sentence in sentence_batch]

In [127]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    for batch_i in range(len(sources) // batch_size):
        start_i = batch_i * batch_size
        source_batch = sources[start_i: start_i + batch_size]
        target_batch = targets[start_i: start_i + batch_size]
        
        pad_source_batch = np.array(pad_sentence_batch(source_batch, source_pad_int))
        pad_target_batch = np.array(pad_sentence_batch(target_batch, target_pad_int))
        
        target_length = []
        for target in target_batch:
            target_length.append(len(target))
        
        source_length = []
        for source in source_batch:
            source_length.append(len(source))
            
        yield  pad_target_batch, pad_source_batch, target_length, source_length

In [132]:
#留出一个batch_size 作为验证集
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]

valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]

valid_target_batch, valid_source_batch, valid_target_length, valid_source_length = next(get_batches(
    valid_target, valid_source, batch_size, source_letter_to_int['<PAD>'], target_letter_to_int['<PAD>']))

In [138]:
display_step = 50
checkpoint = './train_model.ckpt'

with tf.Session(graph = train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epoches+1):
        for batch_i, (target_batch, source_batch, target_length, source_length) in enumerate(get_batches(
            train_target, train_source, batch_size, source_letter_to_int['<PAD>'], target_letter_to_int['<PAD>'])):
            _, loss = sess.run([train_op, cost], feed_dict={
                input_data:source_batch,
                target_data:target_batch,
                lr:learning_rate,
                target_sequence_length:target_length,
                source_sequence_length:source_length,
            })
            if (batch_i+1) % display_step == 0:
                valid_loss = sess.run([cost], feed_dict={
                    input_data:valid_source_batch,
                    target_data:valid_target_batch,
                    lr:learning_rate,
                    target_sequence_length:valid_target_length,
                    source_sequence_length:valid_source_length,
                })
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epoches, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              valid_loss[0]))
                
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    print('Model train and saved')

Epoch   1/60 Batch   49/77 - Training Loss:  2.939  - Validation loss:  2.936
Epoch   2/60 Batch   49/77 - Training Loss:  2.202  - Validation loss:  2.189
Epoch   3/60 Batch   49/77 - Training Loss:  1.802  - Validation loss:  1.764
Epoch   4/60 Batch   49/77 - Training Loss:  1.493  - Validation loss:  1.469
Epoch   5/60 Batch   49/77 - Training Loss:  1.268  - Validation loss:  1.245
Epoch   6/60 Batch   49/77 - Training Loss:  1.092  - Validation loss:  1.067
Epoch   7/60 Batch   49/77 - Training Loss:  0.924  - Validation loss:  0.908
Epoch   8/60 Batch   49/77 - Training Loss:  0.776  - Validation loss:  0.757
Epoch   9/60 Batch   49/77 - Training Loss:  0.660  - Validation loss:  0.637
Epoch  10/60 Batch   49/77 - Training Loss:  0.566  - Validation loss:  0.542
Epoch  11/60 Batch   49/77 - Training Loss:  0.484  - Validation loss:  0.469
Epoch  12/60 Batch   49/77 - Training Loss:  0.410  - Validation loss:  0.399
Epoch  13/60 Batch   49/77 - Training Loss:  0.351  - Validation

In [139]:
def source_to_seq(text):
    sequence_length = 7
    return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] + [
        source_letter_to_int['<PAD>']]*(sequence_length - len(text))

In [142]:
input_word = 'common'
text = source_to_seq(input_word)

checkpoint = './train_model.ckpt'
loaded_graph = tf.Graph()

with tf.Session(graph = loaded_graph) as  sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('prediction:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(input_word)]*batch_size, 
                                      source_sequence_length: [len(input_word)]*batch_size})[0] 


pad = source_letter_to_int["<PAD>"] 

print('原始输入:', input_word)

print('\nSource')
print('  Word 编号:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))

print('\nTarget')
print('  Word 编号:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./train_model.ckpt
原始输入: common

Source
  Word 编号:    [5, 15, 7, 7, 15, 18, 26]
  Input Words: c o m m o n <PAD>

Target
  Word 编号:       [5, 7, 7, 18, 15, 15]
  Response Words: c m m n o o
