In [2]:
import numpy as np
import time
import tensorflow as tf

# from tensorflow.python

# Raw string literals keep the Windows backslashes literal. In a normal string
# literal, sequences such as "\p" or "\S" are invalid escapes that trigger
# warnings on recent Python versions; the resulting path text is unchanged.
with open(r'F:\project\MachineLearning\DeepLearning\Data\Seq2seq\letters_source.txt', 'r', encoding='utf-8') as f:
    source_data = f.read()

with open(r'F:\project\MachineLearning\DeepLearning\Data\Seq2seq\letters_target.txt', 'r', encoding='utf-8') as f:
    target_data = f.read()

In [3]:
source_data.split('\n')[:20]

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq',
 'ire',
 'v',
 'pl',
 'neab',
 'lcak',
 'htah',
 'f',
 'zskzts',
 'vlq',
 'nzqtek']

In [None]:
def extract_character_vocab(data):
    '''
    Build the character <-> integer id lookup tables for a corpus.

    Ids 0-3 are reserved for the special tokens <PAD>, <UNK>, <GO> and <EOS>
    (in that order); the distinct characters found in ``data`` follow in
    sorted order, so the same corpus always yields the same ids.

    :param data: the whole corpus as a single string, one sequence per line
    :return: tuple ``(int_to_vocab, vocab_to_int)`` of dicts mapping
             id -> token and token -> id
    '''
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    # Sort the distinct characters: iterating a bare set of strings follows
    # hash order, which may differ between interpreter runs and would make the
    # ids (and anything trained on them) irreproducible.
    set_words = sorted({character for line in data.split('\n') for character in line})
    # The four special tokens come first so that they receive ids 0-3
    int_to_vocab = dict(enumerate(special_words + set_words))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    return int_to_vocab, vocab_to_int

In [None]:
# Build the lookup tables for the source and the target corpus
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# Encode every line as a list of character ids; a character that is missing
# from the lookup table falls back to the <UNK> id
source_int = [
    [source_letter_to_int.get(char, source_letter_to_int['<UNK>']) for char in sequence]
    for sequence in source_data.split('\n')
]
# Target lines are encoded the same way and additionally end with the <EOS> id
target_int = [
    [target_letter_to_int.get(char, target_letter_to_int['<UNK>']) for char in sequence]
    + [target_letter_to_int['<EOS>']]
    for sequence in target_data.split('\n')
]

In [None]:
source_int[:10]

In [None]:
target_int[:10]

In [None]:
def get_inputs():
    '''
    Create the placeholders (plus one derived tensor) that feed the model.

    :return: tuple ``(inputs, targets, learning_rate, target_sequence_length,
             max_target_sequence_length, source_sequence_length)``
    '''
    # 2-D int32 placeholders with both dimensions left unspecified
    inputs = tf.placeholder(dtype=tf.int32, shape=(None, None), name='inputs')
    targets = tf.placeholder(dtype=tf.int32, shape=(None, None), name='targets')
    learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

    # 1-D int32 placeholders holding the sequence lengths
    target_sequence_length = tf.placeholder(dtype=tf.int32, shape=[None], name='target_sequence_length')
    source_sequence_length = tf.placeholder(dtype=tf.int32, shape=[None], name='source_sequence_length')

    # Largest value among the fed target lengths
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_len')

    return (inputs, targets, learning_rate,
            target_sequence_length, max_target_sequence_length, source_sequence_length)

In [None]:
# Encoder
def get_encoder_layer(input_data, rnn_size, num_layers, source_sequence_length, source_vocab_size, encoding_embedding_size):
    '''
    Build the encoder: embed the input ids and run them through stacked LSTM cells.

    :param input_data: tensor of ids handed to ``tf.contrib.layers.embed_sequence``
    :param rnn_size: number of units in each LSTM cell
    :param num_layers: number of LSTM cells stacked in the MultiRNNCell
    :param source_sequence_length: sequence lengths passed to ``tf.nn.dynamic_rnn``
    :param source_vocab_size: vocabulary size used for the embedding
    :param encoding_embedding_size: embedding dimension
    :return: tuple ``(encoder_output, encoder_state)`` as returned by ``tf.nn.dynamic_rnn``
    '''
    # Encoder embedding
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, encoding_embedding_size)

    # RNN cell
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell

    # MultiRNNCell expects a list of cell objects, one freshly built cell per
    # layer. The comprehension brackets must enclose the whole loop; otherwise a
    # generator of one-element lists is passed instead of a list of cells.
    cell = tf.nn.rnn_cell.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
    encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input,
                                                      sequence_length=source_sequence_length, dtype=tf.float32)
    return encoder_output, encoder_state

In [None]:
# Decoder
# Preprocess the target data before it is fed to the decoder
def process_decoder_input(data, vocab_to_int, batch_size):
    '''
    Drop the last column of ``data`` and prepend a column of <GO> ids.

    :param data: 2-D tensor that is sliced over rows ``0..batch_size``
    :param vocab_to_int: lookup table that contains the '<GO>' token
    :param batch_size: number of rows to slice and to fill with <GO>
    :return: tuple ``(ending, decoder_input)``: the slice without the last
             column, and that slice with the <GO> column in front
    '''
    # Rows 0..batch_size, every column except the last one
    trimmed = tf.strided_slice(data, begin=[0, 0], end=[batch_size, -1], strides=[1, 1])
    # One <GO> id per row, shaped (batch_size, 1) so it can be concatenated
    go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    decoder_input = tf.concat([go_column, trimmed], axis=1)
    return trimmed, decoder_input

In [None]:
# Build the decoder: embedding, stacked RNN cells, output layer, training and inference decoders
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                   target_sequence_length, max_target_sequence_length, encoder_state, decoder_input):
    '''
    Build the decoder and return its training and inference outputs.

    NOTE: the inference branch reads ``batch_size`` from the enclosing
    (module-level) scope; it is not a parameter of this function and must be
    defined before the function is called.

    :param target_letter_to_int: lookup table of the target data; must contain '<GO>' and '<EOS>'
    :param decoding_embedding_size: embedding dimension
    :param num_layers: number of stacked LSTM cells
    :param rnn_size: number of units in each LSTM cell
    :param target_sequence_length: sequence lengths passed to the TrainingHelper
    :param max_target_sequence_length: iteration cap passed to ``dynamic_decode``
    :param encoder_state: encoder state used as the initial decoder state
    :param decoder_input: ids looked up in the decoder embedding for training
    :return: tuple ``(training_decoder_output, predicting_decoder_output)``
    '''
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

    # 2. RNN cells of the decoder
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell
    cell = tf.nn.rnn_cell.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # 3. Output projection. BasicDecoder takes a layer *object*, so the Dense
    # class is used here; the functional tf.layers.dense needs an input tensor
    # and cannot be called with the unit count alone.
    output_layer = tf.layers.Dense(target_vocab_size,
                                   kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    # 4. Training decoder
    with tf.variable_scope('decode'):
        # Helper that feeds the embedded decoder input at each step
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell, training_helper, encoder_state, output_layer)
        # Only the first element (the decoder outputs) is needed. Indexing
        # instead of unpacking works whether dynamic_decode returns two or
        # three values.
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    # 5. Predicting decoder
    # Re-enters the 'decode' scope with reuse=True to share the training parameters
    with tf.variable_scope('decode', reuse=True):
        # One <GO> id per example: a constant tiled batch_size times
        # (batch_size comes from the enclosing scope, see the docstring)
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32),
                               [batch_size], name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                     start_tokens,
                                                                     target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                             predicting_helper,
                                                             encoder_state,
                                                             output_layer)
        predicting_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            predicting_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    return training_decoder_output, predicting_decoder_output

In [None]:
# 构建seq2seq模型
def seq2seq_model()