### 数据加载

In [44]:
import tensorflow as tf
import pandas as pd
import time

In [45]:
with open('data/letters_source.txt', 'r', encoding='utf-8') as f:
    source_data = f.read()
    
with open('data/letters_target.txt', 'r', encoding='utf-8') as f:
    target_data = f.read()
    

In [46]:
# 数据预览
source_data.split('\n')[:10]

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq']

In [47]:
target_data.split("\n")[:10]

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

### 数据预处理


In [48]:
special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']
set_words = list(set([character for line in source_data.split('\n') for character in line]))   #character --> line—> source_data.split
idx_to_vocab = {idx:word for idx,word in enumerate(special_words + set_words)}    # set list for vocab
vocab_to_int = {word: idx for idx, word in idx_to_vocab.items()}
# idx_to_vocab.items()
# source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
# source_int = [[vocab_to_int.get(letter,vocab_to_int['<UNK>']) 
#                for letter in line] for line in source_data.split('\n')]
source_int = [[vocab_to_int.get(letter,vocab_to_int["<UNK>"]) for letter in line ]for line in source_data.split("\n")]
target_int = [[vocab_to_int.get(letter,vocab_to_int["<UNK>"])for letter in line] + [vocab_to_int["<EOS>"]] for line in target_data.split("\n")]

target_int
# source_int

[[28, 29, 16, 16, 23, 3],
 [12, 11, 26, 3],
 [29, 22, 17, 9, 25, 3],
 [29, 16, 13, 3],
 [28, 18, 21, 17, 3],
 [28, 27, 27, 7, 5, 3],
 [27, 24, 4, 22, 11, 11, 15, 3],
 [12, 11, 23, 13, 3],
 [19, 17, 10, 9, 6, 3],
 [20, 21, 17, 7, 16, 3],
 [24, 18, 14, 3],
 [13, 3],
 [17, 11, 3],
 [28, 29, 24, 12, 3],
 [28, 20, 21, 17, 3],
 [28, 19, 19, 5, 3],
 [8, 3],
 [21, 23, 23, 5, 6, 6, 3],
 [17, 16, 13, 3],
 [24, 21, 12, 16, 5, 6, 3],
 [24, 4, 22, 11, 6, 3],
 [4, 17, 3],
 [29, 21, 23, 5, 26, 3],
 [8, 21, 13, 3],
 [27, 3],
 [19, 19, 18, 15, 3],
 [19, 21, 21, 25, 3],
 [22, 25, 3],
 [20, 20, 17, 16, 3],
 [22, 17, 17, 12, 10, 15, 3],
 [8, 12, 16, 5, 3],
 [28, 3],
 [4, 14, 25, 3],
 [29, 11, 16, 14, 25, 3],
 [20, 27, 17, 10, 5, 13, 26, 3],
 [24, 24, 10, 14, 25, 26, 3],
 [27, 19, 17, 3],
 [24, 19, 21, 17, 12, 6, 6, 3],
 [11, 3],
 [27, 21, 17, 25, 3],
 [11, 3],
 [29, 3],
 [20, 18, 21, 3],
 [28, 10, 11, 23, 5, 5, 3],
 [20, 19, 19, 22, 22, 12, 3],
 [20, 18, 17, 12, 10, 16, 6, 3],
 [7, 12, 9, 3],
 [28, 18, 18

In [49]:
def extract_character_vocab(data):
    """构造映射表，将文本中的字符映射为id"""
    """
    param: data，待要转化的文本输入。列表类型
    return：将文本进行转化后的字典
    """
    # 添加特殊字符
    special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']
    # 通过两个for语句做一个列表生成式
    set_words = list(set([character for line in data.split('\n') for character in line]))
    # 这里要把四个特殊字符添加进词典,通过键值对将word 与idx 组成词典
    int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
    # idx_to_vocab.items()也就是获取字典中的键值对key:values
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    return int_to_vocab, vocab_to_int


In [50]:
# 构造映射表
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# 对字母进行转换
# 获取source数据中（letter）对应id数值，如果该该数据在字典（source_letter_to_int）中没有，则返回"UNK"对应的数字，以该数值作为填充。
source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) 
               for letter in line] for line in source_data.split('\n')]
# target中为句子末尾添加“EOS”终止符
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) 
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data.split('\n')] 

In [51]:
# 查看结果
source_data[:10]  # 转化之前的
source_int[:10]  # 转化之后的
target_int[:10]

[[28, 29, 16, 16, 23, 3],
 [12, 11, 26, 3],
 [29, 22, 17, 9, 25, 3],
 [29, 16, 13, 3],
 [28, 18, 21, 17, 3],
 [28, 27, 27, 7, 5, 3],
 [27, 24, 4, 22, 11, 11, 15, 3],
 [12, 11, 23, 13, 3],
 [19, 17, 10, 9, 6, 3],
 [20, 21, 17, 7, 16, 3]]

### 构建模型

#### 输入层

In [52]:
def get_inputs():
    """
    输出tensor
    """
    inputs = tf.placeholder(tf.int32,[None,None],name="inputs")
    targets = tf.placeholder(tf.int32,[None,None],name= "targets")
    learning_rate = tf.placeholder(tf.float32,(None,),name = "learing_rate")
    
    # 定义序列长度
    target_sequence_length = tf.placeholder(tf.int32,(None,),name="target_sequence_length")
    max_source_sequence_length = tf.reduce_max(source_sequence_length,"max_target_length")
    source_sequence_length = tf.placeholder(tf.int32,(None,),name="source_sequence_length")
    
    return inputs,targets,learning_rate,target_sequence_length,max_source_sequence_length,source_sequence_length

    

#### Encoder

在Encoder端，我们需要进行两步，第一步是将我们输入的文本数据进行word embedding操作，也就是文本数据的向量化操作。第二步是将embedding后的向量特征传入RNN

    embedding所对应的API接口 使用tf.contrib.layers.embed_sequence，它会对每个batch执行embedding操作。
    https://www.tensorflow.org/api_docs/python/tf/contrib/layers/embed_sequence

In [61]:
def get_encoder_layer(input_data,run_size,num_layers,sourc_sequence_length,source_dict_length,encoding_embedding_size):
    """
    para:
    - input_data: 输入tensor
    - rnn_size:rnn隐藏层节点数
    - num_layers: 堆叠rnn的数量
    - source_sequence_length：源数据的序列长度
    - source_dict_length源数据的词典大小
    - encoding_embedding_size:embedding的大小
    """
    # Encoder embedding
    encoder_embedded_input = tf.contrib.layers.embed_sequence(input_data,source_vocab_size,encoding_embedding_size)
    
# RNNcell
    # 定义每个小的lstm单元
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contirb.rnn.LSTMcell(rnn_size,initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    # 搭建多层RNN
    cell = tf.contirb.layers.rnn.MultiRNNCell([get_lstm_cell[rnn_size] for _ in range(num_layers)])
    # 动态RNN输出
    tf.nn.dynamic_rnn(cell,encoder_embedded_input,sque)
    