In [11]:
import tensorflow as tf
import numpy as np
import warnings
import matplotlib.pyplot as plt
import tqdm
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
warnings.filterwarnings('ignore')

In [4]:
with open('../data/letters_source.txt', 'r') as f:
    source_data = f.read()
f.close()
with open('../data/letters_target.txt', 'r') as f:
    target_data = f.read()
f.close()

In [7]:
source_data = source_data.split('\n')
target_data = target_data.split('\n')

In [12]:
source_data[:10]
target_data[:10]
len(source_data)
len(target_data)

['bsaqq',
 'npy',
 'lbwuj',
 'bqv',
 'kial',
 'tddam',
 'edxpjpg',
 'nspv',
 'huloz',
 'kmclq']

['abqqs',
 'npy',
 'bjluw',
 'bqv',
 'aikl',
 'addmt',
 'degjppx',
 'npsv',
 'hlouz',
 'cklmq']

10000

10000

In [20]:
def character_dict(data):
    #构造映射表
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    set_words = list(set([character for line in data for character in line]))
    int_to_character = {idx: character for idx, character in enumerate(set_words + special_words)}
    character_to_int = {character: idx for idx, character in int_to_character.items()}
    return character_to_int, int_to_character

In [33]:
source_character_to_int, source_int_to_character = character_dict(source_data)
target_character_to_int, target_int_to_character = character_dict(target_data)
#构造输入数据,将字母用数字表示
source_int = [[source_character_to_int.get(character, '<UNK>') for character in line] for line in source_data]
target_int = [[target_character_to_int.get(character, '<UNK>') for character in line] + [
    target_character_to_int['<EOS>']] for line in source_data]

In [81]:
def get_input():
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')

    target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name='max_target_sequnence_length')
    source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_target_length')
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_length

In [82]:
def Encoder(input_data, rnn_size, layer_nums, 
            source_sequence_length, source_character_size, encoding_embedding_size):
    encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_character_size, encoding_embedding_size)
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(layer_nums)])
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(multi_cell, encoder_embed_input, source_sequence_length, 
                                                       dtype=tf.float32)
    return encoder_outputs, encoder_state

In [83]:
def process_decoder_input(data, character_to_int, batch_size):
    #去除掉target最后的<eos>，并且在target的前面加上<go>作为起始标志
    ending = tf.strided_slice(data, [0,0], [batch_size, -1], [1,1])
    decoder_inputs = tf.concat([tf.fill([batch_size, 1], character_to_int['<GO>']), ending], 1)
    return decoder_inputs

In [84]:
def decoder(decoder_input, rnn_size, layer_nums, decoding_embedding_size, target_character_to_int, 
            target_sequence_length, max_target_sequence_length, encoder_state):
    #1.embedding
    target_character_size = len(target_character_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_character_size, decoding_embedding_size]))
    decoder_embedding_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
    #2.构造decoder端的rnn单元
    def get_lstm_cell(rnn_size):
        lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return lstm_cell
    multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(layer_nums)])
    #构造全连接层
    output_layer = tf.layers.Dense(target_character_size, kernel_initializer=tf.truncated_normal_initializer(
        mean=0.0, stddev=0.1))
    #4.training_decoder
    with tf.variable_scope('decoder'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embedding_input,
                                                            sequence_length=target_sequence_length,
                                                            time_major=False)
        #构造decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(multi_cell, training_helper, encoder_state, output_layer)
        
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
    
    #5.predict_decoder
    with tf.variable_scope('decoder', reuse=True):
        start_tokens = tf.tile(tf.constant([target_character_to_int['<GO>']], dtype=tf.int32), [batch_size], 
                               name='start_tokens')
        predict_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, 
                                                                  target_character_to_int['<EOS>'])
        predict_decoder = tf.contrib.seq2seq.BasicDecoder(multi_cell, predict_helper, encoder_state, output_layer)
        
        predict_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predict_decoder, 
                                                                      impute_finished=True, 
                                                                      maximum_iterations=max_target_sequence_length)
        
    return training_decoder_output, predict_decoder_output


In [85]:
def seq2seq_model(input_data, targets, lr, target_sequence_length, max_target_sequence_length,
                  source_sequence_length, source_character_size, target_character_size, 
                  encoding_embedding_size, decoding_embedding_size, rnn_size, layer_nums):
    encoder_ouput, encoder_state = Encoder(input_data, rnn_size, layer_nums, 
                                          source_sequence_length, source_character_size, encoding_embedding_size)
    decoder_input = process_decoder_input(targets, target_character_to_int, batch_size)
    training_decoder_output, predict_decoder_output = decoder(decoder_input, rnn_size, layer_nums,
                                                             decoding_embedding_size, target_character_to_int, 
                                                             target_sequence_length, max_target_sequence_length,
                                                             encoder_state)
    return training_decoder_output, predict_decoder_output

In [86]:
epochs = 60
learning_rate = 0.001
batch_size = 128
rnn_size = 50
layer_nums = 2
encoding_embedding_size = 15
decoding_embedding_size = 15


In [92]:
training_graph = tf.Graph()
with training_graph.as_default():
    inputs_data, targets, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_input()
    training_decoder_output, predict_decoder_output = seq2seq_model(inputs_data, targets, lr, target_sequence_length, 
                                                            max_target_sequence_length, source_sequence_length,
                                                            len(source_character_to_int), len(target_character_to_int),
                                                            encoding_embedding_size, decoding_embedding_size, rnn_size,
                                                                   layer_nums)
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predict_logits = tf.identity(predict_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
    with tf.name_scope('optimization'):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        optimizer = tf.train.AdamOptimizer(lr)
        
        gradients = optimizer.compute_gradients(cost)
        crapped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        training_op = optimizer.apply_gradients(crapped_gradients)
    

In [93]:
def pad_sentence_batch(sentence_batch, pad_int):
    #对sentence进行补全，不够的填充<PAD>
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) 
            for sentence in sentence_batch]

In [94]:
def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    for batch_i in range(len(sources) // batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i : start_i + batch_size]
        targets_batch = targets[start_i : start_i + batch_size]
        pad_sources_batch = pad_sentence_batch(sources_batch, source_pad_int)
        pad_targets_batch = pad_sentence_batch(targets_batch, target_pad_int)
        
        #记录每条记录的长度
        target_lengths = []
        for target in targets_batch:
            target_lengths.append(len(target))
        
        source_lengths = []
        for source in sources_batch:
            source_lengths.append(len(source))
        
        yield pad_targets_batch, pad_sources_batch, target_lengths, source_lengths
        

In [102]:
train_source = source_int[batch_size:]
train_target = target_int[batch_size:]

valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]
(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(
    get_batches(valid_target, valid_source, batch_size,source_character_to_int['<PAD>'],
                target_character_to_int['<PAD>']))
display_step = 50
checkpoint = '../model/train_model.ckpt'
with tf.Session(graph=training_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epochs+1):
        for batch_i, (target_batch, source_batch, target_length, source_length) in enumerate(
            1, get_batches(train_target, train_source, batch_size, source_character_to_int['<PAD>'], 
                        target_character_to_int['<PAD>'])):
            _, loss = sess.run([training_op, cost], feed_dict={
                inputs_data:source_batch,
                targets:target_batch,
                lr:learning_rate,
                target_sequence_length:target_length,
                source_sequence_length:source_length,
            })
            if batch_i % display_step == 0:
                valid_loss = sess.run([cost], feed_dict={
                    inputs_data:valid_sources_batch,
                    targets:valid_targets_batch,
                    lr:learning_rate,
                    target_sequence_length:valid_targets_lengths,
                    source_sequence_length:valid_sources_lengths
                })
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              loss, 
                              valid_loss[0]))
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)
    
    

Epoch   1/60 Batch    0/77 - Training Loss:  3.400  - Validation loss:  3.396
Epoch   1/60 Batch   50/77 - Training Loss:  3.088  - Validation loss:  3.054
Epoch   2/60 Batch    0/77 - Training Loss:  2.865  - Validation loss:  2.827
Epoch   2/60 Batch   50/77 - Training Loss:  2.548  - Validation loss:  2.479
Epoch   3/60 Batch    0/77 - Training Loss:  2.382  - Validation loss:  2.349
Epoch   3/60 Batch   50/77 - Training Loss:  2.190  - Validation loss:  2.133
Epoch   4/60 Batch    0/77 - Training Loss:  2.075  - Validation loss:  2.039
Epoch   4/60 Batch   50/77 - Training Loss:  1.909  - Validation loss:  1.863
Epoch   5/60 Batch    0/77 - Training Loss:  1.804  - Validation loss:  1.776
Epoch   5/60 Batch   50/77 - Training Loss:  1.678  - Validation loss:  1.617
Epoch   6/60 Batch    0/77 - Training Loss:  1.551  - Validation loss:  1.536
Epoch   6/60 Batch   50/77 - Training Loss:  1.449  - Validation loss:  1.377
Epoch   7/60 Batch    0/77 - Training Loss:  1.330  - Validation

Epoch  54/60 Batch    0/77 - Training Loss:  0.012  - Validation loss:  0.028
Epoch  54/60 Batch   50/77 - Training Loss:  0.013  - Validation loss:  0.031
Epoch  55/60 Batch    0/77 - Training Loss:  0.010  - Validation loss:  0.024
Epoch  55/60 Batch   50/77 - Training Loss:  0.024  - Validation loss:  0.029
Epoch  56/60 Batch    0/77 - Training Loss:  0.011  - Validation loss:  0.040
Epoch  56/60 Batch   50/77 - Training Loss:  0.012  - Validation loss:  0.033
Epoch  57/60 Batch    0/77 - Training Loss:  0.008  - Validation loss:  0.024
Epoch  57/60 Batch   50/77 - Training Loss:  0.015  - Validation loss:  0.029
Epoch  58/60 Batch    0/77 - Training Loss:  0.012  - Validation loss:  0.026
Epoch  58/60 Batch   50/77 - Training Loss:  0.010  - Validation loss:  0.021
Epoch  59/60 Batch    0/77 - Training Loss:  0.008  - Validation loss:  0.023
Epoch  59/60 Batch   50/77 - Training Loss:  0.011  - Validation loss:  0.024
Epoch  60/60 Batch    0/77 - Training Loss:  0.006  - Validation

'../model/train_model.ckpt'

In [105]:
def source_to_seq(text):
    '''
    对源数据进行转换
    '''
    sequence_length = 7
    return [source_character_to_int.get(word, source_character_to_int['<UNK>']) for word in text] + [
        source_character_to_int['<PAD>']]*(sequence_length-len(text))

In [106]:
# 输入一个单词
input_word = 'common'
text = source_to_seq(input_word)

checkpoint = "../model/trained_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # 加载模型
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('inputs:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
    target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_sequence_length: [len(input_word)]*batch_size, 
                                      source_sequence_length: [len(input_word)]*batch_size})[0] 


pad = source_letter_to_int["<PAD>"] 

print('原始输入:', input_word)

print('\nSource')
print('  Word 编号:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([source_character_to_int[i] for i in text])))

print('\nTarget')
print('  Word 编号:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([target_int_to_character[i] for i in answer_logits if i != pad])))

OSError: File ../model/trained_model.ckpt.meta does not exist.