# 神经机器翻译系统

用TensorFlow搭建seq2seq模型实现了一个简单的神经机器翻译系统，实现英语翻译为法语。Encoder使用双向LSTM。Decoder采用了attention机制。

使用多张计算图分别处理train，eval和infer，并分别在不同的session中进行训练和推断。参数共享用Saver。

增加了tensorboard可视化。decoder的initial state采用双向encoder state的平均值。

详见TensorFlow教程：https://tensorflow.google.cn/tutorials/seq2seq

### 导入包
检查TensorFlow版本和GPU情况

In [1]:
from distutils.version import LooseVersion
import warnings, os
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Check TensorFlow Version
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

  from ._conv import register_converters as _register_converters


TensorFlow Version: 1.8.0


  if sys.path[0] == '':


### 超参数设置

In [2]:
source_path = 'en-fr/small_vocab_en'
target_path = 'en-fr/small_vocab_fr'
checkpoint_path = './tmp-model.ckpt'
batch_size = 256
num_units = 32
num_layers = 2
max_gradient_norm = 5.0
learning_rate = 0.001
epoch = 3

### 建立lookup table文件
sos句子开始。eos句子结束。

In [3]:
l=[]
with open(source_path, 'r', encoding='utf-8') as f:
    for line in f.readlines():
        l += line.split()
f.close()
unique_words_src = ['eos'] + list(set(l))

with open('en-fr/words_en', 'w', encoding='utf-8') as f:
    for word in unique_words_src:
        f.write(word + '\n')
f.close()

In [4]:
l=[]
with open(target_path, 'r', encoding='utf-8') as f:
    for line in f.readlines():
        l += line.split()
f.close()
unique_words_tar = ['sos'] + ['eos'] + list(set(l))
print(len(l), len(unique_words_tar))
#print ('rusty' in unique_words_en)

with open('en-fr/words_fr', 'w', encoding='utf-8') as f:
    for word in unique_words_tar:
        f.write(word + '\n')
f.close()

1961295 357


### 使用预训练词向量 FastText

In [5]:
embed_file_src = os.path.join('.', 'fasttext', 'wiki-news-300d-1M.vec')
embed_file_tar = os.path.join('.', 'fasttext', 'cc.fr.300.vec')
embed_size = 300

In [6]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index_src = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(embed_file_src, encoding = 'utf-8'))
embeddings_index_tar = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(embed_file_tar, encoding = 'utf-8'))

#### 建立两种语言的embedding matrix

In [7]:
embedding_matrix_src = np.random.normal(size=(len(unique_words_src), embed_size), scale=0.01)
embedding_matrix_tar = np.random.normal(size=(len(unique_words_tar), embed_size), scale=0.01)

for i, word in enumerate(unique_words_src):
    embedding_vector = embeddings_index_src.get(word)
    if embedding_vector is not None: embedding_matrix_src[i] = embedding_vector
        
for i, word in enumerate(unique_words_tar):
    embedding_vector = embeddings_index_tar.get(word)
    if embedding_vector is not None: embedding_matrix_tar[i] = embedding_vector

### 生成lookup table函数

In [8]:
def BuildLookupTable(source_words_path, target_words_path):
    lookup_src = tf.contrib.lookup.index_table_from_file(source_words_path)
    lookup_tar = tf.contrib.lookup.index_table_from_file(target_words_path)
    lookup_translate = tf.contrib.lookup.index_to_string_table_from_file(target_words_path)
    return lookup_src, lookup_tar, lookup_translate

### 输入训练文本预处理函数
预处理source和target dataset。文本转成单词id。target开头加一个sos。分batch并pad。末尾用eos补足到最大长度。
这里不需要drop remainder，iterator会自动计算最后一批的样本量。但是后面不能再使用batch size

In [9]:
def BuildTrainDataset(source_path, target_path, lookup_src, lookup_tar, src_eos_id, tar_eos_id):
    
    # source
    source_dataset = tf.data.TextLineDataset(source_path)
    source_dataset = source_dataset.map(lambda string: tf.string_split([string]).values)
    source_dataset = source_dataset.map(lambda words: (words, tf.size(words)))
    source_dataset = source_dataset.map(lambda words, size: (lookup_src.lookup(words), size))
    
    # target
    target_dataset = tf.data.TextLineDataset(target_path)
    target_dataset = target_dataset.map(lambda string: tf.string_split([tf.string_join([tf.constant('sos'), string], separator=' ')]).values)
    target_dataset = target_dataset.map(lambda words: (words, tf.size(words)))
    target_dataset = target_dataset.map(lambda words, size: (lookup_tar.lookup(words), size))
    
    # zip source and target
    source_target_dataset = tf.data.Dataset.zip((source_dataset, target_dataset))

    # batch and pad
    batched_dataset = source_target_dataset.padded_batch(
        batch_size,
        padded_shapes=((tf.TensorShape([None]),  # source vectors of unknown size
                        tf.TensorShape([])),     # size(source)
                       (tf.TensorShape([None]),  # target vectors of unknown size
                        tf.TensorShape([]))),    # size(target)
        padding_values=((src_eos_id,  # source vectors padded on the right with src_eos_id
                         0),          # size(source) -- unused
                        (tar_eos_id,  # target vectors padded on the right with tar_eos_id
                         0)))         # size(target) -- unused
    
    return batched_dataset

### Build the train model function
Input: batched and padded dataset iterator ((source, source_lengths), (target, target_lengths))

Output: command to run in train session

这里注意state的形状，传给decoder要对应

In [10]:
def BuildTrainModel(train_iterator):
    ((source, source_lengths), (target, target_lengths)) = train_iterator.get_next()
    encoder_inputs = tf.transpose(source, [1,0]) # to time major
    decoder_inputs = tf.transpose(target, [1,0])
    decoder_outputs = tf.pad(decoder_inputs[1:], tf.constant([[0,1],[0,0]]), constant_values=tar_eos_id)

    shape = tf.shape(decoder_outputs)
    target_weights = tf.to_double(tf.where(tf.equal(decoder_outputs, tf.fill(shape, tar_eos_id)), tf.zeros(shape), tf.ones(shape)))
            
    embedding_encoder = tf.Variable(embedding_matrix_src, name='embedding_encoder')
    embedding_decoder = tf.Variable(embedding_matrix_tar, name='embedding_decoder')
    
    # Embedding layer
    encoder_emb_inp = tf.nn.embedding_lookup(embedding_encoder, encoder_inputs)
    decoder_emb_inp = tf.nn.embedding_lookup(embedding_decoder, decoder_inputs)
    
    # Encoder
    # Construct forward and backward cells
    forward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    backward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    
    encoder_outputs, encoder_states_fw, encoder_states_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        [forward_cell] * num_layers, [backward_cell] * num_layers, encoder_emb_inp, dtype=tf.float64, 
        sequence_length=source_lengths, time_major=True)
    #encoder_states: the final states, one tensor per layer, of the forward/backward rnn

    # Attention
    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, attention_states, memory_sequence_length=source_lengths, dtype=tf.float64)
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units, name = 'decoder_cell')
    decoder_cell = tf.contrib.rnn.MultiRNNCell([decoder_cell] * num_layers)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=num_units)
    initial_state = decoder_cell.zero_state(dtype=tf.float64, batch_size=tf.shape(encoder_inputs)[1])
    initial_state = initial_state.clone(
        cell_state = encoder_states_fw)
    
    # Projection layer on the top
    projection_layer = tf.layers.Dense(len(unique_words_tar), use_bias=False, name='projection')
    
    # Decoder for training
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, target_lengths, time_major=True)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, initial_state, output_layer=projection_layer)
    outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, impute_finished=True)
    logits = outputs.rnn_output
    
    # Loss
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=decoder_outputs, logits=logits)
    train_loss = tf.reduce_sum(crossent * target_weights)/ tf.to_double(tf.shape(encoder_inputs)[1])
    tf.summary.scalar('train_loss', train_loss)
    
    # Gradient
    params = tf.trainable_variables()
    gradients = tf.gradients(train_loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
    
    # Optimization
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_step = optimizer.apply_gradients(zip(clipped_gradients, params))
    
    return train_loss, update_step

### 设置train graph

In [11]:
train_graph = tf.Graph()
with train_graph.as_default():
    
    # Build the lookup table
    lookup_src, lookup_tar, lookup_translate = BuildLookupTable('en-fr/words_en', 'en-fr/words_fr')
    
    # set the sos and eos
    src_eos_id=lookup_src.lookup(tf.constant('eos')) #0 in source vocab
    tar_sos_id=lookup_tar.lookup(tf.constant('sos')) #0 in target vocab
    tar_eos_id=lookup_tar.lookup(tf.constant('eos')) #1 in target vocab
    
    # Preprocess the text dataset
    batched_dataset = BuildTrainDataset(source_path, target_path, lookup_src, lookup_tar, src_eos_id, tar_eos_id)
    
    # Build the train model
    train_iterator = batched_dataset.make_initializable_iterator()
    train_model = BuildTrainModel(train_iterator)
    initializer = tf.global_variables_initializer()
    table_initializer = tf.tables_initializer()
    train_saver = tf.train.Saver(max_to_keep=2)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter('./vis/train', train_graph)

ValueError: Dimensions must be equal, but are 96 and 332 for 'stack_bidirectional_rnn/cell_0/bidirectional_rnn/fw/fw/while/basic_lstm_cell/MatMul_1' (op: 'MatMul') with input shapes: [?,96], [332,128].

### Run the train session to train the model and save the variables

In [None]:
train_sess = tf.Session(graph=train_graph)
train_sess.run(initializer)
train_sess.run(table_initializer)
train_sess.run(train_iterator.initializer)

#saver.restore(sess, './tmp-model.ckpt-11')
for i in tqdm(range(epoch)):
    train_sess.run(train_iterator.initializer)
    n_batch=0
    while True:
        try:
            summary, (cost, _) = train_sess.run([merged, train_model])
            train_writer.add_summary(summary)
            n_batch+=1
            print (n_batch)
            if n_batch % 10 == 0:
                print (cost)
        except tf.errors.OutOfRangeError:
            print (cost)
            break
    model_path = train_saver.save(train_sess, checkpoint_path, global_step=i+1)

### 输入测试文本预处理函数
预处理用于infer的source dataset。文本转成单词id。分batch并pad。末尾用eos补足到最大长度。
这里不需要drop remainder，iterator会自动计算最后一批的样本量。但是后面不能再使用batch size

In [None]:
def BuildTestDataset(source_path, lookup_src, src_eos_id):
    
    # source
    source_dataset = tf.data.TextLineDataset(source_path)
    source_dataset = source_dataset.map(lambda string: tf.string_split([string]).values)
    source_dataset = source_dataset.map(lambda words: (words, tf.size(words)))
    source_dataset = source_dataset.map(lambda words, size: (lookup_src.lookup(words), size))

    # batch and pad
    batched_dataset = source_dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]),  # source vectors of unknown size
                        tf.TensorShape([])),     # size(source)
        padding_values=(src_eos_id,  # source vectors padded on the right with src_eos_id
                         0))          # size(source) -- unused
    
    return batched_dataset

### Build the infer model function
Input: batched and padded dataset iterator ((source, source_lengths), (target, target_lengths))

Output: command to run in infer session

In [None]:
def BuildInferModel(test_iterator, tar_sos_id, tar_eos_id):
    (source, source_lengths) = test_iterator.get_next()
    encoder_inputs = tf.transpose(source, [1,0])
    
    embedding_encoder = tf.Variable(embedding_matrix_src, name='embedding_encoder')
    embedding_decoder = tf.Variable(embedding_matrix_tar, name='embedding_decoder')
    
    # Embedding layer
    encoder_emb_inp = tf.nn.embedding_lookup(embedding_encoder, encoder_inputs)
    
    # Encoder
    # Construct forward and backward cells
    forward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units, name = 'forward_cell')
    backward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units, name = 'backward_cell')

    bi_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
        forward_cell, backward_cell, encoder_emb_inp, dtype=tf.float64,
        sequence_length=source_lengths, time_major=True)
    encoder_outputs = tf.concat(bi_outputs, -1)
    #encoder_state: tuple of 2 LSTM state tuple ((cell_state_fw, hidden_state_fw), (cell_state_bw, hidden_state_bw))
    
    # Attention
    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, attention_states, memory_sequence_length=source_lengths, dtype=tf.float64)
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units, name = 'decoder_cell')
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=num_units)
    initial_state = decoder_cell.zero_state(dtype=tf.float64, batch_size=tf.shape(encoder_inputs)[1])
    initial_state = initial_state.clone(
        cell_state = tf.contrib.rnn.LSTMStateTuple((encoder_state[0].c+encoder_state[1].c)/2.0, (encoder_state[0].h+encoder_state[1].h)/2.0))
    
    # Projection layer on the top
    projection_layer = tf.layers.Dense(len(unique_words_tar), use_bias=False, name='projection')
    
    # Decoder to infer
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding_decoder, tf.fill([tf.shape(encoder_inputs)[1]], tf.to_int32(tar_sos_id)), tf.to_int32(tar_eos_id))
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, infer_helper, initial_state, output_layer=projection_layer)
    maximum_iterations = tf.round(tf.reduce_max(source_lengths) * 2)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=maximum_iterations, output_time_major=True, impute_finished=True)
    
    translation_id = tf.to_int64(outputs.sample_id)
    translation = lookup_translate.lookup(translation_id)
    
    return translation

### 设置infer graph

In [None]:
infer_graph = tf.Graph()
with infer_graph.as_default():
    
    # Build the lookup table
    lookup_src, lookup_tar, lookup_translate = BuildLookupTable('en-fr/words_en', 'en-fr/words_fr')
    
    # set the sos and eos
    src_eos_id=lookup_src.lookup(tf.constant('eos')) #0 in source vocab
    tar_sos_id=lookup_tar.lookup(tf.constant('sos')) #0 in target vocab
    tar_eos_id=lookup_tar.lookup(tf.constant('eos')) #1 in target vocab
    
    # Preprocess the text dataset
    batched_dataset = BuildTestDataset(source_path, lookup_src, src_eos_id)
    
    # Build the train model
    test_iterator = batched_dataset.make_initializable_iterator()
    infer_model = BuildInferModel(test_iterator, tar_sos_id, tar_eos_id)
    infer_saver = tf.train.Saver()
    table_initializer = tf.tables_initializer()
    infer_writer = tf.summary.FileWriter('./vis/infer', infer_graph)

### Run the infer session to translate new sentences

In [None]:
infer_sess = tf.Session(graph=infer_graph)
infer_sess.run(table_initializer)
infer_saver.restore(infer_sess, model_path)
infer_sess.run(test_iterator.initializer)
n_batch=0
f = open('en-fr/trans_fr', 'w', encoding='utf-8')
while True:
    try:
        tar_sentences = infer_sess.run(infer_model)
        n_batch+=1
        tar_sentences = np.transpose(tar_sentences)
        for sentence in tar_sentences:
            for word in sentence:
                if word == 'eos': break
                f.write(word.decode('utf-8') + ' ')
            f.write('\n')
        print(n_batch)
    except tf.errors.OutOfRangeError:
        break
f.close()

# Close the session manually and release resources
train_sess.close()
infer_sess.close()

saver = tf.train.Saver()
with tf.Session() as sess: #debug
    sess.run(tf.global_variables_initializer())
    tf.tables_initializer().run()
    sess.run(batched_iterator.initializer)
    n_batch=0
    ei, di, do = sess.run([encoder_inputs, decoder_inputs, decoder_outputs])
            #print (np.shape(ei), np.shape(di), np.shape(do))
    n_batch+=1
    print(n_batch)
    model_path = saver.save(sess, './tmp-model.ckpt')