# Character Sequence to Sequene

Learn seq2seq model to sort the char sequence.

### Read data

In [1]:
with open('./data/letters_source.txt', 'r', encoding = 'utf-8') as f:
    source_sentences = f.read()

In [2]:
with open('./data/letters_target.txt', 'r', encoding = 'utf-8') as f:
    target_sentences = f.read()

In [4]:
target_sentences[:10]

'abqqs\nnpy\n'

## Preprocess

In [8]:
def extract_character_vocab(data):
    special_words = ['<pad>', '<unk>', '<s>',  '<\s>']
    set_words = set([character for line in data.split('\n') for character in line])
    idx2word = {idx: word for idx, word in enumerate(special_words+list(set_words))}
    word2idx = {word: idx for idx, word in idx2word.items()}
    return idx2word, word2idx

In [13]:
idx2word, word2idx = extract_character_vocab(target_sentences+source_sentences)

In [18]:
source_letter_idx = [[word2idx[word] for word in line] for line in source_sentences.split('\n')]
target_letter_idx = [[word2idx[word] for word in line] for line in target_sentences.split('\n')]

In [21]:
source_letter = [[word for word in line] for line in source_sentences.split('\n')]
target_letter = [[word for word in line] for line in target_sentences.split('\n')]

In [19]:
source_letter_idx[:3]

[[12, 27, 28, 13, 13], [23, 22, 5], [18, 12, 8, 7, 26]]

In [22]:
source_letter[:3]

[['b', 's', 'a', 'q', 'q'], ['n', 'p', 'y'], ['l', 'b', 'w', 'u', 'j']]

In [20]:
target_letter_idx[:3]

[[28, 12, 13, 13, 27], [23, 22, 5], [12, 26, 18, 7, 8]]

In [23]:
target_letter[:3]

[['a', 'b', 'q', 'q', 's'], ['n', 'p', 'y'], ['b', 'j', 'l', 'u', 'w']]

In [28]:
# determine the longest sequence size in the dataset
# pad all sequences to that length
def pad_id_sequences(source_idx, target_idx, word2idx, seq_length):
    new_source_idx = [ sentence + [word2idx['<pad>']]*(sequence_length - len(sentence)) for sentence in source_idx]
    new_target_idx = [ sentence + [word2idx['<pad>']]*(sequence_length - len(sentence)) for sentence in target_idx]
    return new_source_idx, new_target_idx
# Use the longest sequence as sequence length
sequence_length = max(
        [len(sentence) for sentence in source_letter_idx] + [len(sentence) for sentence in target_letter_idx])
source_ids, target_ids = pad_id_sequences(source_letter_idx, target_letter_idx, word2idx, sequence_length)

In [31]:
print("Sequence Length is {}".format(sequence_length))
print("Input sequence example")
print(source_ids[:3])
print("Target sequence example")
print(target_ids[:3])

Sequence Length is 7
Input sequence example
[[12, 27, 28, 13, 13, 0, 0], [23, 22, 5, 0, 0, 0, 0], [18, 12, 8, 7, 26, 0, 0]]
Target sequence example
[[28, 12, 13, 13, 27, 0, 0], [23, 22, 5, 0, 0, 0, 0], [12, 26, 18, 7, 8, 0, 0]]


## Model

In [32]:
import tensorflow as tf

In [33]:
tf.__version__

'1.0.1'

In [34]:
# Set hyperparameter
epochs = 60
batch_size = 128
rnn_size = 50
num_layers = 2
encoding_embedding_size = 13
decoding_embedding_size = 13
learning_rate = 0.001

In [35]:
input_data = tf.placeholder(tf.int32, [batch_size, sequence_length])
target = tf.placeholder(tf.int32, [batch_size, sequence_length])

### Encoding

In [36]:
vocab_size = len(word2idx)
enc_embed_input = tf.contrib.layers.embed_sequence(input_data,
                                                   vocab_size,
                                                   encoding_embedding_size)
enc_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnn_size)]* num_layers)
_, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_embed_input, dtype = tf.float32)

### Process Decoding Input

In [39]:
import numpy as np

ending = tf.strided_slice(target, [0,0], [batch_size, -1], [1,1])
dec_input = tf.concat([tf.fill([batch_size, 1], word2idx['<s>']), ending],1)

### Decoding
* Build the output layer in the decoding scope, so the weight and bias can be shared between the training and inference decoders.

In [41]:
dec_embeddings = tf.Variable(tf.random_uniform([vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)

dec_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.BasicLSTMCell(rnn_size)]*num_layers)

with tf.variable_scope('decoding') as decoding_scope:
    output_fn = lambda x: tf.contrib.layers.fully_connected(x, vocab_size, None, scope = decoding_scope)

#### Decoder During Training

In [44]:
with tf.variable_scope("decoding") as decoding_scope:
    train_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(enc_state)
    train_pred,_,_ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell, train_decoder_fn,
                                           dec_embed_input, sequence_length,
                                           scope = decoding_scope)
    train_logits = output_fn(train_pred)

#### Decoder During Inference
* reuse the weights the bias from training decoder using tf.variable_scope("decoding", reuse=True)

In [45]:
with tf.variable_scope("decoding", reuse = True) as decoding_scope:
    infer_decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(
        output_fn, enc_state, dec_embeddings,  target_letter_to_int['<s>'], 
        target_letter_to_int['<\s>'], sequence_length-1, vocab_size)
    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(dec_cell, infer_decoder_fn, scope=decoding_scope)

### Optimization

In [48]:
cost = tf.contrib.seq2seq.sequence_loss(
    train_logits,
    target,
    tf.ones([batch_size, sequence_length]))
optimizer = tf.train.AdamOptimizer(learning_rate)

gradients = optimizer.compute_gradients(cost)

capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)

### Train

In [51]:
import numpy as np
train_source = source_ids[batch_size:]
train_target = target_ids[batch_size:]

valid_source = source_ids[:batch_size]
valid_target = target_ids[:batch_size]

In [60]:
def batch_data(source, target, batch_size):
    """
    Batch source and target together
    """
    for batch_i in range(0, len(source)//batch_size):
        start_i = batch_i * batch_size
        source_batch = source[start_i:start_i + batch_size]
        target_batch = target[start_i:start_i + batch_size]
        yield source_batch, target_batch

In [61]:
sess =  tf.Session()
sess.run(tf.global_variables_initializer())

for epoch_i in range(epochs):
    for batch_i, (source_batch, target_batch) in enumerate(
            batch_data(train_source, train_target, batch_size)):
        _, loss = sess.run(
            [train_op, cost],
            {input_data: source_batch, target: target_batch})
        batch_train_logits = sess.run(
            inference_logits,
            {input_data: source_batch})
        batch_valid_logits = sess.run(
            inference_logits,
            {input_data: valid_source})

        train_acc = np.mean(np.equal(target_batch, np.argmax(batch_train_logits, 2)))
        valid_acc = np.mean(np.equal(valid_target, np.argmax(batch_valid_logits, 2)))
        print('Epoch {:>3} Batch {:>4}/{} - Train Accuracy: {:>6.3f}, Validation Accuracy: {:>6.3f}, Loss: {:>6.3f}'
              .format(epoch_i, batch_i, len(source_ids) // batch_size, train_acc, valid_acc, loss))

Epoch   0 Batch    0/78 - Train Accuracy:  0.013, Validation Accuracy:  0.009, Loss:  3.404
Epoch   0 Batch    1/78 - Train Accuracy:  0.035, Validation Accuracy:  0.038, Loss:  3.380
Epoch   0 Batch    2/78 - Train Accuracy:  0.461, Validation Accuracy:  0.449, Loss:  3.351
Epoch   0 Batch    3/78 - Train Accuracy:  0.450, Validation Accuracy:  0.449, Loss:  3.327
Epoch   0 Batch    4/78 - Train Accuracy:  0.446, Validation Accuracy:  0.449, Loss:  3.298
Epoch   0 Batch    5/78 - Train Accuracy:  0.445, Validation Accuracy:  0.449, Loss:  3.265
Epoch   0 Batch    6/78 - Train Accuracy:  0.432, Validation Accuracy:  0.449, Loss:  3.243
Epoch   0 Batch    7/78 - Train Accuracy:  0.432, Validation Accuracy:  0.449, Loss:  3.205
Epoch   0 Batch    8/78 - Train Accuracy:  0.417, Validation Accuracy:  0.449, Loss:  3.171
Epoch   0 Batch    9/78 - Train Accuracy:  0.451, Validation Accuracy:  0.449, Loss:  3.106
Epoch   0 Batch   10/78 - Train Accuracy:  0.402, Validation Accuracy:  0.449, L

### Prediction

In [62]:
input_sentence = 'hello'

input_sentence = [source_letter_to_int.get(word, word2idx['<unk>']) for word in input_sentence.lower()]
input_sentence = input_sentence + [0] * (sequence_length - len(input_sentence))
batch_shell = np.zeros((batch_size, sequence_length))
batch_shell[0] = input_sentence
chatbot_logits = sess.run(inference_logits, {input_data: batch_shell})[0]

print('Input')
print('  Word Iads:      {}'.format([i for i in input_sentence]))
print('  Input Words: {}'.format([word2idx[i] for i in input_sentence]))

print('\nPrediction')
print('  Word Ids:      {}'.format([i for i in np.argmax(chatbot_logits, 1)]))
print('  Chatbot Answer Words: {}'.format([word2idx[i] for i in np.argmax(chatbot_logits, 1)]))

Input
  Word Iads:      [6, 25, 18, 18, 16, 0, 0]


KeyError: 6