In [1]:
import numpy as np
import time

import helper

source_path = 'data/letters_source.txt'
target_path = 'data/letters_target.txt'

source_sentences = helper.load_data(source_path)
target_sentences = helper.load_data(target_path)

### Source Sentences

In [2]:
print(source_sentences[:50].split('\n'))

['bsaqq', 'npy', 'lbwuj', 'bqv', 'kial', 'tddam', 'edxpjpg', 'nspv', 'huloz', '']


### Target Sentences

In [3]:
print(target_sentences[:50].split('\n'))

['abqqs', 'npy', 'bjluw', 'bqv', 'aikl', 'addmt', 'degjppx', 'npsv', 'hlouz', '']


## Pre Process
To do anything useful with it, we'll need to turn the each string into a list of characters<br>

Convert the words to ids

In [4]:
def extract_character_words(data):
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    
    set_words = set([character for line in data.split('\n') for character in line])
    word_to_int = {word: ind for ind, word in enumerate(special_words + list(set_words))}
    int_to_word = {ind: word for word, ind in word_to_int.items()}

    return int_to_word, word_to_int


# Building letter2int and int2letter
source_int_to_letter, source_letter_to_int = extract_character_words(source_sentences)
target_int_to_letter, target_letter_to_int = extract_character_words(target_sentences)

# Convert Character to ids
source_letter_ids = [[source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in line] for line in source_sentences.split('\n')]
target_letter_ids = [[target_letter_to_int.get(word, target_letter_to_int['<UNK>']) for word in line] for line in target_sentences.split('\n')]

In [5]:
print(source_letter_ids[:3])
print(target_letter_ids[:3])

[[12, 10, 16, 23, 23], [26, 19, 13], [24, 12, 6, 27, 25]]
[[16, 12, 23, 23, 10], [26, 19, 13], [12, 25, 24, 27, 6]]


## Model

### Check the version of TensorFlow
This will check to make sure that you have correct version of TensorFlow

In [6]:
from distutils.version import LooseVersion
import tensorflow as tf
from tensorflow.python.layers.core import Dense

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.1'), 'Please use TensorFlow version 1.1 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.1.0


## Hyperparameters

In [7]:
# Number of Epochs
epochs = 60
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 50
# Number of Layers
num_layers = 2
# Embedding Size
encoding_embedding_size = 15
decoding_embedding_size = 15
# Learning Rate
learning_rate = 0.001

## Input

In [8]:
def get_model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name="input")
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='learning_rate')
    
    target_sequence_length = tf.placeholder(tf.int32, (None,), name="target_seq_length")
    max_target_sequence_length = tf.reduce_max(tf.int32, (None,), name="max_target_len")
    source_sequence_length = tf.placeholder(tf.int32, (None,), name="source_seq_length")
    
    return input_data, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_length