A simple autocomplete model  
Given the first three characters of a word, predict the next character so that the result is logically, syntactically, and semantically plausible.

In [1]:
import numpy as np
import tensorflow as tf

#### Steps
1. Generate a dictionary of character-number {chr: num}  (character pool)
2. Collect four-letter words as instances (training data)
3. Generate batches (one-hot)
4. Set hyperparameters
5. Set placeholders
6. Set variables
7. Create LSTM cell
8. Train
9. Test

In [2]:
# Step 1: character pool and its character -> index lookup table
char_arr = [ch for ch in 'abcdefghijklmnopqrstuvwxyz']
num_dic = dict(zip(char_arr, range(len(char_arr))))
dic_len = len(num_dic)

In [3]:
# Step 2: training words; the first three letters of each word are the input
# and the fourth letter is the target
seq_data = (
    'word wood deep dive cold '
    'cool load love kiss kind '
    'deal with hate halo lone '
    'home baby here soup crab '
    'beam bean date live ring '
    'data rear fear peel ping '
    'beat bite sick deem tree'
).split()

In [4]:
# Step 3
def make_batch(seq_data, char_to_idx=None):
    """Split words into one-hot encoded input characters and a target index.

    Args:
        seq_data: Iterable of non-empty words. All characters but the last
            form the input of a word; the last character is its target.
        char_to_idx: Optional mapping from character to integer index, with
            indices expected to run from 0 to ``len(char_to_idx) - 1``.
            Defaults to the notebook-level ``num_dic``. The one-hot width
            equals the size of this mapping.

    Returns:
        A tuple ``(input_batch, target_batch)``. ``input_batch`` is a list
        with one ``(len(word) - 1, vocab_size)`` one-hot array per word;
        ``target_batch`` is a list with the index of each word's last
        character.

    Raises:
        KeyError: If a word contains a character missing from the mapping.
    """
    if char_to_idx is None:
        char_to_idx = num_dic
    # Build the identity matrix once; row i is the one-hot vector for index i.
    one_hot = np.eye(len(char_to_idx))

    input_batch, target_batch = [], []
    for seq in seq_data:
        inpt = [char_to_idx[ch] for ch in seq[:-1]]  # every character but the last
        target = char_to_idx[seq[-1]]  # the last character is the label
        # Indexing with a list of row numbers returns a fresh array of one-hot rows
        input_batch.append(one_hot[inpt])
        target_batch.append(target)
    return input_batch, target_batch

In [5]:
# One-hot rows for the input characters (all but the last) of the first word in seq_data
make_batch(seq_data)[0][0]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [6]:
# Target list: the num_dic index of the last character of every word in seq_data
make_batch(seq_data)[1]

[3,
 3,
 15,
 4,
 3,
 11,
 3,
 4,
 18,
 3,
 11,
 7,
 4,
 14,
 4,
 4,
 24,
 4,
 15,
 1,
 12,
 13,
 4,
 4,
 6,
 0,
 17,
 17,
 11,
 6,
 19,
 4,
 10,
 12,
 4]

In [7]:
# Step 4: hyperparameters
random_seed = 42  # arbitrary fixed value so repeated runs start from the same random draws
# Seed the graph here, before any random op (weight initialisers, dropout) is created.
tf.set_random_seed(random_seed)

learning_rate = 0.01  # step size passed to the Adam optimizer
n_hidden = 128  # number of hidden units in each LSTM cell
total_epoch = 30  # number of training iterations over the word list
n_step = 3  # three characters to read
n_input = n_class = dic_len  # one-hot width and number of output classes (size of the character pool)

In [8]:
# Step 5: placeholders for the one-hot inputs and the integer targets
# The toy dataset is tiny and the model is very simple, so pinning these ops to
# the GPU brings no real benefit; the device scope is kept purely as a demonstration.
with tf.device('/gpu:0'):
    X = tf.placeholder(dtype=tf.float32, shape=[None, n_step, n_input])
    Y = tf.placeholder(dtype=tf.int32, shape=[None])

In [9]:
# Step 6: output-layer weights and bias that turn the last LSTM output into per-character logits
W = tf.Variable(tf.random_normal(shape=[n_hidden, n_class]))
b = tf.Variable(tf.random_normal(shape=[n_class]))

In [10]:
# Step 7: build the two LSTM layers
# Keep probability for the dropout applied to the first layer's outputs. It
# defaults to 0.5 (half of the outputs are kept), so runs that do not feed it
# behave exactly as with a hard-coded 0.5. Feed `keep_prob: 1.0` through a
# feed_dict to switch dropout off, e.g. when evaluating the model.
keep_prob = tf.placeholder_with_default(0.5, shape=[])
cell1 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
# A cell with dropout is usually more robust to noise
cell1 = tf.nn.rnn_cell.DropoutWrapper(cell1, output_keep_prob=keep_prob)
# A cell without dropout usually does better on a clean dataset
cell2 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

In [11]:
# Stack the two cells into one multi-layer cell and run it over the input sequence
multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell1, cell2])
outputs, states = tf.nn.dynamic_rnn(cell=multi_cell, inputs=X, dtype=tf.float32)

In [12]:
# Keep only the output of the last time step. dynamic_rnn was called with its
# default batch-major layout, so `outputs` is indexed [batch, step, unit];
# slicing the step axis directly gives the same tensor as transposing to
# step-major and taking index -1. `outputs` is deliberately not rebound, so
# this cell can be re-run without failing on an already-reduced tensor.
last_output = outputs[:, -1, :]
model = tf.matmul(last_output, W) + b  # logits over the n_class characters

In [13]:
# Mean cross-entropy between the logits and the integer target labels
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=Y, logits=model)
cost = tf.reduce_mean(per_example_loss)

# Despite its name, `optimizer` is the training op returned by minimize()
adam = tf.train.AdamOptimizer(learning_rate)
optimizer = adam.minimize(cost)

In [14]:
# Step 8: train for total_epoch iterations, feeding all training words as one batch
# allow_soft_placement lets TensorFlow fall back to an available device when an
# op is pinned to '/gpu:0' but the machine has no GPU, instead of raising a
# device-placement error.
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
init = tf.global_variables_initializer()
sess.run(init)

input_batch, target_batch = make_batch(seq_data)

for epoch in range(total_epoch):
    # One optimisation step; `loss` is the cost evaluated in the same run
    _, loss = sess.run([optimizer, cost],
                       feed_dict={X: input_batch, Y: target_batch})
    print('Epoch: {:03}, Cost = {:.6f}'.format(epoch + 1, loss))

Epoch: 001, Cost = 3.646148
Epoch: 002, Cost = 3.058766
Epoch: 003, Cost = 2.426885
Epoch: 004, Cost = 2.195904
Epoch: 005, Cost = 1.678676
Epoch: 006, Cost = 1.407809
Epoch: 007, Cost = 1.166907
Epoch: 008, Cost = 0.849741
Epoch: 009, Cost = 0.733174
Epoch: 010, Cost = 0.652847
Epoch: 011, Cost = 0.490465
Epoch: 012, Cost = 0.456534
Epoch: 013, Cost = 0.540301
Epoch: 014, Cost = 0.498168
Epoch: 015, Cost = 0.389873
Epoch: 016, Cost = 0.565789
Epoch: 017, Cost = 0.505857
Epoch: 018, Cost = 0.293771
Epoch: 019, Cost = 0.480693
Epoch: 020, Cost = 0.332685
Epoch: 021, Cost = 0.379624
Epoch: 022, Cost = 0.442641
Epoch: 023, Cost = 0.411749
Epoch: 024, Cost = 0.322995
Epoch: 025, Cost = 0.404633
Epoch: 026, Cost = 0.313116
Epoch: 027, Cost = 0.360299
Epoch: 028, Cost = 0.313611
Epoch: 029, Cost = 0.249297
Epoch: 030, Cost = 0.287757


In [15]:
# Step 9
prediction = tf.cast(tf.argmax(model, 1), tf.int32)
prediction_check = tf.equal(prediction, Y)
accuracy = tf.reduce_mean(tf.cast(prediction_check, tf.float32))

In [16]:
test_data = ['kill', 'feel', 'bend', 'risk']

input_batch, target_batch = make_batch(test_data)

# NOTE(review): nothing in this run disables the dropout wrapper built into the
# graph, so predictions are computed with dropout active and can vary between runs.
predict, accuracy_val = sess.run(
    [prediction, accuracy],
    feed_dict={X: input_batch, Y: target_batch},
)

# Re-attach each predicted character to the three-letter prefix of its word
predict_words = [word[:3] + char_arr[char_idx]
                 for word, char_idx in zip(test_data, predict)]

print('Inserted: ', [w[:3] for w in test_data])
print('Predicted: ', predict_words)
print('Accuracy: ', accuracy_val)

Inserted:  ['kil', 'fee', 'ben', 'ris']
Predicted:  ['kild', 'feer', 'benm', 'risg']
Accuracy:  0.0


Warning: Do not expect good accuracy here; the toy dataset is too small for the model to learn anything meaningful from it.