In [3]:
import numpy as np
from random import shuffle, seed

The following string generator is known as a Embedded Reber Grammar:
<img src="images/embreber.gif" style="width:700px;height:500px;">

In [7]:
def generate_reber_string(random_seed = 0):
    np.random.seed(random_seed)
    edge_char = np.random.choice(['T', 'P'])
    reber_string = 'B' + edge_char + 'B'
    states_dict = {'0': ['1', '2'], '1': ['1', '3'], '2': ['2', '4'], '3': ['2', '5'], '4': ['3', '5'], '5': ['6']}
    output_dict = {'0->1': 'T', '0->2': 'P', '1->1': 'S', '1->3': 'X',  '2->2': 'T', 
                   '2->4': 'V', '3->2': 'X', '3->5': 'S', '4->3': 'P', '4->5': 'V', '5->6': 'E'}
    current_state = '0'
    ch = ''
    while(ch != 'E'):
        previous_state = current_state
        current_state = np.random.choice(states_dict[current_state])
        transition = previous_state + "->" + current_state
        ch = output_dict[transition]
        reber_string = reber_string + ch
    reber_string = reber_string + edge_char + 'E'
    return reber_string

def generate_incorrect_reber_string(random_seed = 0):
    alphabet = set(['B', 'T', 'P', 'S', 'X', 'V', 'E'])
    reber_string = generate_reber_string(random_seed)
    make_error_flg = 1
    string_len = len(reber_string)
    correct_elements_indices = set(np.arange(string_len))
    while(make_error_flg == 1 and len(correct_elements_indices) >= 1):
        err_index = np.random.choice(list(correct_elements_indices))
        correct_elements_indices = correct_elements_indices - set([err_index])
        incorrect_letters = list(alphabet - set([reber_string[err_index]]))
        reber_string = reber_string[:err_index] + np.random.choice(incorrect_letters) + reber_string[err_index+1:]
        make_error_flg = np.random.choice([0, 1])
    return reber_string

In [75]:
print("correct string:   ", generate_reber_string())
print("incorrect string: ", generate_incorrect_reber_string())

correct string:    BTBPVPSETE
incorrect string:  BTBBVPBETE


In [147]:
def gen_n_strings(n, string_gen_func):
    max_int = np.iinfo(np.int32).max
    return [string_gen_func(np.random.randint(max_int)) for i in range(n)]


def get_seq_lengths_from_X(X):
    seq_lengths = [len(x) for x in X]
    return seq_lengths

In [152]:
#set n correct & n incorrect strings
n_correct_train = 15000
n_incorrect_train = 15000
n_correct_test = 15000
n_incorrect_test = 15000
n_correct = n_correct_train + n_correct_test
n_incorrect = n_incorrect_train + n_incorrect_test
n_train = n_correct_train + n_incorrect_train

#generate correct & incorrect strings
X_all = np.concatenate([gen_n_strings(n_correct, generate_reber_string),
                        gen_n_strings(n_incorrect, generate_incorrect_reber_string)])
y_all = np.hstack((np.ones(n_correct), np.zeros(n_incorrect))).astype(int)

#shuffle data
seed(0)            
shuffle(X_all)
seed(0)
shuffle(y_all)
X_train, y_train = X_all[:n_train], y_all[:n_train]
X_test, y_test = X_all[n_train:], y_all[n_train:]
print('\nX_train length:', len(X_train), '\ny_train shape:', y_train.shape)
print('X_test length:', len(X_test), '\ny_test shape:', y_test.shape)


X_train length: 30000 
y_train shape: (30000,)
X_test length: 30000 
y_test shape: (30000,)


In [153]:
#check generated data!
for i in range(10):
    print(X_all[i], y_all[i])

BTBTXSETE 1
BBSPTVETE 0
BTBTXXTTTVVETE 1
BTBPTVPXVPXTVTSTE 0
BTBTXSETE 1
BPBBXPSEPE 0
BTBPVVETE 1
BTSPTSPPTVVSVE 0
BPBPTVPXVVEPP 0
BTBTSXSETE 1


In [154]:
seq_length_train = get_seq_lengths_from_X(X_train)
seq_length_test = get_seq_lengths_from_X(X_test)

In [155]:
#convert to one-hot encoding
one_hot_dict = {'B': [1,0,0,0,0,0,0], 'T': [0,1,0,0,0,0,0], 'P': [0,0,1,0,0,0,0],
                'S': [0,0,0,1,0,0,0], 'X': [0,0,0,0,1,0,0], 'V': [0,0,0,0,0,1,0], 'E': [0,0,0,0,0,0,1]}
X_train = [[one_hot_dict[ch] for ch in x] for x in X_train]
X_test = [[one_hot_dict[ch] for ch in x] for x in X_test]
    
y_train = np.reshape(y_train, (y_train.shape[0], 1))
y_test = np.reshape(y_test, (y_test.shape[0], 1))
print('X_train shape:', len(X_train), 'X_test shape:', len(X_test))
print('y_train shape:', y_train.shape, 'y_test shape:', y_test.shape)

X_train shape: 30000 X_test shape: 30000
y_train shape: (30000, 1) y_test shape: (30000, 1)


In [161]:
import tensorflow as tf

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
reset_graph()

In [162]:
n_inputs = len("BTPSXVE")
n_neurons = 64
n_outputs = 1

n_epochs = 10
lr = 1e-3
batch_size = 128
n_batches = int(np.ceil(len(X_train) / batch_size))

g = tf.Graph()
with g.as_default(): 
    with tf.name_scope("LSTM"):
        X = tf.placeholder(tf.float32, [None, None, n_inputs])
        y = tf.placeholder(tf.float32, [None, 1])
        seq_length = tf.placeholder(tf.int32, [None])
        learning_rate = tf.placeholder(tf.float32)
        
        lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=n_neurons)
        outputs, states = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32, sequence_length= seq_length, swap_memory= True)
        logits = tf.layers.dense(states[0], n_outputs)
        
        xentropy= tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
        
        y_pred = tf.cast(tf.greater(logits, 0.), tf.float32, name="y_pred")
        y_proba = tf.nn.sigmoid(logits, name="y_proba")
        
        equality = tf.equal(y_pred, y)
        accuracy = tf.reduce_mean(tf.cast(equality, tf.float32))
        
        training_op = optimizer.minimize(loss)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        
with tf.Session(graph = g) as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            batch_start_idx = batch_index * batch_size
            batch_end_idx = batch_start_idx + batch_size
            X_batch, y_batch, seq_length_batch = X_train[batch_start_idx: batch_end_idx],\
                                                 y_train[batch_start_idx: batch_end_idx],\
                                                 seq_length_train[batch_start_idx: batch_end_idx]
            X_batch = np.array(pad(X_batch))
            loss_val, acc_train, _=  sess.run([loss, accuracy, training_op],\
                                        feed_dict={X: X_batch, y: y_batch, seq_length: seq_length_batch, learning_rate: lr})

        print("Epoch:", epoch+1, "\tLoss:", loss_val)
        acc_val = sess.run(accuracy, feed_dict = {X: np.array(pad(X_test)), y: y_test, seq_length: seq_length_test} )
        print("Accuracy on test data:", acc_train, "Accuracy on validation data:", acc_val)
    saver.save(sess, "./my_reber_classifier")

Epoch: 1 	Loss: 0.38439107
Accuracy on test data: 0.8958333 Accuracy on validation data: 0.8355
Epoch: 2 	Loss: 0.36810723
Accuracy on test data: 0.8958333 Accuracy on validation data: 0.87516665
Epoch: 3 	Loss: 0.2761153
Accuracy on test data: 0.8958333 Accuracy on validation data: 0.906
Epoch: 4 	Loss: 0.26540756
Accuracy on test data: 0.8958333 Accuracy on validation data: 0.91646665
Epoch: 5 	Loss: 0.18664515
Accuracy on test data: 0.9375 Accuracy on validation data: 0.94776666
Epoch: 6 	Loss: 0.11891573
Accuracy on test data: 0.9583333 Accuracy on validation data: 0.95023334
Epoch: 7 	Loss: 0.09218042
Accuracy on test data: 0.9166667 Accuracy on validation data: 0.95523334
Epoch: 8 	Loss: 0.06481909
Accuracy on test data: 0.9583333 Accuracy on validation data: 0.96213335
Epoch: 9 	Loss: 0.12437004
Accuracy on test data: 0.9375 Accuracy on validation data: 0.9486667
Epoch: 10 	Loss: 0.0244302
Accuracy on test data: 1.0 Accuracy on validation data: 0.9820667


In [165]:
#Test random string
random_number = np.random.randint(low=999999, high=np.iinfo(np.int32).max)
y_sample = [np.random.randint(2)]
print('y_sample =', y_sample)
if y_sample == [1]:
    init_string_sample = generate_reber_string(random_number)
else: 
    init_string_sample = generate_incorrect_reber_string(random_number)
X_sample = np.array([one_hot_dict[ch] for ch in list(init_string_sample)])
X_sample = np.reshape(X_sample, (1, -1, n_inputs))
seq_length_sample = seq_lengths_from_X(X_sample)
with tf.Session(graph = g) as sess:
    saver.restore(sess, "./my_reber_classifier");
    y_proba_sample = y_proba.eval(feed_dict={X: X_sample, seq_length: np.array(seq_length_sample)})
print("Estimated probability that", init_string_sample, "is an Embedded Reber strings: {:.2f}%".format(y_proba_sample[0][0]*100))

y_sample = [1]
INFO:tensorflow:Restoring parameters from ./my_reber_classifier
Estimated probability that BTBPTVPSETE is an Embedded Reber strings: 98.58%
