## **Imports**

In [1]:
import numpy as np
from keras.datasets import imdb
import matplotlib.pyplot as plt
from keras.preprocessing import sequence

In [5]:
!wget -P /data/imdb http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

/bin/sh: wget: command not found
unzip:  cannot find or open glove.6B.zip, glove.6B.zip.zip or glove.6B.zip.ZIP.


In [6]:
embeddings_index = {}
with open('data/imdb/glove.6B.50d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


## **Data Processing**

In [7]:
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [8]:
word2index = imdb.get_word_index()
index2word = dict([(value, key) for (key, value) in word2index.items()])
example_review = ' '.join([index2word.get(i - 3, '?') for i in train_data[0]])

In [9]:
example_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

In [10]:
hits = 0
misses = 0
inp_size = 50

# Prepare embedding matrix
embedding_matrix = {}
embedding_matrix[0] = np.zeros((inp_size,1))
for word, i in word2index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector.reshape((inp_size, 1))
        hits += 1
    else:
        embedding_matrix[i] = np.zeros((inp_size,1))
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 60150 words (28434 misses)


In [11]:
def filter(datax, datay, seq_len=300):
    filteredx, filteredy = [], []
    for i in range(len(datax)):
        if len(datax[i]) <= seq_len:
            filteredx.append(datax[i])
            filteredy.append(datay[i])
    return filteredx, filteredy

In [312]:
seq_len = 200

trainx, trainy = filter(train_data, train_labels, seq_len)
x_train = sequence.pad_sequences(trainx, maxlen=seq_len, padding='pre')
y_train = np.asarray(trainy).astype('float32')

testx, testy = filter(test_data, test_labels, seq_len)
x_test = sequence.pad_sequences(testx, maxlen=seq_len, padding='pre')
y_test = np.asarray(testy).astype('float32')

print('input_train shape:', np.array(x_train).shape)
print('input_test shape:', np.array(x_test).shape)

input_train shape: (14323, 200)
input_test shape: (14732, 200)


In [314]:
def data_generator(datax, datay, batch_size=128):
    """
    generates 1 batch of inputs
    1 batch = 128 sequences
    1 sequence = 100 time points / samples
    1 sample = 10,000 features
    inputs shape: (128, 100, 10000) or (batch_size, seq_len, num_features)
    targets shape: (128, 1), 1 target value for each sequence, many-to-one architecture
    """
    inputs, targets = {}, {}
    start = 0 # where to start drawing inputs, incremented by batch size * seq_len
    while True:
        if start + batch_size > len(datax):
            start = 0
        stop = start + batch_size
        indices = np.arange(start, stop)
        for i in range(len(indices)):
            inputs[i] = {}
            sequence = datax[indices[i]]
            for j in range(len(sequence)):
                inputs[i][j] = embedding_matrix[sequence[j]]
            targets[i] = datay[indices[i]]
        start = stop
        yield inputs, targets

In [315]:
b_size = 64
train_steps, val_steps, test_steps = 30, 10, 10
print('train len:', b_size*train_steps)
print('val len:', b_size*val_steps)
print('test len:', b_size*test_steps)

train len: 1920
val len: 640
test len: 640


In [316]:
train_gen = data_generator(x_train[:b_size*train_steps], 
                     y_train[:b_size*train_steps], batch_size=b_size)
val_gen = data_generator(x_test[:b_size*val_steps], 
                   y_test[:b_size*val_steps], batch_size=b_size)
test_gen = data_generator(x_test[b_size*val_steps:b_size*(val_steps+test_steps)], 
                    y_test[b_size*val_steps:b_size*(val_steps+test_steps)], batch_size=b_size)

## **Functions**

In [169]:
def sigmoid(x):                                     
    return 1.0 / (1.0 + np.exp(-x))

In [170]:
def gradient_clip(grads, max_norm, order=1):
    total_norm = 0.0
    if order > 1:
        for i in range(order):
            total_norm += np.linalg.norm(grads[0][-i-1])
    else:
        total_norm += np.linalg.norm(grads[0])
    for i in range(1, len(grads[1:])):
        total_norm += np.linalg.norm(grads[i])
    
    total_norm = np.sqrt(total_norm)
    if total_norm > max_norm:
        if order > 1:
            for i in range(order):
                grads[0][-i-1] *= max_norm / total_norm
        else:
            grads[0] *= max_norm / total_norm
        for i in range(1, len(grads[1:])):
            grads[i] *= max_norm / total_norm
    """
    # andrej
    for i in range(len(grads)):
        np.clip(grads[i], -5, 5, out=grads[i]) # clip to mitigate exploding gradients
    """
    return grads

In [171]:
def find_acc(outputs, targets):
    acc = 0.0
    for n in range(len(outputs)):
        acc += np.rint(outputs[n]) == targets[n]
    acc /= len(outputs)
    return acc

In [172]:
def find_loss(outputs, targets):
    loss = 0.0
    for n in range(len(outputs)):
        loss += -(targets[n]*np.log(outputs[n]) + (1 - targets[n])*np.log(1 - outputs[n]))
    loss /= len(outputs)
    return loss

In [173]:
def update_parameters(params, grads, lr, order=1):
    if order > 1:
        for i in range(order):
            params[0][-i-1] -= grads[0][-i-1] * lr
    else:
        params[0] -= grads[0] * lr
    for i in range(1, len(params[1:])):
        params[i] -= grads[i] * lr
    return params

## Vanilla RNN

In [309]:
class RNN():
    def __init__(self, hidden_size, input_size, out_size):
        V = np.random.randn(hidden_size, hidden_size)*0.1 # W_hh
        U = np.random.randn(hidden_size, input_size)*0.1  # W_hx
        W = np.random.randn(out_size, hidden_size)*0.1    # W_hy
        b_hidden = np.random.randn(hidden_size, 1)              # b_h
        b_out = np.random.randn(out_size, 1)                    # b_y
        self.params = [V, U, W, b_hidden, b_out]
        self.best = [V, U, W, b_hidden, b_out]
    
    def forward(self, inputs, hidden_state, best=False):
        if best: 
            V, U, W, b_hidden, b_out = self.best    
        else: 
            V, U, W, b_hidden, b_out = self.params
        outputs, hidden_states = {}, {}
        for n in range(len(inputs)):
            hidden_states[n] = {}
            hidden_states[n][-1] = np.copy(hidden_state)
            for t in range(len(inputs[n])):
                hidden_states[n][t] = np.tanh(np.dot(U, inputs[n][t]) + np.dot(V, hidden_states[n][t-1]) + b_hidden)
            outputs[n] = sigmoid(np.copy((np.dot(W, hidden_states[n][t]) + b_out).item()))
        return outputs, hidden_states

    def backward(self, inputs, outputs, hidden_states, targets, clip_norm):
        V, U, W, b_hidden, b_out = self.params
        dV, dU, dW = np.zeros_like(V), np.zeros_like(U), np.zeros_like(W)
        db_hidden, db_out = np.zeros_like(b_hidden), np.zeros_like(b_out)
        
        loss = 0.0
        N, T = len(inputs), len(inputs[0])
        for n in range(N): # iterate over sequences in a batch, 128
            loss += -(targets[n]*np.log(outputs[n]) + (1 - targets[n])*np.log(1 - outputs[n]))
            do = outputs[n] - targets[n]
            dW += np.dot(do, hidden_states[n][T-1].T)
            db_out += do
            dh = np.dot(W.T, do)
            for t in reversed(range(T)):
                df = (1 - hidden_states[n][t]*hidden_states[n][t]) * dh
                db_hidden += df
                dU += np.dot(df, inputs[n][t].T)
                dV += np.dot(df, hidden_states[n][t-1].T)
                dh = np.dot(V.T, df)
                
        loss /= N
        dV, dU, dW, db_hidden, db_out = dV/N, dU/N, dW/N, db_hidden/N, db_out/N
        grads = [dV, dU, dW, db_hidden, db_out]
        grads = gradient_clip(grads, clip_norm)
        return loss, grads

    def train(self, train_set, valid_set, hidden_state, num_epochs, lr, clip_norm, train_steps=100, val_steps=100):
        training_loss, validation_loss = [], []
        for epoch in range(num_epochs):
            epoch_training_loss = 0.0
            epoch_validation_loss = 0.0
            min_valid_loss = 1e6

            for train_step in range(train_steps):
                inputs, targets = next(train_set)
                outputs, hidden_states = self.forward(inputs, hidden_state)
                loss, grads = self.backward(inputs, outputs, hidden_states, targets, clip_norm)
                self.params = update_parameters(self.params, grads, lr)
                epoch_training_loss += loss

            for val_step in range(val_steps):
                inputs, targets = next(valid_set)
                outputs, _ = self.forward(inputs, hidden_state)
                loss = find_loss(outputs, targets)
                if loss < min_valid_loss:
                    self.best = self.params.copy()
                    min_valid_loss = loss
                epoch_validation_loss += loss

            training_loss.append(epoch_training_loss/train_steps)
            validation_loss.append(epoch_validation_loss/val_steps)

            if epoch % 1 == 0:
                print(f'Epoch {epoch}, training loss: {training_loss[-1]}, validation loss: {validation_loss[-1]}')
        return training_loss, validation_loss

    def test(self, test_set, hidden_state, test_steps=100):
        test_loss, test_acc = 0.0, 0.0
        for test_step in range(test_steps):
            inputs, targets = next(test_set)
            outputs, _ = self.forward(inputs, hidden_state, best=True)
            test_loss += find_loss(outputs, targets)
            test_acc += find_acc(outputs, targets)
        test_loss /= test_steps
        test_acc /= test_steps
        print("Test loss: ", test_loss)
        print("Test acc: ", test_acc)
        return test_loss, test_acc

In [310]:
### Train Vanilla RNN ###
# Hyperparameters
hidden_size = 16
gradient_clip_norm = 1.0
learning_rate = 0.1
num_epochs = 30

# experiments with Vanilla RNN
num_exp = 3 # number of experiments
test_loss_results, test_acc_results = [], []
for n in range(num_exp):
    print('Experiment:', n)
    rnn_model = RNN(hidden_size=hidden_size, input_size=50, out_size=1)
    hidden_state = np.zeros((hidden_size, 1))  # Initial hidden_state
    train_loss, valid_loss = rnn_model.train(train_set=train_gen, valid_set=val_gen, 
                                            hidden_state=hidden_state, num_epochs=num_epochs, 
                                            lr=learning_rate, clip_norm=gradient_clip_norm, 
                                            train_steps=train_steps, val_steps=val_steps)
    test_loss, test_acc = rnn_model.test(test_set=test_gen, hidden_state=hidden_state, test_steps=test_steps)
    test_loss_results.append(test_loss)
    test_acc_results.append(test_acc)
    # Plot training and validation loss
    epoch = np.arange(len(train_loss))
    plt.figure()
    plt.plot(epoch, train_loss, 'r', label='Training loss',)
    plt.plot(epoch, valid_loss, 'b', label='Validation loss')
    plt.legend()
    plt.xlabel('Epoch'), plt.ylabel('Loss')
    plt.savefig(f'imdb_{seq_len}_exp_{n}_rnn.png')

Experiment: 0
Epoch 0, training loss: 0.7041292843009869, validation loss: 0.6956561066268989
Epoch 1, training loss: 0.6943755685579942, validation loss: 0.6955373235939893
Epoch 2, training loss: 0.6943560163845611, validation loss: 0.695539225683255
Epoch 3, training loss: 0.6943441671218752, validation loss: 0.6955415209174542
Epoch 4, training loss: 0.6943325361760658, validation loss: 0.695543828124069
Epoch 5, training loss: 0.6943210907143421, validation loss: 0.6955461438939571
Epoch 6, training loss: 0.6943098218157912, validation loss: 0.695548466266291
Epoch 7, training loss: 0.694298721134911, validation loss: 0.6955507934941293
Epoch 8, training loss: 0.6942877807944864, validation loss: 0.6955531240259145
Epoch 9, training loss: 0.694276993355988, validation loss: 0.6955554564921139
Epoch 10, training loss: 0.6942663517921929, validation loss: 0.6955577896929163
Epoch 11, training loss: 0.6942558494616208, validation loss: 0.6955601225868572
Epoch 12, training loss: 0.69

KeyboardInterrupt: ignored

In [None]:
print(f'Average imdb test loss: seq_len: {seq_len}, {np.average(test_loss_results)}')
print(f'Average imdb test acc: seq_len: {seq_len}, {np.average(test_acc_results)}')

Average imdb test loss: seq_len: 50, 0.7562751892290948
Average imdb test acc: seq_len: 50, 0.56875


## HORNN

In [None]:
class HORNN():
    def __init__(self, hidden_size, input_size, out_size, order):
        self.order = order
        V = {}
        for i in range(self.order):
            V[-i-1] = np.random.randn(hidden_size, hidden_size)*1 # W_hh
        U = np.random.randn(hidden_size, input_size)*1            # W_hx
        W = np.random.randn(out_size, hidden_size)*1              # W_hy
        b_hidden = np.zeros((hidden_size, 1))                        # b_h
        b_out = np.zeros((out_size, 1))                              # b_y
        
        self.params = [V, U, W, b_hidden, b_out]
        self.best = [V, U, W, b_hidden, b_out]

    def forward(self, inputs, hidden_state, best=False):
        if best: 
            V, U, W, b_hidden, b_out = self.best    
        else: 
            V, U, W, b_hidden, b_out = self.params
        outputs, hidden_states = {}, {}
        for n in range(len(inputs)):
            hidden_states[n] = {}
            for i in range(self.order): # for the history window
                hidden_states[n][-i-1] = np.copy(hidden_state)
            for t in range(len(inputs[n])):
                s = np.zeros_like(hidden_state)
                for i in range(self.order): # order = 3: -1, -2, -3; 
                    s += np.dot(V[-i-1], hidden_states[n][t-i-1])
                hidden_states[n][t] = np.tanh(np.dot(U, inputs[n][t]) + s + b_hidden)
            outputs[n] = sigmoid(np.copy((np.dot(W, hidden_states[n][t]) + b_out).item()))
        return outputs, hidden_states
    
    def backward(self, inputs, outputs, hidden_states, targets, clip_norm):
        V, U, W, b_hidden, b_out = self.params
        dV = {}
        for i in range(self.order):
            dV[-i-1] = np.zeros_like(V[-i-1])
        dU, dW = np.zeros_like(U), np.zeros_like(W)
        db_hidden, db_out = np.zeros_like(b_hidden), np.zeros_like(b_out)

        loss = 0.0
        N, T = len(inputs), len(inputs[0])

        for n in range(N):
            loss += -(targets[n]*np.log(outputs[n]) + (1 - targets[n])*np.log(1 - outputs[n]))
            do = outputs[n] - targets[n]
            dW += np.dot(do, hidden_states[n][T-1].T)    # uses only last hidden state
            db_out += do

            # HORNN update
            #########################
            arrayOfM, arrayOfs = {}, {}
            for i in range(self.order):
                arrayOfM[i], arrayOfs[i] = 0.0, 0.0

            l = np.dot(W.T, do)
            one = 1.0
            for t in reversed(range(T)):
                M = arrayOfM[0]
                for i in range(self.order-1):
                    M = M * arrayOfs[i] + arrayOfM[i+1]
                
                M = M * arrayOfs[self.order-1] + one  # M update, one = 1 only at t + T
                M *= l                                # l is 1 after t = T
                for i in range(self.order-1):
                    arrayOfM[i] = arrayOfM[i+1]
                arrayOfM[self.order-1] = M

                df = (1 - hidden_states[n][t] * hidden_states[n][t])
                r = df * M
                for i in range(self.order):
                    dV[-i-1] += np.dot(r, hidden_states[n][t-i-1].T)
                db_hidden += r
                dU += np.dot(r, inputs[n][t].T)

                for s in range(self.order-1):                   
                    arrayOfs[s] = arrayOfs[s+1] 
                arrayOfs[self.order-1] = np.dot(V[-1].T, df)
                
                one, l = 0.0, 1.0
            #########################

        loss /= N
        for i in range(self.order):
            dV[-i-1] /= N
        dU, dW, db_hidden, db_out = dU/N, dW/N, db_hidden/N, db_out/N
        grads = [dV, dU, dW, db_hidden, db_out]
        grads = gradient_clip(grads, clip_norm, self.order)
        return loss, grads
    
    def train(self, train_set, valid_set, hidden_state, num_epochs, lr, clip_norm, train_steps=100, val_steps=100):
        training_loss, validation_loss = [], []
        for epoch in range(num_epochs):
            epoch_training_loss = 0.0
            epoch_validation_loss = 0.0
            min_valid_loss = 1e6

            for train_step in range(train_steps):
                inputs, targets = next(train_set)
                outputs, hidden_states = self.forward(inputs, hidden_state)
                loss, grads = self.backward(inputs, outputs, hidden_states, targets, clip_norm)
                self.params = update_parameters(self.params, grads, lr, self.order)
                epoch_training_loss += loss

            for val_step in range(val_steps):
                inputs, targets = next(valid_set)
                outputs, _ = self.forward(inputs, hidden_state)
                loss = find_loss(outputs, targets)
                if loss < min_valid_loss:
                    self.best = self.params.copy()
                    min_valid_loss = loss
                epoch_validation_loss += loss

            training_loss.append(epoch_training_loss/train_steps)
            validation_loss.append(epoch_validation_loss/val_steps)

            if epoch % 2 == 0:
                print(f'Epoch {epoch}, training loss: {training_loss[-1]}, validation loss: {validation_loss[-1]}')
        return training_loss, validation_loss

    def test(self, test_set, hidden_state, test_steps=100):
        test_loss, test_acc = 0.0, 0.0
        for test_step in range(test_steps):
            inputs, targets = next(test_set)
            outputs, _ = self.forward(inputs, hidden_state, best=True)
            test_loss += find_loss(outputs, targets)
            test_acc += find_acc(outputs, targets)
        test_loss /= test_steps
        test_acc /= test_steps
        print("Test loss: ", test_loss)
        print("Test acc: ", test_acc)
        return test_loss, test_acc

## Train HORNN

In [None]:
### Train HORNN ###
# Hyperparameters
hidden_size = 16
gradient_clip_norm = 0.1
learning_rate = 0.01
num_epochs = 10
num_exp = 5
b_size = 64
train_steps, val_steps, test_steps = 5, 2, 2
orders = [2, 3, 5]
seq_lens = [50, 100]

In [None]:
hornn_test_results = {}
for seq_len in seq_lens:
    trainx, trainy = filter(train_data, train_labels, seqlen)
    x_train = sequence.pad_sequences(trainx, maxlen=seqlen)
    y_train = np.asarray(trainy).astype('float32')

    testx, testy = filter(test_data, test_labels, seqlen)
    x_test = sequence.pad_sequences(testx, maxlen=seqlen)
    y_test = np.asarray(testy).astype('float32')

    train_gen = data_generator(x_train[:b_size*train_steps], 
                        y_train[:b_size*train_steps], batch_size=b_size)
    val_gen = data_generator(x_test[:b_size*val_steps], 
                    y_test[:b_size*val_steps], batch_size=b_size)
    test_gen = data_generator(x_test[b_size*val_steps:b_size*(val_steps+test_steps)], 
                        y_test[b_size*val_steps:b_size*(val_steps+test_steps)], batch_size=b_size)

    hornn_test_results[seq_len] = {}
    for order in orders:
        hornn_test_results[seq_len][order] = {}
        test_loss_results, test_acc_results = [], []
        for n in range(num_exp):
            print(f'HORNN {order} Experiment: {n}; seq_len: {seq_len}')
            hornn_model = HORNN(hidden_size=hidden_size, input_size=10000, out_size=1, order=order)
            hidden_state = np.zeros((hidden_size, 1))  # Initial hidden_state
            hornn_train_loss, hornn_valid_loss = hornn_model.train(train_set=train_gen, valid_set=val_gen, 
                                                    hidden_state=hidden_state, num_epochs=num_epochs, 
                                                    lr=learning_rate, clip_norm=gradient_clip_norm, 
                                                    train_steps=train_steps, val_steps=val_steps)
            hornn_test_loss, hornn_test_acc = hornn_model.test(test_set=test_gen, hidden_state=hidden_state, test_steps=test_steps)
            test_loss_results.append(test_loss)
            test_acc_results.append(test_acc)
            # Plot training and validation loss
            epoch = np.arange(len(train_loss))
            plt.figure()
            plt.plot(epoch, train_loss, 'r', label='Training loss',)
            plt.plot(epoch, valid_loss, 'b', label='Validation loss')
            plt.legend()
            plt.xlabel('Epoch'), plt.ylabel('Loss')
            plt.savefig(f'imdb_{seq_len}__hornn{order}_exp_{n}.png')
        
        print(f'Seq_len: {seq_len}; HORNN-{order} average imdb test loss: {np.average(test_loss_results)}')
        print(f'Seq_len: {seq_len}; HORNN-{order} average imdb test acc: {np.average(test_acc_results)}')
        hornn_test_results[seq_len][order]['loss'] = np.average(test_loss_results)
        hornn_test_results[seq_len][order]['acc'] = np.average(test_acc_results)
        hornn_test_results[seq_len][order]['all_loss'] = test_loss_results
        hornn_test_results[seq_len][order]['all_acc'] = test_acc_results