# Recurrent layers for variable-length data

In [12]:
import numpy as np
import re

## Prepare data

In [24]:
# Matches every run of characters that is neither an ASCII letter nor a space
# (digits, punctuation, tabs and newlines all fall in this class).
non_letters = re.compile('[^A-Za-z ]+')

with open('../data/tasksv11/en/qa1_single-supporting-fact_train.txt') as f:
    raw = f.readlines()

print(len(raw))

tokens = []
for line in raw:
    words = non_letters.sub('', line).lower().split(' ')
    # The first element is discarded; presumably it is the empty string left
    # behind where a leading line number was stripped -- confirm against the data file.
    tokens.append(words[1:])

print(tokens[:3])

3000
[['john', 'went', 'to', 'the', 'hallway'], ['where', 'is', 'mary', 'bathroom'], ['daniel', 'went', 'back', 'to', 'the', 'hallway'], ['sandra', 'moved', 'to', 'the', 'garden'], ['where', 'is', 'daniel', 'hallway'], ['john', 'moved', 'to', 'the', 'office'], ['sandra', 'journeyed', 'to', 'the', 'bathroom'], ['where', 'is', 'daniel', 'hallway'], ['mary', 'moved', 'to', 'the', 'hallway'], ['daniel', 'travelled', 'to', 'the', 'office'], ['where', 'is', 'daniel', 'office'], ['john', 'went', 'back', 'to', 'the', 'garden'], ['john', 'moved', 'to', 'the', 'bedroom'], ['where', 'is', 'sandra', 'bathroom'], ['sandra', 'travelled', 'to', 'the', 'office'], ['sandra', 'went', 'to', 'the', 'bathroom'], ['where', 'is', 'sandra', 'bathroom'], ['mary', 'went', 'to', 'the', 'bedroom'], ['daniel', 'moved', 'to', 'the', 'hallway'], ['where', 'is', 'sandra', 'bathroom'], ['john', 'went', 'to', 'the', 'garden'], ['john', 'travelled', 'to', 'the', 'office'], ['where', 'is', 'sandra', 'bathroom'], ['daniel

In [21]:
# Collect every distinct word that occurs in any tokenized sentence.
# The set is sorted before being frozen into a list: iteration order of a set
# of strings can change from one interpreter run to the next, which would give
# each word a different index (and therefore a different embedding row) on
# every fresh kernel. Sorting makes the word -> index mapping reproducible.
vocab = sorted({word for sent in tokens for word in sent})

# Position of each word inside `vocab`.
word2idx = {w: idx for (idx, w) in enumerate(vocab)}

def word2ids(sentence):
    """Convert an iterable of words into the list of their `word2idx` indices.

    Raises KeyError if a word is not a key of `word2idx`.
    """
    ids = []
    for token in sentence:
        ids.append(word2idx[token])
    return ids

## Create model

In [44]:
def softmax(x):
    """Return exp(x - max(x)) normalized by its sum along axis 0.

    The global maximum of `x` is subtracted before exponentiating so that the
    largest exponent is 0, which avoids overflow for large inputs.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    normalizer = exps.sum(axis=0)
    return exps / normalizer

# Seed NumPy's global generator so the random initial weights below (and hence
# the whole training run) are the same on every Restart & Run All.
np.random.seed(0)

# Width of the word embeddings and of the recurrent hidden state.
embed_size = 10

# One embedding row per vocabulary word, uniform in [-0.05, 0.05).
embed = (np.random.rand(len(vocab), embed_size) - 0.5) * 0.1

# Hidden-to-hidden transition, initialised to the identity matrix.
recurrent = np.eye(embed_size)

# Hidden state used before any word has been consumed.
start = np.zeros(embed_size)

# Maps a hidden state to one score per vocabulary word, uniform in [-0.05, 0.05).
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1

# Row i is the one-hot vector for word index i.
one_hot = np.eye(len(vocab))

def predict(sent):
    """Run the recurrent network forward over a sentence of word indices.

    For every position the current hidden state is first decoded into a
    softmax distribution over the vocabulary, and the negative log of the
    probability given to the word at that position is added to the loss.
    The hidden state is then advanced as
    ``hidden.dot(recurrent) + embed[word]``.

    Args:
        sent: sequence of integer word indices.

    Returns:
        A ``(layers, loss)`` tuple. ``layers[0]`` is ``{'hidden': start}``;
        for ``k >= 1``, ``layers[k]['pred']`` is the distribution that was
        scored against ``sent[k - 1]`` and ``layers[k]['hidden']`` is the
        state after consuming ``sent[k - 1]``. ``loss`` is the summed
        negative log-likelihood (``0`` when ``sent`` is empty).
    """
    layers = [{'hidden': start}]
    loss = 0

    for target_i in range(len(sent)):
        prev_hidden = layers[-1]['hidden']

        # Guess the word at this position before it has been seen.
        layer = {'pred': softmax(prev_hidden.dot(decoder))}
        loss += -np.log(layer['pred'][sent[target_i]])

        # Fold the actual word into the hidden state for the next position.
        layer['hidden'] = prev_hidden.dot(recurrent) + embed[sent[target_i]]

        layers.append(layer)

    return layers, loss

def train(epochs=30000, alpha=0.001):
    """Fit the global network parameters with one sentence per iteration.

    Iteration ``e`` uses sentence ``e % len(tokens)`` with its first token
    dropped, runs ``predict`` on it, back-propagates through the stored
    layers and updates ``start``, ``embed``, ``decoder`` and ``recurrent``
    in place. Every update is scaled by ``alpha`` divided by the sentence
    length.

    Sentences that are empty after dropping the first token are skipped,
    because there is nothing to predict and the length-normalised update is
    undefined for them.

    Prints the perplexity of the current sentence every 1000 iterations and
    calls ``print_predictions(4)`` every 5000 iterations (including iteration
    0), so ``print_predictions`` must already be defined when this runs.

    Args:
        epochs: number of single-sentence iterations to run.
        alpha: learning rate.
    """
    global start, embed, decoder, recurrent

    for e in range(epochs):
        sent = word2ids(tokens[e % len(tokens)][1:])
        if not sent:
            # Previously this case raised IndexError on sent[-1] below.
            continue

        layers, loss = predict(sent)
        n_words = float(len(sent))

        # Backward pass, from the last layer to the start state.
        for layer_idx in reversed(range(len(layers))):
            layer = layers[layer_idx]

            if layer_idx > 0:
                # layers[layer_idx]['pred'] was scored against this word in predict().
                target = sent[layer_idx - 1]
                layer['output_delta'] = layer['pred'] - one_hot[target]
                new_hidden_delta = layer['output_delta'].dot(decoder.T)

                if layer_idx == len(layers) - 1:
                    # Last layer: no later layer feeds a gradient back.
                    layer['hidden_delta'] = new_hidden_delta
                else:
                    layer['hidden_delta'] = new_hidden_delta + layers[layer_idx + 1]['hidden_delta'].dot(recurrent.T)
            else:
                # Layer 0 only holds the start state; it has no prediction of its own.
                layer['hidden_delta'] = layers[layer_idx + 1]['hidden_delta'].dot(recurrent.T)

        # Weight updates. Note that layers[0]['hidden'] is the `start` array
        # itself, so the in-place update below is visible through it.
        start -= layers[0]['hidden_delta'] * alpha / n_words
        for layer_idx, layer in enumerate(layers[1:]):
            # Here layer_idx indexes the layer *before* `layer` in `layers`.
            decoder -= np.outer(layers[layer_idx]['hidden'], layer['output_delta']) * alpha / n_words

            embed_idx = sent[layer_idx]
            embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * alpha / n_words

            recurrent -= np.outer(layers[layer_idx]['hidden'], layer['hidden_delta']) * alpha / n_words

        if e % 1000 == 0:
            print(f"Perplexity: {np.exp(loss / len(sent))}")

        if e % 5000 == 0:
            print_predictions(4)

In [45]:
# Train with the defaults (30000 iterations, alpha=0.001).
# NOTE(review): train() calls print_predictions on its very first iteration,
# but that function is defined in a later cell of this notebook. On a fresh
# kernel that definition has to be executed before this cell, otherwise this
# call raises NameError.
train()

Perplexity: 19.038600445616606
['sandra', 'moved', 'to', 'the', 'garden']
Prev input: sandra       True: moved        Pred: moved
Prev input: moved        True: to           Pred: journeyed
Prev input: to           True: the          Pred: where
Prev input: the          True: garden       Pred: journeyed
Perplexity: 18.991749619738346
Perplexity: 19.031890278680923
Perplexity: 18.989474996283676
Perplexity: 18.877774741206785
Perplexity: 19.026465936879486
['sandra', 'moved', 'to', 'the', 'garden']
Prev input: sandra       True: moved        Pred: the
Prev input: moved        True: to           Pred: the
Prev input: to           True: the          Pred: the
Prev input: the          True: garden       Pred: the
Perplexity: 18.758754106268523
Perplexity: 18.33733233463158
Perplexity: 19.010110531575585
Perplexity: 16.7176578580386
Perplexity: 13.73486367452972
['sandra', 'moved', 'to', 'the', 'garden']
Prev input: sandra       True: moved        Pred: the
Prev input: moved        True: t

## Analyze results

In [46]:
def print_predictions(sent_index):
    """Print the model's next-word guess beside each word of one sentence.

    The sentence ``tokens[sent_index]`` is printed, followed by one row per
    adjacent word pair showing the previous word, the true next word and the
    vocabulary word with the highest predicted probability.

    The model is fed the sentence without its first token, exactly as
    ``train`` does. ``predict`` stores in ``layers[k]`` the distribution it
    scored against word ``k - 1`` of its input, so with this input
    ``layers[i + 1]['pred']`` is the guess for ``words[i + 1]``. Feeding the
    full sentence instead would condition every guess on inputs shifted by
    one word relative to training.

    Args:
        sent_index: index into ``tokens`` of the sentence to display.
    """
    words = tokens[sent_index]
    layers, _ = predict(word2ids(words[1:]))
    print(words)
    for i, layer in enumerate(layers[1:]):
        inpt = words[i]
        true = words[i + 1]
        pred = vocab[layer['pred'].argmax()]
        print(f"Prev input: {inpt:12} True: {true:12} Pred: {pred}")
        
# Show the model's guesses for the sentence at index 4 of `tokens`.
print_predictions(4)

['sandra', 'moved', 'to', 'the', 'garden']
Prev input: sandra       True: moved        Pred: to
Prev input: moved        True: to           Pred: to
Prev input: to           True: the          Pred: the
Prev input: the          True: garden       Pred: bedroom
