# Introduction

This notebook discusses the use of recurrent neural networks for sequence modelling tasks and structured prediction problems. A recurrent network has feedback connections that allow the state of a layer in the network at one time step to affect the state of the same layer at the next time step. 

In [6]:
import numpy as np
import nltk
import sys
import re
import string
import collections
from nltk.stem.snowball import SnowballStemmer

# NLTK resources for English text. `stopwords` and `stemmer` are created here
# but are not referenced by any of the code visible in this part of the notebook.
stopwords = nltk.corpus.stopwords.words('english')
tokenizer = nltk.load('tokenizers/punkt/english.pickle')
stemmer = SnowballStemmer('english')

# A tokenised sentence must contain more than this many tokens to be kept.
min_len = 10

# The corpus file holds several documents, each closed by a '</doc>' tag.
with open('data/sample.txt','r') as f:
    text = f.read().split('</doc>')

# Strip angle-bracket markup (the pattern is greedy, but '.' stops at line
# breaks), then replace every newline with a space so each document is one line.
# NOTE: this cell relies on Python 2 byte-string semantics (decode/translate).
docs = [' '.join(re.sub("<.*>", "", raw_doc).split('\n')) for raw_doc in text]

# Interpret backslash escapes, drop every non-ASCII character, and split each
# document into sentences with the Punkt tokenizer.
docs = [
    tokenizer.tokenize(flat_doc.decode('unicode_escape').encode('ascii','ignore'))
    for flat_doc in docs
]

# Pool the sentences of all documents. For each one: delete punctuation and
# digits, lowercase, word-tokenise, keep it only if it has more than `min_len`
# tokens, and close every kept sentence with an explicit '.' token.
sen_list = []
for doc_sentences in docs:
    for sentence in doc_sentences:
        cleaned = sentence.translate(None, string.punctuation)
        cleaned = cleaned.translate(None, '1234567890')
        tokens = nltk.word_tokenize(cleaned.lower())
        if len(tokens) > min_len:
            sen_list.append(tokens + ['.'])

# Build vocab from sentence list
def flatten(lst, acc=None):
    """Recursively flatten arbitrarily nested lists into a single flat list.

    Parameters
    ----------
    lst : list
        A list whose items are either leaf values or further (nested) lists.
        Only ``list`` instances (including subclasses) are descended into;
        tuples, strings and other iterables are treated as leaves.
    acc : list, optional
        Accumulator that receives the leaves in depth-first, left-to-right
        order. A fresh list is created when it is omitted.

    Returns
    -------
    list
        ``acc`` itself (extended in place) containing every leaf of ``lst``.
    """
    # Create the accumulator per call; a mutable default would be shared
    # between unrelated calls.
    if acc is None:
        acc = []
    for item in lst:
        if isinstance(item, list):
            flatten(item, acc)
        else:
            acc.append(item)
    return acc

# Collapse the tokenised sentences into one long stream of word tokens.
words = flatten(sen_list, [])

# Token frequencies; only the distinct keys are used to build the vocabulary.
counts = collections.Counter(words)

# Vocabulary: every distinct token in sorted order, followed by an 'UNK'
# placeholder appended as the last entry.
vocab = sorted(counts)
vocab.append('UNK')

# Lookup tables between tokens and their integer position in `vocab`.
wrd_to_ind = {wrd: ind for ind, wrd in enumerate(vocab)}
ind_to_wrd = {ind: wrd for ind, wrd in enumerate(vocab)}

# Training stream: any token missing from the vocabulary becomes 'UNK'.
# Membership is tested against the dict (constant time) rather than the
# `vocab` list, which would be scanned linearly for every token.
data = [w if w in wrd_to_ind else 'UNK' for w in words]

In [11]:
def sequences(seqlen=10, iters=10, corpus=None):
    """Yield (input, target) training windows from a token stream.

    Each window pairs ``seqlen`` consecutive tokens with the same span shifted
    one position to the right, so the target at every step is the next token.

    Parameters
    ----------
    seqlen : int
        Number of tokens in every input and target window.
    iters : int
        Number of complete passes made over the token stream.
    corpus : list, optional
        Token stream to slice. When omitted, the module-level ``data`` list
        is used.

    Yields
    ------
    tuple of (list, list)
        ``(x_words, y_words)``, both of length ``seqlen``.
    """
    tokens = data if corpus is None else corpus
    # A window starting at `start` consumes tokens[start : start + seqlen + 1],
    # so only the first len(tokens) - seqlen positions give full-length pairs.
    # (The bound follows `seqlen` rather than a fixed 10; a non-positive count
    # simply produces no windows.)
    n_starts = len(tokens) - seqlen
    for _epoch in range(iters):
        for start in range(n_starts):
            x_words = tokens[start:start + seqlen]
            y_words = tokens[start + 1:start + seqlen + 1]
            yield x_words, y_words

In [12]:
class RNN(object):
    """Single-hidden-layer recurrent network for next-word prediction.

    The forward pass is::

        h[t] = tanh(U . x[t] + W . h[t-1] + bh)
        y[t] = softmax(V . h[t])

    where ``x[t]`` is the one-hot encoding of the word at step ``t``. Training
    uses backpropagation through time with per-parameter gradient-norm
    clipping and plain gradient-descent updates.

    Parameters
    ----------
    vocab : list
        Vocabulary; a word's position in this list is its integer index.
    dim : int
        Size of the hidden state.
    eps : float
        Weights are drawn uniformly from ``[-eps, eps)``.
    """

    def __init__(self, vocab, dim, eps=0.05):
        n_words = len(vocab)

        # Randomly initialize the three weight matrices
        self.U = np.random.random((dim, n_words))*eps*2-eps   # input  -> hidden
        self.W = np.random.random((dim, dim))*eps*2-eps       # hidden -> hidden
        self.V = np.random.random((n_words, dim))*eps*2-eps   # hidden -> output
        # Per-time-step caches filled by get_activities and read by train.
        self.xs, self.hs, self.ys = {}, {}, {}
        self.bh = np.zeros(dim)

        self.vocab = vocab
        # Own word <-> index tables, so the model always agrees with the
        # vocabulary it was built from instead of relying on module globals.
        self.wrd_to_ind = {wrd: ind for ind, wrd in enumerate(vocab)}
        self.ind_to_wrd = {ind: wrd for ind, wrd in enumerate(vocab)}

    def get_onehot(self, ind):
        """Return a vocabulary-sized vector of zeros with a 1 at ``ind``."""
        onehot = np.zeros(len(self.vocab))
        onehot[ind] = 1
        return onehot

    @staticmethod
    def softmax(z):
        """Return the softmax of ``z`` along axis 0.

        The maximum is subtracted before exponentiating; this leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        (and producing NaN) when the logits are large.
        """
        exps = np.exp(z - np.max(z, axis=0))
        return exps / np.sum(exps, axis=0)

    @staticmethod
    def _clip(grad, max_norm=5):
        """Rescale ``grad`` to norm ``max_norm`` if its norm exceeds it."""
        norm = np.linalg.norm(grad)
        if norm > max_norm:
            return max_norm * grad / norm
        return grad

    def get_activities(self, seq_in):
        """Run the forward pass over a sequence of word indices.

        Stores, for every step ``t``, the one-hot input in ``self.xs[t]``, the
        hidden state in ``self.hs[t]`` and the output distribution in
        ``self.ys[t]``. The initial hidden state ``self.hs[-1]`` is all zeros.
        """
        self.hs[-1] = np.zeros(len(self.W))
        for t in range(len(seq_in)):
            self.xs[t] = self.get_onehot(seq_in[t])
            self.hs[t] = np.tanh(np.dot(self.U, self.xs[t])+np.dot(self.W, self.hs[t-1])+self.bh)
            self.ys[t] = self.softmax(np.dot(self.V, self.hs[t]))

    def train(self, rate=0.05, seqs=None):
        """Train on (input words, target words) pairs, one update per pair.

        Parameters
        ----------
        rate : float
            Learning rate of the gradient-descent update.
        seqs : iterable of (list, list), optional
            Pairs of equal-length word sequences. When omitted, the pairs come
            from the module-level ``sequences()`` generator.

        Raises
        ------
        KeyError
            If a word is not part of this model's vocabulary.
        """
        if seqs is None:
            seqs = sequences()

        for words_in, words_out in seqs:
            xs = np.array([self.wrd_to_ind[wrd] for wrd in words_in])
            ts = np.array([self.wrd_to_ind[wrd] for wrd in words_out])

            self.get_activities(xs)

            U_grad = np.zeros_like(self.U)
            W_grad = np.zeros_like(self.W)
            V_grad = np.zeros_like(self.V)
            bh_grad = np.zeros_like(self.bh)

            # h_grads[t] is the loss gradient w.r.t. the pre-activation of
            # h[t]; nothing flows back from beyond the final step.
            h_grads = {len(ts): np.zeros(len(self.W))}

            for t in reversed(range(len(ts))):
                # Softmax + cross-entropy: gradient w.r.t. the output logits.
                y_grad = self.ys[t] - self.get_onehot(ts[t])
                back = np.dot(self.V.T, y_grad) + np.dot(self.W.T, h_grads[t+1])
                h_grads[t] = back * (1 - self.hs[t]**2)  # tanh derivative

                U_grad += np.outer(h_grads[t], self.xs[t])
                # W maps h[t] into step t+1, hence the shifted index. The
                # pairing (h_grads[0], h[-1]) is never added, which is
                # harmless because h[-1] is all zeros.
                W_grad += np.outer(h_grads[t+1], self.hs[t])
                V_grad += np.outer(y_grad, self.hs[t])
                bh_grad += h_grads[t]

            # Clip gradients to avoid explosions
            self.U += -rate * self._clip(U_grad)
            self.W += -rate * self._clip(W_grad)
            self.V += -rate * self._clip(V_grad)
            self.bh += -rate * self._clip(bh_grad)

    def predict(self, start, steps=5):
        """Greedily generate ``steps`` words that follow ``start``.

        At every step the most probable next word (argmax of the output
        distribution) is emitted and fed back in as the next input.

        Returns
        -------
        list
            The generated words; ``start`` itself is not included.

        Raises
        ------
        KeyError
            If ``start`` is not part of this model's vocabulary.
        """
        hs, ys = {}, {}
        output = []
        word = start
        hs[-1] = np.zeros(len(self.W))

        for t in range(steps):
            ind = self.wrd_to_ind[word]
            hs[t] = np.tanh(np.dot(self.U, self.get_onehot(ind))+np.dot(self.W, hs[t-1])+self.bh)
            ys[t] = self.softmax(np.dot(self.V, hs[t]))
            word = self.ind_to_wrd[np.argmax(ys[t])]
            output.append(word)
        return output

In [13]:
# RNN draws its initial weights from NumPy's global random generator, so fix
# the seed first: re-running the notebook then trains the same model and gives
# the same predictions.
np.random.seed(0)
test = RNN(vocab, dim=1500)
test.train()

In [14]:
# For each of the first 15 vocabulary entries, print the entry followed by the
# five words the trained model generates after it, then a blank separator line.
for seed_word in vocab[:15]:
    print(seed_word)
    print(test.predict(seed_word, steps=5))
    print('')

.
['as', 'an', 'antidogmatic', 'philosophy', 'anarchism']

a
['political', 'philosophy', 'that', 'advocates', 'stateless']

advocates
['stateless', 'societies', 'often', 'defined', 'as']

an
['antidogmatic', 'philosophy', 'anarchism', 'draws', 'on']

anarchism
['entails', 'opposing', 'authority', 'or', 'hierarchical']

and
['philosophy', 'anarchism', 'draws', 'on', 'many']

antidogmatic
['philosophy', 'anarchism', 'draws', 'on', 'many']

antistatism
['is', 'central', 'anarchism', 'entails', 'opposing']

as
['an', 'antidogmatic', 'philosophy', 'anarchism', 'draws']

associations
['.', 'while', 'antistatism', 'is', 'central']

authority
['or', 'hierarchical', 'organisation', 'in', 'the']

authors
['have', 'defined', 'as', 'more', 'specific']

based
['on', 'nonhierarchical', 'free', 'associations', '.']

but
['not', 'limited', 'to', 'the', 'state']

central
['anarchism', 'entails', 'opposing', 'authority', 'or']

