# RNN Recurrent Neural Networks
### From scratch

FFNN (feed-forward neural network)

$ \large h_t = \sigma_h(W_h x_t + b_h) $  
$ \large o_t = \sigma_h(W_o h_t + b_o) $

TDNN (time-delay neural network)

$ \large h_t = \sigma_h(W_h x_t + V x_{t-1} + b_h) $  
$ \large o_t = \sigma_h(W_o h_t + b_o) $

Elman RNN

$ \large h_t = \sigma_h(W_h x_t + V h_{t-1} + b_h) $  
$ \large o_t = \sigma_h(W_o h_t + b_o) $

Jordan RNN

$ \large h_t = \sigma_h(W_h x_t + V o_{t-1} + b_h) $  
$ \large o_t = \sigma_h(W_o h_t + b_o) $

## Imports

In [9]:
import numpy as np
import csv
import numpy as np
import itertools
from datetime import datetime
import sys
import nltk
#nltk.download('punkt')

## 1. Activation functions

$ \large \sigma(x) = \frac{1}{1 + e^{-x}} \qquad \tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$

In [10]:
class Sigmoid:
    """Logistic sigmoid activation, s(x) = 1 / (1 + exp(-x)), applied element-wise."""

    def forward(self, x):
        """Return the sigmoid of ``x``."""
        denominator = 1.0 + np.exp(-x)
        return 1.0 / denominator

    def backward(self, x, topDiff):
        """Return the gradient with respect to ``x``.

        ``topDiff`` is the gradient flowing in from the layer above; it is
        scaled by the local derivative s(x) * (1 - s(x)).
        """
        activated = self.forward(x)
        localGrad = (1.0 - activated) * activated
        return localGrad * topDiff

class Tanh:
    """Hyperbolic tangent activation, applied element-wise."""

    def forward(self, x):
        """Return tanh of ``x``."""
        return np.tanh(x)

    def backward(self, x, topDiff):
        """Return the gradient with respect to ``x``.

        ``topDiff`` is the upstream gradient; the local derivative of tanh
        is 1 - tanh(x)^2.
        """
        activated = self.forward(x)
        localGrad = 1.0 - activated * activated
        return localGrad * topDiff

## 2. Gates

In [11]:
class AddGate:
    """Computation-graph node for element-wise addition."""

    def forward(self, x1, x2):
        """Return ``x1 + x2``."""
        total = x1 + x2
        return total

    def backward(self, x1, x2, dz):
        """Return the gradients ``(dx1, dx2)`` for upstream gradient ``dz``.

        Addition passes the upstream gradient through unchanged; multiplying
        by ``ones_like`` gives each result the shape of its own operand.
        """
        return np.ones_like(x1) * dz, np.ones_like(x2) * dz

class MultiplyGate:
    """Computation-graph node for the matrix product ``W . x``."""

    def forward(self, W, x):
        """Return ``np.dot(W, x)``."""
        return np.dot(W, x)

    def backward(self, W, x, dz):
        """Return the gradients ``(dW, dx)`` for upstream gradient ``dz``.

        ``dW`` is ``dz^T . x`` with both operands promoted to at least 2-D,
        which for 1-D ``dz`` and ``x`` is their outer product. ``dx`` is
        ``W^T . dz``.
        """
        # np.atleast_2d turns a 1-D vector into a single row, exactly like the
        # np.asmatrix call it replaces, but yields a plain ndarray instead of
        # the np.matrix class that NumPy no longer recommends.
        dW = np.dot(np.atleast_2d(dz).T, np.atleast_2d(x))
        dx = np.dot(np.transpose(W), dz)
        return dW, dx

## 3. Layers

In [20]:
class Layer:
    """One time step of the recurrent network.

    The forward pass stores every intermediate on the instance:
    ``mulu = U.x``, ``mulw = W.prev``, ``add = mulw + mulu``,
    ``s = tanh(add)`` (the hidden state) and ``mulv = V.s`` (the output
    scores).
    """

    def __init__(self):
        self.mulGate, self.addGate = MultiplyGate(), AddGate()
        self.activation = Tanh()

    def forward(self, x, prev, U, W, V):
        """Run the step for input ``x`` and previous hidden state ``prev``.

        Nothing is returned; results are read from ``self.s`` and
        ``self.mulv``.
        """
        multiply = self.mulGate.forward
        self.mulu = multiply(U, x)
        self.mulw = multiply(W, prev)
        self.add = self.addGate.forward(self.mulw, self.mulu)
        self.s = self.activation.forward(self.add)
        self.mulv = multiply(V, self.s)

    def backward(self, x, prev, U, W, V, diff, dmulv):
        """Back-propagate through this step.

        ``dmulv`` is the gradient arriving at the output scores and ``diff``
        the gradient arriving at the hidden state from the following time
        step. Returns ``(dprev, dU, dW, dV)``, where ``dprev`` is the gradient
        with respect to the previous hidden state.
        """
        # Recompute the intermediates so they match the arguments passed here.
        self.forward(x, prev, U, W, V)
        mulBackward = self.mulGate.backward
        dV, dsFromOutput = mulBackward(V, self.s, dmulv)
        # The hidden state receives gradient from the output and from the future.
        dadd = self.activation.backward(self.add, dsFromOutput + diff)
        dmulw, dmulu = self.addGate.backward(self.mulw, self.mulu, dadd)
        dW, dprev = mulBackward(W, prev, dmulw)
        # The gradient with respect to the input itself is not needed.
        dU, _ = mulBackward(U, x, dmulu)
        return dprev, dU, dW, dV

## 4. Output

In [21]:
class Softmax:
    """Softmax output with cross-entropy loss over a single score vector."""

    def predict(self, x):
        """Return the softmax probabilities of the scores ``x``.

        The maximum score is subtracted before exponentiating. This leaves
        the result mathematically unchanged but keeps ``np.exp`` from
        overflowing to ``inf`` (and the output from becoming NaN) when the
        scores are large.
        """
        scores = np.asarray(x)
        shifted = scores - np.max(scores)
        expScores = np.exp(shifted)
        return expScores / np.sum(expScores)

    def loss(self, x, y):
        """Return the cross-entropy loss ``-log(softmax(x)[y])``.

        Computed as ``log(sum(exp(shifted))) - shifted[y]`` so that neither a
        very large score nor a vanishingly small probability produces
        ``inf`` or NaN.
        """
        scores = np.asarray(x)
        shifted = scores - np.max(scores)
        return np.log(np.sum(np.exp(shifted))) - shifted[y]

    def diff(self, x, y):
        """Return the gradient of the loss with respect to the scores ``x``.

        This is the probability vector with 1 subtracted at the target index
        ``y``. ``predict`` returns a fresh array, so ``x`` is not modified.
        """
        probs = self.predict(x)
        probs[y] -= 1.0
        return probs

## 5. RNN

In [22]:
class RNN:
    """Word-level recurrent network trained with truncated back-propagation through time.

    ``U`` maps the one-hot input to the hidden state, ``W`` maps the previous
    hidden state to the current one, and ``V`` maps the hidden state to the
    output scores.
    """

    def __init__(self, wordDim, nHid=100, bpttTrunc=4):
        """Store the dimensions and draw the initial weights.

        Each matrix is sampled uniformly from ``[-1/sqrt(n), 1/sqrt(n)]``
        where ``n`` is the size of the vector that matrix is applied to.
        """
        self.wordDim = wordDim
        self.nHid = nHid
        self.bpttTrunc = bpttTrunc
        inputBound = np.sqrt(1. / wordDim)
        hiddenBound = np.sqrt(1. / nHid)
        # The draw order U, W, V is kept so a fixed seed gives the same weights.
        self.U = np.random.uniform(-inputBound, inputBound, (nHid, wordDim))
        self.W = np.random.uniform(-hiddenBound, hiddenBound, (nHid, nHid))
        self.V = np.random.uniform(-hiddenBound, hiddenBound, (wordDim, nHid))

    def _OneHot(self, index):
        """Return a length-``wordDim`` vector that is 1 at ``index`` and 0 elsewhere."""
        encoded = np.zeros(self.wordDim)
        encoded[index] = 1
        return encoded

    def Forward(self, x):
        """Run the network over the word-index sequence ``x``.

        Returns one ``Layer`` per time step, each holding its hidden state
        ``s`` and output scores ``mulv``. For example, for
        x = [0, 179, 341, 416] the matching targets are
        y = [179, 341, 416, 1].
        """
        layers = []
        hidden = np.zeros(self.nHid)
        for wordIndex in x:
            step = Layer()
            step.forward(self._OneHot(wordIndex), hidden, self.U, self.W, self.V)
            hidden = step.s
            layers.append(step)
        return layers

    def Predict(self, x):
        """Return the most probable next-word index at every time step of ``x``."""
        softmax = Softmax()
        return [np.argmax(softmax.predict(step.mulv)) for step in self.Forward(x)]

    def CalcLoss(self, x, y):
        """Return the mean cross-entropy loss of one sequence ``x`` against targets ``y``."""
        assert len(x) == len(y)
        softmax = Softmax()
        total = 0.0
        for step, target in zip(self.Forward(x), y):
            total += softmax.loss(step.mulv, target)
        return total / float(len(y))

    def CalcTotalLoss(self, X, Y):
        """Return the average of ``CalcLoss`` over all sequences in ``X`` / ``Y``."""
        total = 0.0
        for i, targets in enumerate(Y):
            total += self.CalcLoss(X[i], targets)
        return total / float(len(Y))

    def Bptt(self, x, y):
        """Compute the gradients ``(dU, dW, dV)`` for one sequence.

        For every time step the output error is pushed back through that step
        and then through at most ``bpttTrunc`` earlier steps.
        """
        assert len(x) == len(y)
        softmax = Softmax()
        layers = self.Forward(x)
        dU = np.zeros(self.U.shape)
        dV = np.zeros(self.V.shape)
        dW = np.zeros(self.W.shape)

        hiddenBefore = np.zeros(self.nHid)
        # No gradient reaches the hidden state of step t from later steps here.
        noFutureGrad = np.zeros(self.nHid)
        for t, step in enumerate(layers):
            dScores = softmax.diff(step.mulv, y[t])
            dprev, dU_t, dW_t, dV_t = step.backward(
                self._OneHot(x[t]), hiddenBefore, self.U, self.W, self.V, noFutureGrad, dScores)
            hiddenBefore = step.s
            # Earlier steps contribute only through the hidden state, so their
            # output-score gradient is zero.
            noScoreGrad = np.zeros(self.wordDim)
            oldest = max(-1, t - self.bpttTrunc - 1)
            for i in range(t - 1, oldest, -1):
                if i == 0:
                    hiddenBeforeI = np.zeros(self.nHid)
                else:
                    hiddenBeforeI = layers[i - 1].s
                dprev, dU_i, dW_i, _ = layers[i].backward(
                    self._OneHot(x[i]), hiddenBeforeI, self.U, self.W, self.V, dprev, noScoreGrad)
                dU_t += dU_i
                dW_t += dW_i
            dV += dV_t
            dU += dU_t
            dW += dW_t
        return dU, dW, dV

    def SgdStep(self, x, y, m):
        """Apply one gradient-descent update with learning rate ``m`` for the sequence ``x`` / ``y``."""
        gradU, gradW, gradV = self.Bptt(x, y)
        self.U -= m * gradU
        self.W -= m * gradW
        self.V -= m * gradV

    def Train(self, X, Y, m=0.005, epochs=100, evalLoss=5):
        """Train with per-sequence SGD and return the recorded losses.

        Every ``evalLoss`` epochs the total loss is computed, printed and
        appended to the result as ``(examples seen, loss)``. The learning rate
        ``m`` is halved whenever that loss is higher than the previous one.
        """
        nExamples = 0
        losses = []
        for epoch in range(epochs):
            if epoch % evalLoss == 0:
                currentLoss = self.CalcTotalLoss(X, Y)
                losses.append((nExamples, currentLoss))
                stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after n examples = %d epoch=%d: %f" % (stamp, nExamples, epoch, currentLoss))
                # Halve the learning rate if the loss went up since the last evaluation.
                lossWentUp = len(losses) > 1 and losses[-1][1] > losses[-2][1]
                if lossWentUp:
                    m = m * 0.5
                    print("Setting learning rate to %f" % m)
                sys.stdout.flush()
            # One SGD step per training sequence.
            for i, targets in enumerate(Y):
                self.SgdStep(X[i], targets, m)
                nExamples += 1
        return losses

## Testing : Preprocess

In [23]:
def GetSentenceData(path, vocabDim=8000):
    """Build next-word-prediction training data from a CSV file of text.

    The first column of every non-empty row is lower-cased, split into
    sentences, wrapped in SENTENCE_START / SENTENCE_END markers and
    tokenized. Sentences with three tokens or fewer (markers included) are
    dropped. The ``vocabDim - 1`` most frequent tokens plus UNKNOWN_TOKEN
    form the vocabulary; every other token is replaced by UNKNOWN_TOKEN.

    Args:
        path: Path of the UTF-8 encoded CSV file.
        vocabDim: Vocabulary size, including the unknown token.

    Returns:
        ``(XTrain, yTrain)``: two 1-D object arrays with one entry per
        sentence. Each entry is a list of word indices; ``yTrain[i]`` is
        ``XTrain[i]`` shifted left by one token.
    """
    unknown_token = "UNKNOWN_TOKEN"
    sentence_start_token = "SENTENCE_START"
    sentence_end_token = "SENTENCE_END"

    # Read the data and append SENTENCE_START and SENTENCE_END tokens
    print("Reading CSV file...")
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, skipinitialspace=True)
        # Split full comments into sentences; blank lines come back from
        # csv.reader as empty rows and are skipped.
        rawSentences = itertools.chain.from_iterable(
            nltk.sent_tokenize(row[0].lower()) for row in reader if row)
        # Append SENTENCE_START and SENTENCE_END (consumes the reader while
        # the file is still open).
        sentences = ["%s %s %s" % (sentence_start_token, s, sentence_end_token) for s in rawSentences]
    print("Parsed %d sentences." % (len(sentences)))

    # Tokenize the sentences into words
    tokenizedSentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Filter the sentences having few words (including SENTENCE_START and SENTENCE_END)
    tokenizedSentences = [tokens for tokens in tokenizedSentences if len(tokens) > 3]

    # Count the word frequencies
    wordFreq = nltk.FreqDist(itertools.chain.from_iterable(tokenizedSentences))
    print("Found %d unique words tokens." % len(wordFreq))

    # Get the most common words and build index2word and word2index vectors
    vocab = wordFreq.most_common(vocabDim-1)
    index2word = [entry[0] for entry in vocab]
    index2word.append(unknown_token)
    word2index = {w: i for i, w in enumerate(index2word)}

    print("Using vocabulary size %d." % vocabDim)
    if vocab:
        print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

    # Replace all words not in our vocabulary with the unknown token
    tokenizedSentences = [
        [w if w in word2index else unknown_token for w in sent]
        for sent in tokenizedSentences
    ]

    # The example prints are informational only; skip them when the corpus is
    # too small instead of failing with IndexError.
    if len(sentences) > 1:
        print("\nExample sentence: '%s'" % sentences[1])
    if tokenizedSentences:
        print("\nExample sentence after Pre-processing: '%s'\n" % tokenizedSentences[0])

    # Create the training data. Sentences differ in length, so the arrays are
    # created with dtype=object and filled one by one: recent NumPy versions
    # refuse to build an array from ragged nested lists implicitly.
    nSentences = len(tokenizedSentences)
    XTrain = np.empty(nSentences, dtype=object)
    yTrain = np.empty(nSentences, dtype=object)
    for i, sent in enumerate(tokenizedSentences):
        indices = [word2index[w] for w in sent]
        XTrain[i] = indices[:-1]
        yTrain[i] = indices[1:]

    print("XTrain shape: " + str(XTrain.shape))
    print("yTrain shape: " + str(yTrain.shape))

    # Print a training data example (sentence 17, or the last one available)
    exampleIdx = min(17, nSentences - 1)
    if exampleIdx >= 0:
        xExample, yExample = XTrain[exampleIdx], yTrain[exampleIdx]
        print("x:\n%s\n%s" % (" ".join([index2word[x] for x in xExample]), xExample))
        print("\ny:\n%s\n%s" % (" ".join([index2word[x] for x in yExample]), yExample))

    return XTrain, yTrain

## Testing : run

In [24]:
# Vocabulary size (also the width of the one-hot input) and hidden-state width.
wordDim, nHid = 1000, 100
XTrain, yTrain = GetSentenceData('D:/data/csv/reddit-comments.csv', vocabDim=wordDim)

# Seed NumPy's global generator before building the model so that the random
# weight initialisation is the same on every run.
np.random.seed(10)
rnn = RNN(wordDim, nHid=nHid)
# Train on the first 100 sentences only, reporting the loss after every epoch.
losses = rnn.Train(XTrain[:100], yTrain[:100], m=0.005, epochs=5, evalLoss=1)

Reading CSV file...
Parsed 79171 sentences.
Found 65409 unique words tokens.
Using vocabulary size 1000.
The least frequent word in our vocabulary is 'america' and appeared 129 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'UNKNOWN_TOKEN', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'UNKNOWN_TOKEN', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'

XTrain shape: (78483,)
yTrain shape: (78483,)
x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 858, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 858, 54, 25, 34, 69, 1]
2019-11-22 10:21:57: Loss after n examples = 0 epoch=0: 6.906628
2019-11-22 10:22:10: Loss after n examples = 100 epoch=1: 6.807144
2019-11-22 10:22:21: Loss after n examples = 200 epoch