# RNN with Python


Nguồn: https://dominhhai.github.io/vi/2017/10/implement-rnn-with-python/

In [5]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\Thang\AppData\Roaming\nltk_data...
[nltk_data] 

True

In [18]:

vocabulary_size = 8000
unknown_token = "UNKONWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Read csv file...")
with open('./reddit-comments-2015-08.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)
    # split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s"%(sentence_start_token, x, sentence_end_token) for x in sentences]
print("parsed %d sentences"%len(sentences))

# tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found % unique words tokens"% len(word_freq.items()))

# get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i, w in enumerate(index_to_word)])

print("Using vocabulary size %d" % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times"%(vocab[-1][0], vocab[-1][1]))

# replace all words not in our vocabulary with the unkonwn token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print("Example sentence: %s"%sentences[0])
print("Example sentence after pre-processing: %s" % tokenized_sentences[0])

# create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

Read csv file...
parsed 79170 sentences
Found  65499nique words tokens
Using vocabulary size 8000
The least frequent word in our vocabulary is 'documentary' and appeared 10 times
Example sentence: SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END
Example sentence after pre-processing: ['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']


In [21]:
X_train

array([list([0, 6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2]),
       list([0, 11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2]),
       list([0, 988, 1478, 226, 597, 15, 776, 3410, 2957, 4, 7999, 597, 471, 5975, 4, 491, 597, 471, 5976, 2702, 4, 8, 71, 5681, 15, 7999, 7999, 2]),
       ...,
       list([0, 7999, 4, 41, 7999, 4, 13, 63, 9, 152, 757, 7999, 57, 3, 7999, 12, 97, 16, 619, 67, 11, 109, 20, 2]),
       list([0, 38, 144, 3585, 24, 7999, 7999, 7999, 8, 1052, 564, 7999, 7999, 7999, 7999, 2]),
       list([0, 3, 4287, 19, 7999, 18, 174, 12, 232, 74, 101, 1292, 14, 24, 161, 8, 12, 6, 160, 16, 131, 3, 564, 68, 11, 17, 790, 5, 26, 7999, 2])],
      dtype=object)

In [22]:
y_train

array([list([6, 3494, 7, 155, 795, 25, 222, 8, 32, 20, 202, 4954, 350, 91, 6, 66, 207, 5, 2, 1]),
       list([11, 17, 7, 3094, 5974, 7999, 7999, 5974, 2, 1]),
       list([988, 1478, 226, 597, 15, 776, 3410, 2957, 4, 7999, 597, 471, 5975, 4, 491, 597, 471, 5976, 2702, 4, 8, 71, 5681, 15, 7999, 7999, 2, 1]),
       ...,
       list([7999, 4, 41, 7999, 4, 13, 63, 9, 152, 757, 7999, 57, 3, 7999, 12, 97, 16, 619, 67, 11, 109, 20, 2, 1]),
       list([38, 144, 3585, 24, 7999, 7999, 7999, 8, 1052, 564, 7999, 7999, 7999, 7999, 2, 1]),
       list([3, 4287, 19, 7999, 18, 174, 12, 232, 74, 101, 1292, 14, 24, 161, 8, 12, 6, 160, 16, 131, 3, 564, 68, 11, 17, 790, 5, 26, 7999, 2, 1])],
      dtype=object)

In [56]:
def softmax(x):
    xt = np.exp(x - np.max(x))
    return xt / np.sum(xt)


class RNNNumpy:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, hidden_dim))

    def forward_propagation(self, x):
        # The total number of time steps
        T = len(x)
        # during forward propagation we save all hidden statess in s because nedd them later.
        # We add one additional element for the initial hidden, which we set to 0
        s = np.zeros((T+1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        # the outputs at each time step. Again, we save them for later
        o = np.zeros((T, self.word_dim))
        # for each time step
        for t in np.arange(T):
            # note that we are indxing U by x[t]. This is same as multiplying U wih a one-hot vector.
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
    
    def predict(self, x):
        # Perform forward propagation and return index of the highest score
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)
    
    def calculate_total_loss(self, x, y):
        L = 0
        # for each sentence..
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            # We only care about our prediction of the "correct" words
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            # Add to the loss based on how off we were
            L += -1*np.sum(np.log(correct_word_predictions))
        return L
    
    def calculate_loss(self, x, y):
        # divide the total loss by the number of training examples
        N = np.sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x,y)/N
    
    def bptt(self, x, y):
        T = len(y)
        # perform forward propagation
        o, s = self.forward_propagation(x)
        # wr accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        
        delta_o = o
        delta_o[np.arange(len(y)), 1] -=1
        
        # for each ouput backwards...
        for t in np.arange(T)[::-1]:
            dLdV += np.outer(delta_o[t], s[t].T)
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                dLdW += np.outer(delta_t, s[bptt_step-1])              
                dLdU[:,x[bptt_step]] += delta_t
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]
    
    
    def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
        # Calculate the gradients using backpropagation. We want to checker if these are correct.
        bptt_gradients = self.bptt(x, y)
        # List of all parameters we want to check.
        model_parameters = ['U', 'V', 'W']
        # Gradient check for each parameter
        for pidx, pname in enumerate(model_parameters):
            # Get the actual parameter value from the mode, e.g. model.W
            parameter = operator.attrgetter(pname)(self)
            print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
            # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index
                # Save the original value so we can reset it later
                original_value = parameter[ix]
                # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
                parameter[ix] = original_value + h
                gradplus = self.calculate_total_loss([x],[y])
                parameter[ix] = original_value - h
                gradminus = self.calculate_total_loss([x],[y])
                estimated_gradient = (gradplus - gradminus)/(2*h)
                # Reset parameter to original value
                parameter[ix] = original_value
                # The gradient for this parameter calculated using backpropagation
                backprop_gradient = bptt_gradients[pidx][ix]
                # calculate The relative error: (|x - y|/(|x| + |y|))
                relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
                # If the error is to large fail the gradient check
                if relative_error > error_threshold:
                    print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                    print("+h Loss: %f" % gradplus)
                    print("-h Loss: %f" % gradminus)
                    print("Estimated_gradient: %f" % estimated_gradient)
                    print("Backpropagation gradient: %f" % backprop_gradient)
                    print("Relative Error: %f" % relative_error)
                    return
                it.iternext()
            print("Gradient check for parameter %s passed." % (pname))
        
    def numpy_sdg_step(self, x, y, learning_rate):
        # Calculate the gradients
        dLdU, dLdV, dLdW = self.bptt(x, y)
        # Change parameters according to gradients and learning rate
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW
        
        
    # - model: The RNN model instance
    # - X_train: The training data set
    # - y_train: The training data labels
    # - learning_rate: Initial learning rate for SGD
    # - nepoch: Number of times to iterate through the complete dataset
    # - evaluate_loss_after: Evaluate the loss after this many epochs
    def train_with_sgd(self, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if (epoch % evaluate_loss_after == 0):
                loss = self.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f"%(time, num_examples_seen, epoch, loss))
                # Adjust the learning rate if loss increases
                if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                    learning_rate = learning_rate * 0.5
                    print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
            # For each training example...
            for i in range(len(y_train)):
                # One SGD step
                self.numpy_sdg_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1

In [36]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

(45, 8000)
[[0.0001249  0.00012493 0.00012511 ... 0.00012502 0.00012499 0.00012501]
 [0.00012504 0.00012506 0.00012495 ... 0.00012499 0.00012495 0.00012495]
 [0.00012489 0.00012502 0.00012499 ... 0.00012498 0.00012509 0.00012505]
 ...
 [0.00012503 0.00012495 0.00012502 ... 0.00012496 0.00012503 0.0001251 ]
 [0.00012501 0.00012494 0.00012498 ... 0.00012497 0.00012503 0.00012499]
 [0.00012497 0.00012498 0.00012499 ... 0.00012501 0.00012503 0.00012508]]


In [37]:
predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

(45,)
[1284 2048 2594 2133 5068 6601 6559 4860 2212 6601 1581 3106 6892 5898
 5738 1712 6548 6164 4916 5898 1835 5145 5617 4665 6336 4265 7064  779
 1201 1835 3850 4048 2221 5898 4864 2182 1390 5898 3848 6821 4437 1528
 3943 5027 6862]


In [38]:
# Limit to 1000 examples to save time
print("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print("Actual loss:G %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.987197




Actual loss:G 8.987218


In [45]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.numpy_sdg_step(X_train[10], y_train[10], 0.005)

454 ms ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)



2020-02-13 03:50:15: Loss after num_examples_seen=0 epoch=0: 8.987222
2020-02-13 03:50:40: Loss after num_examples_seen=100 epoch=1: 8.986882
2020-02-13 03:51:05: Loss after num_examples_seen=200 epoch=2: 8.983477
2020-02-13 03:51:30: Loss after num_examples_seen=300 epoch=3: 18.122906
Setting learning rate to 0.002500
2020-02-13 03:51:54: Loss after num_examples_seen=400 epoch=4: 21.880644
Setting learning rate to 0.001250


KeyboardInterrupt: 

# 2. Keras

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN, Input

model = Sequential()
model.add(SimpleRNN(800, return_sequences=True))
model.compile('rmsprop','categorical_crossentropy', metrics=['accuracy'])

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [2]:
model.summary()

ValueError: This model has not yet been built. Build the model first by calling build() or calling fit() with some data. Or specify input_shape or batch_input_shape in the first layer for automatic build. 

In [None]:
model.fit(X_train, y_train,batch_size=32,epochs=1000)


In [70]:
X_train.shape

(79170,)

In [83]:
y_train.shape

(79170,)

# RNN Tensorflow

Trong Tensorflow hỗ trợ các cell sau (trong class `tf.nn.rnn_cell`). Cell đề cập đến một đối tượng có đầu ra vô hướng, là bất kỳ thứ gì có trạng thái và thực hiện một số thao tác có ma trận đầu vào:
* **BasicRNNCell**: cell cơ bản nhất của RNN
* **RNNCell**: Đối tượng trừu tượng đại diện cho một cell RNN
* **BasicLSTMCell**: Cell mạng hồi quy LSTM cơ bản (https://arxiv.org/pdf/1409.2329.pdf)
* **LSTMCell**: Cell mạng hồi quy LSTM.
* **GRUCell**: Cell GRU

Đề xây dựng 1 cell, ta có thể làm như sau:

In [1]:
import tensorflow as tf
hidden_size = 50
cell = tf.nn.rnn_cell.GRUCell(hidden_size)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


Xây dựng một stack nhiều cell:

In [2]:
hidden_sizes = [4,5,3]
layers = [tf.nn.rnn_cell.GRUCell(size) for size in hidden_sizes]
celss = tf.nn.rnn_cell.MultiRNNCell(layers)

Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Ngoài ra, ta có:
* **tf.nn.dynamic_rnn**: sử dụng tf.While để xây dựng đồ thị động khi nó thực thi. Đồ thị được tạo nhanh hơn.
* **tf.nn.bidirectional_dynamic_rnn**: dynamic_rnn 2 chiều

Tiếp theo, chúng ta sẽ sử dụng Tensorflow để xây dựng một mạng RNN phân lớp ảnh MNIST. Mỗi ảnh có kích thước 28 * 28px. Vì vậy chúng ta sẽ xử lý 28 chuỗi, mỗi chuổi lại gồm 28 phần tử cho mỗi mẫu đưa vào.