# RECURRENT NEURAL NETWORKS
A notebook about RNN

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Tensorflow is required
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Common imports
import re
from collections import defaultdict
import copy
import pandas as pd
import numpy as np
import os
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Where to save the figures
NOTE_ROOT_DIR = os.path.abspath('')
DATA_DIR = os.path.join(NOTE_ROOT_DIR, "data", "20news-bydate")
CHAPTER_ID = "01_rnn"
IMAGES_PATH = os.path.join(NOTE_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## 1 - Load the Data

### 1.1 - Generate the Dictionary

In [18]:
# Setup to build dictionary
def gen_data_and_vocab():
    def collect_data_from(parent_path, newsgroup_list, word_count=None):
        data = []
        for group_id, newsgroup in enumerate(newsgroup_list):
            dir_path = os.path.join(parent_path, newsgroup)
            files = [(filename, os.path.join(dir_path, filename)) for filename in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, filename))]
            files.sort()
            label = group_id
            print("Processing: {}-{}".format(group_id, newsgroup))
            
            for filename, filepath in files:
                with open(filepath) as f:
                    text = f.read().lower()
                    words = re.split("\W+", text)
                    if word_count is not None:
                        for word in words:
                            word_count[word] += 1
                    content = " ".join(words)
                    assert len(content.splitlines()) == 1
                    data.append(str(label) + "<fff>" + filename + "<fff>" + content)
        return data
    
    word_count = defaultdict(int)
    parts = [os.path.join(DATA_DIR, dir_name) for dir_name in os.listdir(DATA_DIR) if not os.path.isfile(os.path.join(DATA_DIR, dir_name))]
    
    train_path, test_path = (parts[0], parts[1]) if "train" in parts[0] else (parts[1], parts[0])
    
    newsgroup_list = [newsgroup for newsgroup in os.listdir(train_path)]
    newsgroup_list.sort()
    
    train_data = collect_data_from(
        parent_path=train_path,
        newsgroup_list=newsgroup_list,
        word_count=word_count
    )
    vocab = [word for word, freq in zip(word_count.keys(), word_count.values()) if freq > 10]
    vocab.sort()
    with open(os.path.join(DATA_DIR, "w2v", "vocab-raw.txt"), "w") as f:
        f.write("\n".join(vocab))
    
    test_data = collect_data_from(
        parent_path=test_path,
        newsgroup_list=newsgroup_list
    )
    with open(os.path.join(DATA_DIR, "w2v", "20news_train_raw.txt"), "w") as f:
        f.write("\n".join(train_data))
    with open(os.path.join(DATA_DIR, "w2v", "20news_test_raw.txt"), "w") as f:
        f.write("\n".join(test_data)) 

In [19]:
# Build a dictionary for the data
gen_data_and_vocab()

Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing: 12-sci.electronics
Processing: 13-sci.med
Processing: 14-sci.space
Processing: 15-soc.religion.christian
Processing: 16-talk.politics.guns
Processing: 17-talk.politics.mideast
Processing: 18-talk.politics.misc
Processing: 19-talk.religion.misc
Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing

### 1.2 - Encoding Data

In [20]:
# Setup to encode the data
unknown_ID = 0
padding_ID = 1
MAX_SENTENCE_LENGTH = 500

train_data_path = os.path.join(DATA_DIR, "w2v", "20news_train_raw.txt")
test_data_path = os.path.join(DATA_DIR, "w2v", "20news_test_raw.txt")
vocab_path = os.path.join(DATA_DIR, "w2v", "vocab-raw.txt")

def encode_data(data_path, vocab_path):
    with open(vocab_path) as f:
        vocab = dict([(word, word_ID + 2)
                      for word_ID, word in enumerate(f.read().splitlines())])
    with open(data_path) as f:
        documents = [(line.split('<fff>')[0], line.split('<fff>')[1], line.split('<fff>')[2])
                     for line in f.read().splitlines()]

    encoded_data = []
    for document in documents:
        label, doc_id, text = document
        words = text.split()[:MAX_SENTENCE_LENGTH]
        sentence_length = len(words)
        encoded_text = []
        for word in words:
            if word in vocab:
                encoded_text.append(str(vocab[word]))
            else:
                encoded_text.append(str(unknown_ID))

        if len(words) < MAX_SENTENCE_LENGTH:
            num_padding = MAX_SENTENCE_LENGTH - len(words)
            for _ in range(num_padding):
                encoded_text.append(str(padding_ID))

        encoded_data.append(str(label) + '<fff>' + str(doc_id) + '<fff>'
                            + str(sentence_length) + '<fff>' + ' '.join(encoded_text))

    dir_name = '\\'.join(data_path.split('\\')[:-1])
    file_name = '-'.join(data_path.split('\\')[-1].split('-')[:-1]) + '-encoded.txt'
    with open(dir_name + '\\' + file_name, 'w') as f:
        f.write('\n'.join(encoded_data))

In [21]:
# Encoding the data
encode_data(train_data_path, vocab_path)
encode_data(test_data_path, vocab_path)

## 2 - RNN

In [3]:
# Setup RNN training
from models import DataReader, RNN
loss_report = []
accuracy_report = []

def train_and_evaluate_RNN():
    with open(os.path.join(DATA_DIR, "w2v", "vocab-raw.txt")) as f:
      vocab_size = len(f.read().splitlines())

    tf.set_random_seed(2021)
    rnn = RNN(
      vocab_size=vocab_size,
      embedding_size=300,
      lstm_size=50,
      batch_size=50
    )
    predicted_labels, loss = rnn.build_graph()
    train_op = rnn.trainer(loss=loss, learning_rate=0.01)

    with tf.Session() as sess:
        train_data_reader = DataReader(
            data_path=os.path.join(DATA_DIR, "w2v", "20news_train_encoded.txt"),
            batch_size=50,
        )
        test_data_reader = DataReader(
            data_path=os.path.join(DATA_DIR, "w2v", "20news_train_encoded.txt"),
            batch_size=50,
        )
        step = 0
        MAX_STEP = 3000

        sess.run(tf.global_variables_initializer())
        while step < MAX_STEP:
            next_train_batch = train_data_reader.next_batch()
            train_data, train_labels, train_sentence_lengths, train_final_tokens = next_train_batch
            plabels_eval, loss_eval, _ = sess.run(
                [predicted_labels, loss, train_op],
                feed_dict={
                    rnn._data: train_data,
                    rnn._labels: train_labels,
                    rnn._sentence_lengths: train_sentence_lengths,
                    rnn._final_tokens: train_final_tokens
                }
            )
            step += 1
            if step % 20 == 0:
              loss_report.append(loss_eval)
              print('loss: {}'.format(loss_eval))
            if train_data_reader._batch_id == 0:
              num_true_preds = 0
              while True:
                next_test_batch = test_data_reader.next_batch()
                test_data, test_labels, test_sentence_lengths, test_final_tokens = next_test_batch
                test_plabels_eval = sess.run(
                    predicted_labels,
                    feed_dict={
                        rnn._data: test_data,
                        rnn._labels: test_labels,
                        rnn._sentence_lengths: test_sentence_lengths,
                        rnn._final_tokens: test_final_tokens
                    }
                )
                matches = np.equal(test_plabels_eval, test_labels)
                num_true_preds += np.sum(matches.astype(float))

                if test_data_reader._batch_id == 0:
                  break

              accuracy_report.append(num_true_preds * 100. / test_data_reader._size)        
              print('Epoch: {}'.format(train_data_reader._num_epoch))
              print('Accuracy on test data: {}'.format(num_true_preds * 100. / test_data_reader._size))


In [None]:
# Train RNN
train_and_evaluate_RNN()

Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

loss: 0.5393661260604858
loss: 5.971559047698975
loss: 2.3458948135375977
loss: 4.631789684295654
loss: 2.9802987575531006
loss: 3.5584280490875244
loss: 4.6480231285095215
Epoch: 1
Accuracy on test data: 8.346666666666666
loss: 3.0219240188598633
loss: 2.3509011268615723
loss: 2.212888717651367
loss: 2.0006773471832275
loss: 1.8756399154663086
loss: 1.7114988565444946
loss: 1.641830325126648
loss: 1.1005499362945557
Epoch: 2
Accuracy on test data: 90.48
