# RNN Architecture - Vanilla RNN

## Setup - Libraries, Packages, Embeddings, Paths

### Libraries 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import urllib.request
import zipfile 
from tqdm import tqdm 

### GloVe Embeddings

In [2]:
embeddings_path = "./Embeddings"
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = os.path.join(embeddings_path, "glove.6B.zip")
# Use an extracted vector file (not the bare directory) as the "already downloaded"
# marker, so an interrupted download/extract is retried on the next run.
glove_expected_file = os.path.join(embeddings_path, "glove.6B.50d.txt")


def download_progress(block_num, block_size, total_size):
    """
    Report hook for urllib.request.urlretrieve that drives a tqdm progress bar.

    Args:
        block_num: number of blocks transferred so far (0 on the initial call).
        block_size: size of one block in bytes.
        total_size: total size in bytes, or a non-positive value when unknown.
    """
    if not hasattr(download_progress, "pbar"):
        download_progress.pbar = tqdm(
            total=total_size if total_size > 0 else None, unit="B", unit_scale=True
        )
    # Advance to the absolute position instead of adding block_size on every call:
    # the hook also fires once before any data arrives, and the final block is
    # usually shorter than block_size.
    downloaded = block_num * block_size
    if total_size > 0:
        downloaded = min(downloaded, total_size)
    download_progress.pbar.update(downloaded - download_progress.pbar.n)


if not os.path.exists(glove_expected_file):
    print("create directory to store pre-trained glove embeddings")
    os.makedirs(embeddings_path, exist_ok=True)
    print("download pre-trained Glove Embeddings")
    try:
        urllib.request.urlretrieve(glove_url, glove_zip_path, download_progress)
    finally:
        # Close and drop the bar so re-running the cell starts with a fresh one.
        progress_bar = download_progress.__dict__.pop("pbar", None)
        if progress_bar is not None:
            progress_bar.close()
    print("unpack embeddings")
    with zipfile.ZipFile(glove_zip_path, "r") as zip_ref:
        zip_ref.extractall(embeddings_path)
    os.remove(glove_zip_path)

    print("embeddings download complete")

### Paths 

In [3]:
# Pre-trained GloVe vectors (the 50d file from the glove.6B archive).
glove_6b_50_path = "./Embeddings/glove.6B.50d.txt"
# Paragraph-level CSVs loaded with pd.read_csv below.
train_data_path = "./Datasets/model_data/train_data.csv"
test_data_path = "./Datasets/model_data/test_data.csv"
# Directories of per-decade sub-folders containing .txt files, consumed by
# HistoricalTextTokenizer.process_files.
# NOTE(review): only the train path has a trailing slash; harmless because the
# tokenizer builds child paths with os.path.join, but worth making consistent.
clean_train_split_path = "./Datasets/clean_train_split/"
clean_test_split_path = "./Datasets/clean_test_split"

## Data

### Train Data

In [4]:
# Load the training paragraphs from CSV and preview the first 10 rows.
train_df = pd.read_csv(train_data_path)
train_df.head(10)

Unnamed: 0,text,decade,decade_label,book_title,book_id,paragraph_id,word_count
0,Produced by Gary R. Young THE SCHOOL FOR SCAND...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_000,210
1,"the works of Sheridan as he wrote them, I may ...",1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_001,210
2,he had been nineteen years endeavouring to sat...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_002,210
3,"That even you assist her fame to raise, Approv...",1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_003,210
4,and face-- Poets would study the immortal line...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_004,210
5,who the peril of her lips shall paint? Strip t...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_005,210
6,"might well be thought Prerogative in her, and ...",1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_006,210
7,th' acknowledged praise Has spread conviction ...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_007,210
8,LAST NIGHT LORD L. [Sips] WAS CAUGHT WITH LADY...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_008,210
9,he would through-- He'll fight--that's write--...,1770,0,The School for Scandal,1770_The_School_for_Scand,1770_1770_The_School_for_Scand_009,210


In [5]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84860 entries, 0 to 84859
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          84860 non-null  object
 1   decade        84860 non-null  int64 
 2   decade_label  84860 non-null  int64 
 3   book_title    84860 non-null  object
 4   book_id       84860 non-null  object
 5   paragraph_id  84860 non-null  object
 6   word_count    84860 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 4.5+ MB


### Test Data

In [6]:
# Load the test paragraphs from CSV and preview the first 10 rows.
test_df = pd.read_csv(test_data_path)
test_df.head(10)

Unnamed: 0,text,decade,decade_label,book_title,book_id,paragraph_id,word_count
0,An Inquiry into the Nature and Causes of the W...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__000,210
1,THE EXPENSE OF MAINTAINING THE NATIONAL CAPITA...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__001,210
2,"PRODUCE OF LAND, AS EITHER THE SOLE OR THE PRI...",1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__002,210
3,"Whatever be the soil, climate, or extent of te...",1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__003,210
4,of those who work; yet the produce of the whol...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__004,210
5,"of capital stock, of the manner in which it is...",1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__005,210
6,which some magnify the importance of that indu...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__006,210
7,"the expenses incumbent on the whole society, a...",1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__007,210
8,trifling manufactures which are destined to su...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__008,210
9,machinery employed in it (to the invention of ...,1770,0,An Inquiry into the Nature and Causes of the W...,1770_An_Inquiry_into_the_,1770_1770_An_Inquiry_into_the__009,210


In [7]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25538 entries, 0 to 25537
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          25538 non-null  object
 1   decade        25538 non-null  int64 
 2   decade_label  25538 non-null  int64 
 3   book_title    25538 non-null  object
 4   book_id       25538 non-null  object
 5   paragraph_id  25538 non-null  object
 6   word_count    25538 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 1.4+ MB


## Tokenization

### NLTK Tokenizer
This code is adapted from the tokenizers in Professor Johan Boye's DD2417 assignments.

In [8]:
import nltk 
nltk.download("punkt_tab")
import numpy as np 
from collections import defaultdict

class HistoricalTextTokenizer:
    """
    Word-level tokenizer and vocabulary builder for decade-labelled text files.

    All of this code is adapted from Professor Johan Boye's DD2417 assignment tokenizers

    Attributes:
        word2id: maps a lower-cased token to its integer id.
        id2word: inverse of ``word2id``.
        latest_new_word: highest id handed out so far (-1 before any word is added).
        tokens_processed: running count of tokens seen while building the vocabulary.
        tokens: NLTK tokens of the most recent file processed with ``update_vocab=True``.
        UNKNOWN / PADDING_WORD: special tokens; ``<pad>`` gets id 0 and ``<unk>`` id 1.
    """

    def __init__(self):
        # defaultdicts: *indexing* a missing key returns None and inserts it, so
        # lookups inside this class use `in` / `.get()` to avoid growing the vocab.
        self.word2id = defaultdict(lambda: None)
        self.id2word = defaultdict(lambda: None)
        self.latest_new_word = -1
        self.tokens_processed = 0
        self.tokens = []

        self.UNKNOWN = '<unk>'
        self.PADDING_WORD = '<pad>'

        # Register the special tokens first so their ids are fixed.
        self.get_word_id(self.PADDING_WORD)
        self.get_word_id(self.UNKNOWN)

    @staticmethod
    def _word_tokenize(text):
        """Run nltk.word_tokenize, fetching the tokenizer models once if they are missing."""
        try:
            return nltk.word_tokenize(text)
        except LookupError:
            # Recent NLTK releases look for "punkt_tab" (the resource this notebook
            # downloads up front); older ones use "punkt". Request both so the
            # retry works on either.
            nltk.download("punkt_tab")
            nltk.download("punkt")
            return nltk.word_tokenize(text)

    def get_word_id(self, word):
        """Return the id of ``word`` (case-insensitive), adding it to the vocabulary if new."""
        word = word.lower()
        if word in self.word2id:
            return self.word2id[word]
        self.latest_new_word += 1
        self.id2word[self.latest_new_word] = word
        self.word2id[word] = self.latest_new_word
        return self.latest_new_word

    def process_files(self, file_or_dir, update_vocab=True):
        """
        Split every text file into paragraphs and attach decade labels.

        Args:
            file_or_dir: a directory whose sub-directories are named by decade
                (e.g. ``1770``) and contain ``.txt`` files, or a single file path
                (labelled 0).
            update_vocab: when True (default) every token is added to the
                vocabulary. Pass False for held-out data so that unseen words
                stay out of the vocabulary and map to ``<unk>`` at encoding time.

        Returns:
            (paragraphs, labels): two lists of equal length.
        """
        all_texts = []
        all_labels = []

        if os.path.isdir(file_or_dir):
            decade_dirs = sorted(
                d for d in os.listdir(file_or_dir)
                if os.path.isdir(os.path.join(file_or_dir, d))
            )
            for decade_dir in decade_dirs:
                try:
                    decade = int(decade_dir)
                except ValueError:
                    # e.g. ".ipynb_checkpoints" created by Jupyter
                    print(f"skipping non-decade directory: {decade_dir}")
                    continue
                decade_path = os.path.join(file_or_dir, decade_dir)
                print(f"Processing decade: {decade}")
                text_files = sorted(f for f in os.listdir(decade_path) if f.endswith(".txt"))
                print(f"number of files in {decade} directory: {len(text_files)}")

                for file in text_files:
                    filepath = os.path.join(decade_path, file)
                    print(f"tokenize file {file}")
                    texts, labels = self.process_file(filepath, decade, update_vocab)
                    all_texts.extend(texts)
                    all_labels.extend(labels)
        else:
            texts, labels = self.process_file(file_or_dir, 0, update_vocab)
            all_texts.extend(texts)
            all_labels.extend(labels)

        return all_texts, all_labels

    def process_file(self, filepath, decade, update_vocab=True):
        """
        Read one file, optionally add its tokens to the vocabulary, and chunk it.

        Args:
            filepath: path of a UTF-8 text file (undecodable bytes are ignored).
            decade: label assigned to every paragraph of this file.
            update_vocab: see ``process_files``; when False the file is not
                tokenized and ``tokens`` / ``tokens_processed`` are left unchanged.

        Returns:
            (paragraphs, labels): paragraphs from ``create_paragraphs`` and a
            same-length list filled with ``decade``.
        """
        print(filepath)
        with open(filepath, mode="r", encoding="utf-8", errors="ignore") as stream:
            text = stream.read()

        if update_vocab:
            self.tokens = self._word_tokenize(text)
            for token in self.tokens:
                self.tokens_processed += 1
                self.get_word_id(token)

                if self.tokens_processed % 10000 == 0:
                    print("Processed", "{:,}".format(self.tokens_processed), "tokens")

        paragraphs = self.create_paragraphs(text)
        labels = [decade] * len(paragraphs)

        return paragraphs, labels

    def create_paragraphs(self, text, min_words=10, max_words=210):
        """
        Cut ``text`` into consecutive chunks of at most ``max_words`` whitespace-separated words.

        Chunks shorter than ``min_words`` (in practice only the trailing one) are dropped.

        Raises:
            ValueError: if ``max_words`` is smaller than 1.
        """
        if max_words < 1:
            raise ValueError("max_words must be at least 1")

        words = text.split()
        paragraphs = []
        for start in range(0, len(words), max_words):
            chunk = words[start:start + max_words]
            if len(chunk) >= min_words:
                paragraphs.append(" ".join(chunk))
        return paragraphs

    def tokenize_text_to_id(self, text):
        """
        Encode ``text`` as a list of vocabulary ids; out-of-vocabulary tokens map to ``<unk>``.

        The text is tokenized in its original case and each token is lower-cased
        afterwards, mirroring how the vocabulary is built in ``process_file``.
        """
        unknown_id = self.word2id[self.UNKNOWN]
        return [
            self.word2id.get(token.lower(), unknown_id)
            for token in self._word_tokenize(text)
        ]

    def pad_sequence_to_length(self, word_ids, max_length=220):
        """
        Return a new list of exactly ``max_length`` ids: truncated, or right-padded with the ``<pad>`` id.

        The input sequence is never modified.
        """
        padding_id = self.word2id[self.PADDING_WORD]
        padded = list(word_ids[:max_length])
        padded.extend([padding_id] * (max_length - len(padded)))
        return padded

    def get_vocab_size(self):
        """Number of entries in the vocabulary, including ``<pad>`` and ``<unk>``."""
        return len(self.word2id)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/pranavrajan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Tokenize Data 

In [9]:
# A single tokenizer instance is used for both the train and test passes below,
# so its vocabulary and token counter accumulate across both.
text_tokenizer = HistoricalTextTokenizer()

#### Training Data

In [10]:
# Build the vocabulary from the training files and split them into labelled paragraphs.
train_text_data, train_labels = text_tokenizer.process_files(clean_train_split_path)

Processing decade: 1770
number of files in 1770 directory: 9
tokenize file book_03.txt
./Datasets/clean_train_split/1770/book_03.txt
Processed 10,000 tokens
Processed 20,000 tokens
Processed 30,000 tokens
tokenize file book_04.txt
./Datasets/clean_train_split/1770/book_04.txt
Processed 40,000 tokens
Processed 50,000 tokens
Processed 60,000 tokens
Processed 70,000 tokens
Processed 80,000 tokens
Processed 90,000 tokens
Processed 100,000 tokens
Processed 110,000 tokens
Processed 120,000 tokens
Processed 130,000 tokens
Processed 140,000 tokens
Processed 150,000 tokens
Processed 160,000 tokens
Processed 170,000 tokens
Processed 180,000 tokens
Processed 190,000 tokens
Processed 200,000 tokens
Processed 210,000 tokens
tokenize file book_05.txt
./Datasets/clean_train_split/1770/book_05.txt
Processed 220,000 tokens
Processed 230,000 tokens
Processed 240,000 tokens
Processed 250,000 tokens
Processed 260,000 tokens
Processed 270,000 tokens
Processed 280,000 tokens
Processed 290,000 tokens
Process

#### Testing Data

In [11]:
# Split the test files into labelled paragraphs.
# NOTE(review): process_files registers every token it reads in the shared
# tokenizer, so words that occur only in the test split also enter the
# vocabulary here — confirm this is intended.
test_text_data, test_labels = text_tokenizer.process_files(clean_test_split_path)

Processing decade: 1770
number of files in 1770 directory: 3
tokenize file book_01.txt
./Datasets/clean_test_split/1770/book_01.txt
Processed 21,430,000 tokens
Processed 21,440,000 tokens
Processed 21,450,000 tokens
Processed 21,460,000 tokens
Processed 21,470,000 tokens
Processed 21,480,000 tokens
Processed 21,490,000 tokens
Processed 21,500,000 tokens
Processed 21,510,000 tokens
Processed 21,520,000 tokens
Processed 21,530,000 tokens
Processed 21,540,000 tokens
Processed 21,550,000 tokens
Processed 21,560,000 tokens
Processed 21,570,000 tokens
Processed 21,580,000 tokens
Processed 21,590,000 tokens
Processed 21,600,000 tokens
Processed 21,610,000 tokens
Processed 21,620,000 tokens
Processed 21,630,000 tokens
Processed 21,640,000 tokens
Processed 21,650,000 tokens
Processed 21,660,000 tokens
Processed 21,670,000 tokens
Processed 21,680,000 tokens
Processed 21,690,000 tokens
Processed 21,700,000 tokens
Processed 21,710,000 tokens
Processed 21,720,000 tokens
Processed 21,730,000 tokens


### Create Labels

In [12]:
# Every decade seen in either split, in ascending order, mapped to a 0-based class index.
labels = sorted(set(train_labels) | set(test_labels))
decade2label = dict(zip(labels, range(len(labels))))
print(f"{decade2label}")

{1770: 0, 1780: 1, 1790: 2, 1800: 3, 1810: 4, 1820: 5, 1830: 6, 1840: 7, 1850: 8, 1860: 9, 1870: 10, 1880: 11, 1890: 12}


### Check Tokenizer + Labels

In [13]:
# Sanity check: each split should have exactly one label per paragraph.
print(f"number of train labels -> {len(train_labels)}")
print(f"length of train text(paragraphs) -> {len(train_text_data)}")
print()

print(f"number of test labels -> {len(test_labels)}")
print(f"length of test text -> {len(test_text_data)}")
print()

# Show the first paragraph and its decade label from each split.
print(f"train text {train_text_data[0]}")
print(f"train label {train_labels[0]}")
print()

print(f"test text(paragraphs) {test_text_data[0]}")
print(f"test label {test_labels[0]}")

number of train labels -> 84860
length of train text(paragraphs) -> 84860

number of test labels -> 25538
length of test text -> 25538

train text Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes 

In [14]:
# Encode the first training paragraph with the fitted tokenizer and inspect the ids.
train_sample = train_text_data[0]
train_sample_label = train_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(train_sample)

print(f"train sample -> {train_sample}")
print(f"train sample label -> {train_sample_label}")
print(f"tokenized train_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

train sample -> Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes (xxxvii), Mr. Rae writes: "The manuscript of it [THE SCHOOL FOR SCANDAL] in Sheridan's own handwriting is preserved at Frampton Cou

In [15]:
# Encode the first test paragraph with the same tokenizer and inspect the ids.
# Note: this overwrites the `word_ids` produced for the training sample above.
test_sample = test_text_data[0]
test_sample_label = test_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(test_sample)

print(f"test sample -> {test_sample}")
print(f"test sample label -> {test_sample_label}")
print(f"tokenized test_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

test sample -> An Inquiry into the Nature and Causes of the Wealth of Nations by Adam Smith Contents INTRODUCTION AND PLAN OF THE WORK. BOOK I. OF THE CAUSES OF IMPROVEMENT IN THE PRODUCTIVE POWERS OF LABOUR, AND OF THE ORDER ACCORDING TO WHICH ITS PRODUCE IS NATURALLY DISTRIBUTED AMONG THE DIFFERENT RANKS OF THE PEOPLE. CHAPTER I. OF THE DIVISION OF LABOUR. CHAPTER II. OF THE PRINCIPLE WHICH GIVES OCCASION TO THE DIVISION OF LABOUR. CHAPTER III. THAT THE DIVISION OF LABOUR IS LIMITED BY THE EXTENT OF THE MARKET. CHAPTER IV. OF THE ORIGIN AND USE OF MONEY. CHAPTER V. OF THE REAL AND NOMINAL PRICE OF COMMODITIES, OR OF THEIR PRICE IN LABOUR, AND THEIR PRICE IN MONEY. CHAPTER VI. OF THE COMPONENT PART OF THE PRICE OF COMMODITIES. CHAPTER VII. OF THE NATURAL AND MARKET PRICE OF COMMODITIES. CHAPTER VIII. OF THE WAGES OF LABOUR. CHAPTER IX. OF THE PROFITS OF STOCK. CHAPTER X. OF WAGES AND PROFIT IN THE DIFFERENT EMPLOYMENTS OF LABOUR AND STOCK. CHAPTER XI. OF THE RENT OF LAND. BOOK II. OF 