# RNN Architecture - Vanilla RNN

## Setup - Libraries, Packages, Embeddings, Paths

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import urllib.request
import zipfile 
from tqdm import tqdm 

In [2]:
embeddings_path = "./Embeddings"
def download_progress(block_num, block_size, total_size):
    if not hasattr(download_progress, "pbar"):
        download_progress.pbar = tqdm(total=total_size, unit="B", unit_scale=True)
    download_progress.pbar.update(block_size)

if not os.path.exists(embeddings_path):
    print(f"create directory to store pre-trained glove embeddings")
    os.makedirs(embeddings_path)
    print(f"download pre-trained Glove Embeddings")
    urllib.request.urlretrieve(
        "http://nlp.stanford.edu/data/glove.6B.zip",
        "./Embeddings/glove.6B.zip",
        download_progress,
    )
    print("unpack embeddings")
    with zipfile.ZipFile("./Embeddings/glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall("./Embeddings/")
    os.remove("./Embeddings/glove.6B.zip")
    
    print("embeddings download complete")

In [3]:
glove_6b_50_path = "./Embeddings/glove.6B.50d.txt"
glove_6b_100_path = "./Embeddings/glove.6B.100d.txt"
glove_6b_200_path = "./Embeddings/glove.6B.200d.txt"
glove_6b_300_path = "./Embeddings/glove.6B.300d.txt"
clean_train_split_path = "./Datasets/clean_train_split/"
clean_test_split_path = "./Datasets/clean_test_split"

In [4]:
import nltk 
nltk.download("punkt_tab")
import numpy as np 
from collections import defaultdict

class HistoricalTextTokenizer:
    """
    All of this code is adapted from Professor Johan Boye's DD2417 assignment tokenizers 
    """
    def __init__(self):
        self.word2id = defaultdict(lambda: None)
        self.id2word = defaultdict(lambda: None)
        self.latest_new_word = -1 
        self.tokens_processed = 0 

        self.UNKNOWN = '<unk>'
        self.PADDING_WORD = '<pad>'

        self.get_word_id(self.PADDING_WORD)
        self.get_word_id(self.UNKNOWN)

    def get_word_id(self, word):
        word = word.lower()
        if word in self.word2id:
            return self.word2id[word]
        else:
            self.latest_new_word += 1
            self.id2word[self.latest_new_word] = word
            self.word2id[word] = self.latest_new_word
            return self.latest_new_word

    def process_files(self, file_or_dir):
        all_texts = []
        all_labels = []

        if os.path.isdir(file_or_dir):
            decade_dirs = sorted([d for d in os.listdir(file_or_dir) if os.path.isdir(os.path.join(file_or_dir, d))])
            for decade_dir in decade_dirs:
                decade_path = os.path.join(file_or_dir, decade_dir)
                decade = int(decade_dir)
                print(f"Processing decade: {decade}")
                text_files = sorted([f for f in os.listdir(decade_path) if f.endswith(".txt")])
                # print(f"number of files in {decade} directory: {len(text_files)}")

                for file in text_files:
                    filepath = os.path.join(decade_path, file)
                    # print(f"tokenize file {file}")
                    text, labels = self.process_file(filepath, decade)
                    all_texts.extend(text)
                    all_labels.extend(labels)
        else:
            texts, labels = self.process_file(file_or_dir, 0)
            all_texts.extend(texts)
            all_labels.extend(labels)

        return all_texts, all_labels

    def process_file(self, filepath, decade):
        # print(filepath)
        stream = open(filepath, mode="r", encoding="utf-8", errors="ignore")
        text = stream.read()
        stream.close()

        try:
            self.tokens = nltk.word_tokenize(text)
        except LookupError:
            nltk.download("punkt")
            self.tokens = nltk.word_tokenize(text)

        for i, token in enumerate(self.tokens):
            self.tokens_processed += 1
            word_id = self.get_word_id(token)

            if self.tokens_processed % 1000000000 == 0:
                print("Processed", "{:,}".format(self.tokens_processed), "tokens")

        paragraphs = self.create_paragraphs(text)
        labels = [decade] * len(paragraphs)

        return paragraphs, labels

    def create_paragraphs(self, text, min_words=10, max_words=210):
        words = text.split()
        paragraphs = []
        start = 0

        while start < len(words):
            end = min(start + max_words, len(words))
            paragraph_words = words[start:end]
            if len(paragraph_words) >= min_words:
                paragraph_text = " ".join(paragraph_words)
                paragraphs.append(paragraph_text)
            start = end

        return paragraphs 

    def tokenize_text_to_id(self, text):
        try:
            tokens = nltk.word_tokenize(text.lower())
        except LookupError:
            nltk.download("punkt")
            tokens = nltk.word_tokenize(text.lower())
        word_ids = []
        for token in tokens:
            if token in self.word2id:
                word_ids.append(self.word2id[token])
            else:
                word_ids.append(self.word2id[self.UNKNOWN])
        return word_ids

    def get_vocab_size(self):
        return len(self.word2id)

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
text_tokenizer = HistoricalTextTokenizer()

In [6]:
train_text_data, train_labels = text_tokenizer.process_files(clean_train_split_path)

Processing decade: 1770
Processing decade: 1810
Processing decade: 1850
Processing decade: 1890


In [7]:
test_text_data, test_labels = text_tokenizer.process_files(clean_test_split_path)

Processing decade: 1770
Processing decade: 1810
Processing decade: 1850
Processing decade: 1890


In [8]:
labels = sorted(set(train_labels + test_labels))
decade_to_label = {decade: i for i, decade in enumerate(labels)}
print(f"{decade_to_label}") 

{1770: 0, 1810: 1, 1850: 2, 1890: 3}


In [9]:
print(f"number of train labels -> {len(train_labels)}")
print(f"length of train text(paragraphs) -> {len(train_text_data)}")
print()

print(f"number of test labels -> {len(test_labels)}")
print(f"length of test text -> {len(test_text_data)}")
print()

print(f"train text {train_text_data[0]}")
print(f"train label {train_labels[0]}")
print()

print(f"test text(paragraphs) {test_text_data[0]}")
print(f"test label {test_labels[0]}")

number of train labels -> 28336
length of train text(paragraphs) -> 28336

number of test labels -> 7886
length of test text -> 7886

train text Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes (x

In [10]:
train_sample = train_text_data[0]
train_sample_label = train_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(train_sample)

print(f"train sample -> {train_sample}")
print(f"train sample labe -> {train_sample_label}")
print(f"tokenized train_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

train sample -> Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes (xxxvii), Mr. Rae writes: "The manuscript of it [THE SCHOOL FOR SCANDAL] in Sheridan's own handwriting is preserved at Frampton Cou

In [11]:
test_sample = test_text_data[0]
test_sample_label = test_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(test_sample)

print(f"test sample -> {test_sample}")
print(f"test sample label -> {test_sample_label}")
print(f"tokenized test_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

test sample -> An Inquiry into the Nature and Causes of the Wealth of Nations by Adam Smith Contents INTRODUCTION AND PLAN OF THE WORK. BOOK I. OF THE CAUSES OF IMPROVEMENT IN THE PRODUCTIVE POWERS OF LABOUR, AND OF THE ORDER ACCORDING TO WHICH ITS PRODUCE IS NATURALLY DISTRIBUTED AMONG THE DIFFERENT RANKS OF THE PEOPLE. CHAPTER I. OF THE DIVISION OF LABOUR. CHAPTER II. OF THE PRINCIPLE WHICH GIVES OCCASION TO THE DIVISION OF LABOUR. CHAPTER III. THAT THE DIVISION OF LABOUR IS LIMITED BY THE EXTENT OF THE MARKET. CHAPTER IV. OF THE ORIGIN AND USE OF MONEY. CHAPTER V. OF THE REAL AND NOMINAL PRICE OF COMMODITIES, OR OF THEIR PRICE IN LABOUR, AND THEIR PRICE IN MONEY. CHAPTER VI. OF THE COMPONENT PART OF THE PRICE OF COMMODITIES. CHAPTER VII. OF THE NATURAL AND MARKET PRICE OF COMMODITIES. CHAPTER VIII. OF THE WAGES OF LABOUR. CHAPTER IX. OF THE PROFITS OF STOCK. CHAPTER X. OF WAGES AND PROFIT IN THE DIFFERENT EMPLOYMENTS OF LABOUR AND STOCK. CHAPTER XI. OF THE RENT OF LAND. BOOK II. OF 

In [12]:
from collections import Counter, defaultdict
import random 
def balance_paragraphs(train_data, train_labels, max_paragraphs_per_book=25, decade_paragraphs=600):
    original_paragraph_count = Counter(train_labels)
    max_paragraphs_per_book = float('inf')
    for decade, count in sorted(original_paragraph_count.items()):
        print(f"{decade}: {count} paragraphs")
        max_paragraphs_per_book = min(max_paragraphs_per_book, count)

    # Group by decade 
    decade_data = defaultdict(list)
    for text, label in zip(train_data, train_labels):
        decade_data[label].append(text)

    new_paragraphs = []
    new_labels = []
    for decade, texts in decade_data.items():
        book_size = 50 
        new_books = [texts[i:i+book_size] for i in range(0, len(texts), book_size)]
        decade_books = []
        for book in new_books:
            if len(book) > max_paragraphs_per_book:
                sample_paragraphs = random.sample(book, max_paragraphs_per_book)
            else:
                sample_paragraphs = book 
            decade_books.extend(sample_paragraphs)
                
        new_paragraphs.extend(decade_books)  
        new_labels.extend([decade] * len(decade_books))

    # Balance decades 
    new_decade_data = defaultdict(list)
    for text, label in zip(new_paragraphs, new_labels):
        new_decade_data[label].append(text)

    min_paragraph_size = min(len(text) for text in new_decade_data.values())
    new_decade_paragraphs = min(decade_paragraphs, min_paragraph_size)

    balance_paragraphs = []
    balance_labels = []

    for decade in sorted(new_decade_data.keys()):      
        text = new_decade_data[decade]                  
        if len(text) >= new_decade_paragraphs:          
            sample_paragraphs = random.sample(text, new_decade_paragraphs) 
        else:
            sample_paragraphs = random.choices(text, k=new_decade_paragraphs)
    
        balance_paragraphs.extend(sample_paragraphs)    
        balance_labels.extend([decade] * len(sample_paragraphs))  

    return balance_paragraphs, balance_labels

In [13]:
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [14]:
print(f"create new tokenizer")
tokenizer = HistoricalTextTokenizer()

print(f"tokenize -> {clean_train_split_path}")
original_train_data, original_train_labels = tokenizer.process_files(clean_train_split_path)
print(f"succesfully tokenized <- {clean_train_split_path}")
print(f"balance training data -> {len(original_train_data)} and balance labels: {len(original_train_labels)}")
balance_train_data, balance_train_labels = balance_paragraphs(original_train_data, original_train_labels)
print(f"succesfully balanced train data and labels <-")

# create train/validation splits 
from sklearn.model_selection import train_test_split
model_train_data, model_valid_data, model_train_labels, model_valid_labels = train_test_split(
    balance_train_data, balance_train_labels, 
    test_size=0.2, random_state=42, stratify=balance_train_labels
)

print(f"tokenize -> {clean_test_split_path}")
test_data, test_labels = tokenizer.process_files(clean_test_split_path)
print(f"succesfully tokenized <- {clean_test_split_path}")

print(f"create decade labels")
decades = sorted(set(model_train_labels + test_labels))
decade_to_label = {decade: i for i, decade in enumerate(decades)}
label_to_decade = {i: decade for i, decade in enumerate(decades)}
print(f"successfully created decades labels")

UNKNOWN = "<unk>"  # Unknown char or unknown word
PADDING_WORD = "<pad>"
id_to_label = [f"decade_{decade}" for decade in decades]

def label_to_id(decade):
    return decade_to_label[decade]

create new tokenizer
tokenize -> ./Datasets/clean_train_split/
Processing decade: 1770
Processing decade: 1810
Processing decade: 1850
Processing decade: 1890
succesfully tokenized <- ./Datasets/clean_train_split/
balance training data -> 28336 and balance labels: 28336
1770: 10579 paragraphs
1810: 7485 paragraphs
1850: 6847 paragraphs
1890: 3425 paragraphs
succesfully balanced train data and labels <-
tokenize -> ./Datasets/clean_test_split
Processing decade: 1770
Processing decade: 1810
Processing decade: 1850
Processing decade: 1890
succesfully tokenized <- ./Datasets/clean_test_split
create decade labels
successfully created decades labels


In [15]:
def load_glove_embeddings_aligned(
    embedding_file, tokenizer, padding_word=PADDING_WORD, unknown_word=UNKNOWN
):
    """
    Reads Glove embeddings from a file and aligns them with tokenizer vocabulary.
    """
    glove_vectors = {}
    D = None
    
    with open(embedding_file, encoding="utf8") as f:
        for line in f:
            data = line.split()
            word = data[0]
            vec = [float(x) for x in data[1:]]
            if D is None:
                D = len(vec)
            glove_vectors[word] = vec
    
    print(f"Loaded {len(glove_vectors)} GloVe vectors with dimension {D}")
    
    embeddings = []
    vocab_size = len(tokenizer.word2id)
    found_in_glove = 0
    
    # each word in tokenizer vocabulary
    for word_id in range(vocab_size):
        word = tokenizer.id2word[word_id]
        
        if word == padding_word:
            embeddings.append([0] * D)  
        elif word == unknown_word:
            embeddings.append([-1] * D)  
        elif word in glove_vectors:
            embeddings.append(glove_vectors[word])
            found_in_glove += 1
        else:
            embeddings.append([0.1] * D)
    
    print(f"Found {found_in_glove}/{vocab_size} words from your vocabulary in GloVe")
    print(f"Coverage: {100 * found_in_glove / vocab_size:.2f}%")
    
    return D, tokenizer.word2id, embeddings

In [16]:
class HistoricalTextDataset(Dataset):
    """
    A class loading NER dataset from a CSV file to be used as an input
    to PyTorch DataLoader.

    The CSV file has 4 fields: sentence number (only at the start of a new
    sentence), word, POS tag (ignored), and label.

    Datapoints are sentences + associated labels for each word. If the
    words have not been seen before (i.e, they are not found in the
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    """

    def __init__(self, texts, labels, word_to_id, decade_to_label):
        self.texts = texts 
        self.labels = labels 
        self.word_to_id = word_to_id
        self.decade_to_label = decade_to_label

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        decade = self.labels[idx]
        label_id = self.decade_to_label[decade]

        return text, label_id

In [17]:
# Let's check out some of these data structures
dim, word_to_id, embeddings = load_glove_embeddings_aligned(glove_6b_100_path, tokenizer)

print("The embedding for the word 'good' looks like this:")
print(embeddings[word_to_id["good"]])
print()

# Read the data we are going to use for testing the model
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label
)
print("There are", len(test_set), "documents in the testset")
dp = 0
text, label = test_set[dp]
print("Document", dp, "starts with:", text[:100], "...")
print("It has the label", label, "which corresponds to decade", label_to_decade[label])

Loaded 400000 GloVe vectors with dimension 100
Found 55798/146797 words from your vocabulary in GloVe
Coverage: 38.01%
The embedding for the word 'good' looks like this:
[-0.030769, 0.11993, 0.53909, -0.43696, -0.73937, -0.15345, 0.081126, -0.38559, -0.68797, -0.41632, -0.13183, -0.24922, 0.441, 0.085919, 0.20871, -0.063582, 0.062228, -0.051234, -0.13398, 1.1418, 0.036526, 0.49029, -0.24567, -0.412, 0.12349, 0.41336, -0.48397, -0.54243, -0.27787, -0.26015, -0.38485, 0.78656, 0.1023, -0.20712, 0.40751, 0.32026, -0.51052, 0.48362, -0.0099498, -0.38685, 0.034975, -0.167, 0.4237, -0.54164, -0.30323, -0.36983, 0.082836, -0.52538, -0.064531, -1.398, -0.14873, -0.35327, -0.1118, 1.0912, 0.095864, -2.8129, 0.45238, 0.46213, 1.6012, -0.20837, -0.27377, 0.71197, -1.0754, -0.046974, 0.67479, -0.065839, 0.75824, 0.39405, 0.15507, -0.64719, 0.32796, -0.031748, 0.52899, -0.43886, 0.67405, 0.42136, -0.11981, -0.21777, -0.29756, -0.1351, 0.59898, 0.46529, -0.58258, -0.02323, -1.5442, 0.01901, -0.01587

In [18]:
def pad_sequence_documents(batch, padding_word=PADDING_WORD):
    """
    Dynamic Padding
    """
    batch_data, batch_labels = zip(*batch)
    
    # Convert documents to word ID sequences
    batch_sequences = []
    for text in batch_data:
        word_ids = tokenizer.tokenize_text_to_id(text)
        batch_sequences.append(word_ids)
    max_len = max(map(len, batch_sequences))
    padding_id = tokenizer.word2id[padding_word]
    
    padded_data = [[seq[i] if i < len(seq) else padding_id for i in range(max_len)] 
                   for seq in batch_sequences]
    
    return padded_data, list(batch_labels)

In [19]:
x = [(model_train_data[0], model_train_labels[0])]
pad_sequence_documents(x)

([[105,
   67,
   7,
   1172,
   21695,
   1697,
   64,
   2278,
   52,
   194,
   64,
   850,
   21411,
   19,
   2906,
   19,
   64,
   105,
   19074,
   130,
   52,
   285,
   21,
   105,
   67,
   7,
   4017,
   27,
   11,
   13376,
   1099,
   2054,
   19,
   71,
   1068,
   161,
   82,
   163,
   12821,
   310,
   95,
   79654,
   28,
   2731,
   195,
   91,
   133,
   19,
   81783,
   64,
   108631,
   105,
   71,
   91,
   567,
   19,
   64,
   71,
   91,
   275,
   15053,
   105,
   52,
   7903,
   21,
   118,
   1246,
   3,
   850,
   422,
   44,
   105,
   67,
   5909,
   3,
   7,
   1172,
   2347,
   343,
   64,
   118,
   162,
   1449,
   899,
   19,
   93347,
   19,
   67,
   105,
   301,
   394,
   105,
   67,
   163,
   3797,
   27,
   1266,
   19,
   52,
   12878,
   11,
   427,
   343,
   118,
   162,
   224,
   382,
   6944,
   52,
   3979,
   64,
   436,
   122,
   118,
   2599,
   52,
   93347,
   120,
   21,
   118,
   932,
   850,
   50147,
   10979,
   19,
   64

In [20]:
class DocumentClassifier(nn.Module):
    def __init__(
        self,
        word_embeddings,  # Pre-trained word embeddings
        word_to_id,  # Mapping from words to ids
        num_classes,  # Number of decades to classify
        word_hidden_size=128,
        padding_word=PADDING_WORD,
        unknown_word=UNKNOWN,
        dropout_rate=0.3,
        num_layers=1,  # Keep as 1 for vanilla RNN
        device="cpu",
    ):
        super(DocumentClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.word_to_id = word_to_id
        self.word_hidden_size = word_hidden_size
        self.device = device
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.num_layers = num_layers

        # Create an embedding tensor for the words and import the Glove embeddings
        vocabulary_size = len(word_embeddings)
        self.word_emb_size = len(word_embeddings[0])

        self.word_emb = nn.Embedding(vocabulary_size, self.word_emb_size)
        self.word_emb.weight = nn.Parameter(
            torch.tensor(word_embeddings, dtype=torch.float), requires_grad=False
        )
        self.embedding_dropout = nn.Dropout(dropout_rate * 0.3)
        self.output_dropout = nn.Dropout(dropout_rate)

        self.word_rnn = nn.RNN(
            self.word_emb_size,
            self.word_hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,  
            nonlinearity='tanh' 
        )

        # Document Classification
        self.final_pred = nn.Linear(self.word_hidden_size, num_classes)

    def forward(self, x):
        batch_size, seq_length = x.shape
        word_embeddings = self.word_emb(x)
        word_embeddings = self.embedding_dropout(word_embeddings)

        rnn_output, hidden = self.word_rnn(word_embeddings)

        complete_doc = hidden[0]  

        complete_doc = self.output_dropout(complete_doc)
        logits = self.final_pred(complete_doc)

        return logits

In [21]:
# # ================== Hyper-parameters ==================== #

learning_rate = 0.01
epochs = 10
batch_size = 8
# ======================= Training (First 50 documents) ======================= #

if torch.backends.mps.is_available():
    device = 'mps'
    print("Running on MGPU")
elif torch.cuda.is_available():
    device = 'cuda'
    print("Running on CUDA")
else:
    device = 'cpu'
    print("Running on CPU")

dim, word_to_id, embeddings = load_glove_embeddings_aligned(glove_6b_300_path, tokenizer)

# Use only first 50 documents for testing
print("Using first 10000 documents for testing...")
train_data_small = model_train_data[:1000]
train_labels_small = model_train_labels[:1000]
valid_data_small = model_valid_data[:200]
valid_labels_small = model_valid_labels[:200]
test_data_small = test_data[:100] 
test_labels_small = test_labels[:100]

training_set = HistoricalTextDataset(train_data_small, train_labels_small, word_to_id, decade_to_label)
validation_set = HistoricalTextDataset(valid_data_small, valid_labels_small, word_to_id, decade_to_label)
test_set = HistoricalTextDataset(test_data_small, test_labels_small, word_to_id, decade_to_label)

training_loader = DataLoader(training_set, batch_size=batch_size, collate_fn=pad_sequence_documents)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False, collate_fn=pad_sequence_documents)
test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=pad_sequence_documents)

print(f"Training on {len(training_set)} documents")
print(f"Validation on {len(validation_set)} documents")
print(f"Testing on {len(test_set)} documents")

lstm_classifier = DocumentClassifier(
    word_embeddings=embeddings,
    word_to_id=word_to_id,
    num_classes=len(decades), 
    word_hidden_size=128, 
    dropout_rate=0.2,
    num_layers=1,
    device=device,
).to(device)

optimizer = optim.Adam(lstm_classifier.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    # TRAINING PHASE
    lstm_classifier.train()
    epoch_loss = 0
    correct = 0 
    total = 0
    
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        
        optimizer.zero_grad()
        logits = lstm_classifier(x)
        loss = criterion(logits, y)
        loss.backward()
        clip_grad_norm_(lstm_classifier.parameters(), 1.0)
        optimizer.step()
        
        # Track training accuracy
        epoch_loss += loss.item()
        _, predicted = torch.max(logits.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
    
    # VALIDATION PHASE
    lstm_classifier.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for x, y in validation_loader:
            x = torch.tensor(x, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)
            
            logits = lstm_classifier(x)
            loss = criterion(logits, y)
            
            val_loss += loss.item()
            _, predicted = torch.max(logits.data, 1)
            val_total += y.size(0)
            val_correct += (predicted == y).sum().item()
    
    # Calculate metrics
    val_acc = 100 * val_correct / val_total
    avg_val_loss = val_loss / len(validation_loader)
    train_acc = 100 * correct / total
    avg_loss = epoch_loss / len(training_loader)
    
    # Update scheduler
    scheduler.step()
    
    # Print results
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")

Running on CUDA
Loaded 400000 GloVe vectors with dimension 300
Found 55798/146797 words from your vocabulary in GloVe
Coverage: 38.01%
Using first 10000 documents for testing...
Training on 1000 documents
Validation on 200 documents
Testing on 100 documents


Epoch 1: 100%|██████████| 125/125 [00:01<00:00, 80.04it/s] 


Epoch [1/10], Train Loss: 1.7273, Train Acc: 22.90%, Val Loss: 1.5581, Val Acc: 34.00%


Epoch 2: 100%|██████████| 125/125 [00:01<00:00, 104.60it/s]


Epoch [2/10], Train Loss: 1.6227, Train Acc: 25.70%, Val Loss: 1.4563, Val Acc: 34.50%


Epoch 3: 100%|██████████| 125/125 [00:01<00:00, 98.57it/s] 


Epoch [3/10], Train Loss: 1.5241, Train Acc: 27.60%, Val Loss: 1.3812, Val Acc: 31.00%


Epoch 4: 100%|██████████| 125/125 [00:01<00:00, 99.20it/s]


Epoch [4/10], Train Loss: 1.5071, Train Acc: 26.80%, Val Loss: 1.3886, Val Acc: 29.50%


Epoch 5: 100%|██████████| 125/125 [00:01<00:00, 101.62it/s]


Epoch [5/10], Train Loss: 1.4687, Train Acc: 24.80%, Val Loss: 1.3810, Val Acc: 28.50%


Epoch 6: 100%|██████████| 125/125 [00:01<00:00, 99.86it/s] 


Epoch [6/10], Train Loss: 1.4588, Train Acc: 24.70%, Val Loss: 1.3796, Val Acc: 28.00%


Epoch 7: 100%|██████████| 125/125 [00:01<00:00, 104.81it/s]


Epoch [7/10], Train Loss: 1.4373, Train Acc: 24.20%, Val Loss: 1.4027, Val Acc: 25.00%


Epoch 8: 100%|██████████| 125/125 [00:01<00:00, 103.67it/s]


Epoch [8/10], Train Loss: 1.4156, Train Acc: 28.20%, Val Loss: 1.4204, Val Acc: 20.50%


Epoch 9: 100%|██████████| 125/125 [00:01<00:00, 100.28it/s]


Epoch [9/10], Train Loss: 1.4243, Train Acc: 26.80%, Val Loss: 1.4168, Val Acc: 24.50%


Epoch 10: 100%|██████████| 125/125 [00:01<00:00, 103.48it/s]


Epoch [10/10], Train Loss: 1.4192, Train Acc: 27.00%, Val Loss: 1.3883, Val Acc: 28.50%


In [22]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

lstm_classifier.eval()

# Create test dataset and loader
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label)
test_loader = DataLoader(test_set, batch_size=16, collate_fn=pad_sequence_documents)

all_predictions = []
all_labels = []

with torch.no_grad():
    for x, y in test_loader:
        # Convert to tensors
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        pred = torch.argmax(lstm_classifier(x), dim=-1).detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()
        
        all_predictions.extend(pred)
        all_labels.extend(y_np)

# confusion matrix 
num_classes = len(decade_to_label)
confusion_matrix_manual = [[0 for _ in range(num_classes)] for _ in range(num_classes)]

for i in range(len(all_predictions)):
    actual = all_labels[i]
    predicted = all_predictions[i]
    confusion_matrix_manual[actual][predicted] += 1

# Print results
print("Confusion Matrix:")
print("Predicted ->", [f"D{label_to_decade[i]}" for i in range(num_classes)])
for i, row in enumerate(confusion_matrix_manual):
    print(f"Actual D{label_to_decade[i]}: {row}")

accuracy = sum(confusion_matrix_manual[i][i] for i in range(num_classes)) / sum(sum(row) for row in confusion_matrix_manual)
print(f"Accuracy: {accuracy:.4f}")

Confusion Matrix:
Predicted -> ['D1770', 'D1810', 'D1850', 'D1890']
Actual D1770: [945, 510, 605, 40]
Actual D1810: [711, 399, 341, 18]
Actual D1850: [1753, 986, 909, 65]
Actual D1890: [304, 180, 113, 7]
Accuracy: 0.2866


In [23]:
# ================== Hyper-parameters ==================== #
learning_rate = 0.001  
epochs = 8
batch_size = 24 
weight_decay = 1e-4

# ======================= Training ======================= #
if torch.backends.mps.is_available():
    device = "mps"
    print("Running on MGPU")
elif torch.cuda.is_available():
    device = "cuda"
    print("Running on CUDA")
else:
    device = "cpu"
    print("Running on CPU")

dim, word_to_id, embeddings = load_glove_embeddings_aligned(glove_6b_50_path, tokenizer)

training_set = HistoricalTextDataset(model_train_data, model_train_labels, word_to_id, decade_to_label)
validation_set = HistoricalTextDataset(model_valid_data, model_valid_labels, word_to_id, decade_to_label)
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label)

training_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True, collate_fn=pad_sequence_documents)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False, collate_fn=pad_sequence_documents)
test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=pad_sequence_documents)

print(f"Training on {len(training_set)} documents")
print(f"Validation on {len(validation_set)} documents")
print(f"Testing on {len(test_set)} documents")

lstm_classifier = DocumentClassifier(
    word_embeddings=embeddings,
    word_to_id=word_to_id,
    num_classes=len(decades), 
    word_hidden_size=64, 
    num_layers=1,
    device=device,
).to(device)

optimizer = optim.Adam(lstm_classifier.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    # TRAINING PHASE
    lstm_classifier.train()
    epoch_loss = 0
    correct = 0 
    total = 0
    
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        
        optimizer.zero_grad()
        logits = lstm_classifier(x)
        loss = criterion(logits, y)
        loss.backward()
        clip_grad_norm_(lstm_classifier.parameters(), 5)
        optimizer.step()
        
        # Track training accuracy
        epoch_loss += loss.item()
        _, predicted = torch.max(logits.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
    
    # VALIDATION PHASE
    lstm_classifier.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for x, y in validation_loader:
            x = torch.tensor(x, dtype=torch.long).to(device)
            y = torch.tensor(y, dtype=torch.long).to(device)
            
            logits = lstm_classifier(x)
            loss = criterion(logits, y)
            
            val_loss += loss.item()
            _, predicted = torch.max(logits.data, 1)
            val_total += y.size(0)
            val_correct += (predicted == y).sum().item()
    
    # Calculate metrics
    val_acc = 100 * val_correct / val_total
    avg_val_loss = val_loss / len(validation_loader)
    train_acc = 100 * correct / total
    avg_loss = epoch_loss / len(training_loader)
    
    # Update scheduler
    scheduler.step()
    
    # Print results
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")

Running on CUDA
Loaded 400000 GloVe vectors with dimension 50
Found 55798/146797 words from your vocabulary in GloVe
Coverage: 38.01%
Training on 1920 documents
Validation on 480 documents
Testing on 7886 documents


Epoch 1: 100%|██████████| 80/80 [00:02<00:00, 39.05it/s]


Epoch [1/8], Train Loss: 1.3882, Train Acc: 26.15%, Val Loss: 1.3845, Val Acc: 26.46%


Epoch 2: 100%|██████████| 80/80 [00:02<00:00, 38.54it/s]


Epoch [2/8], Train Loss: 1.3841, Train Acc: 25.62%, Val Loss: 1.3867, Val Acc: 25.62%


Epoch 3: 100%|██████████| 80/80 [00:02<00:00, 38.36it/s]


Epoch [3/8], Train Loss: 1.3806, Train Acc: 24.69%, Val Loss: 1.3864, Val Acc: 26.25%


Epoch 4: 100%|██████████| 80/80 [00:02<00:00, 39.02it/s]


Epoch [4/8], Train Loss: 1.3793, Train Acc: 26.15%, Val Loss: 1.3848, Val Acc: 25.83%


Epoch 5: 100%|██████████| 80/80 [00:02<00:00, 38.71it/s]


Epoch [5/8], Train Loss: 1.3774, Train Acc: 26.77%, Val Loss: 1.3846, Val Acc: 26.67%


Epoch 6: 100%|██████████| 80/80 [00:02<00:00, 39.72it/s]


Epoch [6/8], Train Loss: 1.3810, Train Acc: 25.68%, Val Loss: 1.3841, Val Acc: 26.46%


Epoch 7: 100%|██████████| 80/80 [00:02<00:00, 39.26it/s]


Epoch [7/8], Train Loss: 1.3776, Train Acc: 25.36%, Val Loss: 1.3839, Val Acc: 26.67%


Epoch 8: 100%|██████████| 80/80 [00:02<00:00, 39.19it/s]


Epoch [8/8], Train Loss: 1.3785, Train Acc: 26.25%, Val Loss: 1.3837, Val Acc: 26.67%


In [24]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

lstm_classifier.eval()

# Create test dataset and loader
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label)
test_loader = DataLoader(test_set, batch_size=batch_size, collate_fn=pad_sequence_documents)

all_predictions = []
all_labels = []

with torch.no_grad():
    for x, y in test_loader:
        # Convert to tensors
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        pred = torch.argmax(lstm_classifier(x), dim=-1).detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()
        
        all_predictions.extend(pred)
        all_labels.extend(y_np)

# confusion matrix 
num_classes = len(decade_to_label)
confusion_matrix_manual = [[0 for _ in range(num_classes)] for _ in range(num_classes)]

for i in range(len(all_predictions)):
    actual = all_labels[i]
    predicted = all_predictions[i]
    confusion_matrix_manual[actual][predicted] += 1

# Print results
print("Confusion Matrix:")
print("Predicted ->", [f"D{label_to_decade[i]}" for i in range(num_classes)])
for i, row in enumerate(confusion_matrix_manual):
    print(f"Actual D{label_to_decade[i]}: {row}")

accuracy = sum(confusion_matrix_manual[i][i] for i in range(num_classes)) / sum(sum(row) for row in confusion_matrix_manual)
print(f"Accuracy: {accuracy:.4f}")

Confusion Matrix:
Predicted -> ['D1770', 'D1810', 'D1850', 'D1890']
Actual D1770: [72, 1672, 165, 191]
Actual D1810: [20, 1329, 82, 38]
Actual D1850: [43, 3402, 196, 72]
Actual D1890: [6, 536, 36, 26]
Accuracy: 0.2058
