# RNN Architecture - GRU RNN

## Setup - Libraries, Packages, Embeddings, Paths

In [41]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os 
import urllib.request
import zipfile 
from tqdm import tqdm 

In [42]:
embeddings_path = "./Embeddings"
def download_progress(block_num, block_size, total_size):
    if not hasattr(download_progress, "pbar"):
        download_progress.pbar = tqdm(total=total_size, unit="B", unit_scale=True)
    download_progress.pbar.update(block_size)

if not os.path.exists(embeddings_path):
    print(f"create directory to store pre-trained glove embeddings")
    os.makedirs(embeddings_path)
    print(f"download pre-trained Glove Embeddings")
    urllib.request.urlretrieve(
        "http://nlp.stanford.edu/data/glove.6B.zip",
        "./Embeddings/glove.6B.zip",
        download_progress,
    )
    print("unpack embeddings")
    with zipfile.ZipFile("./Embeddings/glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall("./Embeddings/")
    os.remove("./Embeddings/glove.6B.zip")
    
    print("embeddings download complete")

In [43]:
glove_6b_50_path = "./Embeddings/glove.6B.50d.txt"
glove_6b_100_path = "./Embeddings/glove.6B.100d.txt"
glove_6b_200_path = "./Embeddings/glove.6B.200d.txt"
glove_6b_300_path = "./Embeddings/glove.6B.300d.txt"
clean_train_split_path = "./Datasets/clean_train_split/"
clean_test_split_path = "./Datasets/clean_test_split"

In [44]:
import nltk 
nltk.download("punkt_tab")
import numpy as np 
from collections import defaultdict

class HistoricalTextTokenizer:
    """
    All of this code is adapted from Professor Johan Boye's DD2417 assignment tokenizers 
    """
    def __init__(self):
        self.word2id = defaultdict(lambda: None)
        self.id2word = defaultdict(lambda: None)
        self.latest_new_word = -1 
        self.tokens_processed = 0 

        self.UNKNOWN = '<unk>'
        self.PADDING_WORD = '<pad>'

        self.get_word_id(self.PADDING_WORD)
        self.get_word_id(self.UNKNOWN)

    def get_word_id(self, word):
        word = word.lower()
        if word in self.word2id:
            return self.word2id[word]
        else:
            self.latest_new_word += 1
            self.id2word[self.latest_new_word] = word
            self.word2id[word] = self.latest_new_word
            return self.latest_new_word

    def process_files(self, file_or_dir):
        all_texts = []
        all_labels = []

        if os.path.isdir(file_or_dir):
            decade_dirs = sorted([d for d in os.listdir(file_or_dir) if os.path.isdir(os.path.join(file_or_dir, d))])
            for decade_dir in decade_dirs:
                decade_path = os.path.join(file_or_dir, decade_dir)
                decade = int(decade_dir)
                print(f"Processing decade: {decade}")
                text_files = sorted([f for f in os.listdir(decade_path) if f.endswith(".txt")])
                # print(f"number of files in {decade} directory: {len(text_files)}")

                for file in text_files:
                    filepath = os.path.join(decade_path, file)
                    # print(f"tokenize file {file}")
                    text, labels = self.process_file(filepath, decade)
                    all_texts.extend(text)
                    all_labels.extend(labels)
        else:
            texts, labels = self.process_file(file_or_dir, 0)
            all_texts.extend(texts)
            all_labels.extend(labels)

        return all_texts, all_labels

    def process_file(self, filepath, decade):
        # print(filepath)
        stream = open(filepath, mode="r", encoding="utf-8", errors="ignore")
        text = stream.read()
        stream.close()

        try:
            self.tokens = nltk.word_tokenize(text)
        except LookupError:
            nltk.download("punkt")
            self.tokens = nltk.word_tokenize(text)

        for i, token in enumerate(self.tokens):
            self.tokens_processed += 1
            word_id = self.get_word_id(token)

            if self.tokens_processed % 1000000000 == 0:
                print("Processed", "{:,}".format(self.tokens_processed), "tokens")

        paragraphs = self.create_paragraphs(text)
        labels = [decade] * len(paragraphs)

        return paragraphs, labels

    def create_paragraphs(self, text, min_words=10, max_words=210):
        words = text.split()
        paragraphs = []
        start = 0

        while start < len(words):
            end = min(start + max_words, len(words))
            paragraph_words = words[start:end]
            if len(paragraph_words) >= min_words:
                paragraph_text = " ".join(paragraph_words)
                paragraphs.append(paragraph_text)
            start = end

        return paragraphs 

    def tokenize_text_to_id(self, text):
        try:
            tokens = nltk.word_tokenize(text.lower())
        except LookupError:
            nltk.download("punkt")
            tokens = nltk.word_tokenize(text.lower())
        word_ids = []
        for token in tokens:
            if token in self.word2id:
                word_ids.append(self.word2id[token])
            else:
                word_ids.append(self.word2id[self.UNKNOWN])
        return word_ids

    def get_vocab_size(self):
        return len(self.word2id)

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [45]:
text_tokenizer = HistoricalTextTokenizer()

In [46]:
train_text_data, train_labels = text_tokenizer.process_files(clean_train_split_path)

Processing decade: 1770
Processing decade: 1780
Processing decade: 1790
Processing decade: 1800
Processing decade: 1810
Processing decade: 1820
Processing decade: 1830
Processing decade: 1840
Processing decade: 1850
Processing decade: 1860
Processing decade: 1870
Processing decade: 1880
Processing decade: 1890


In [47]:
test_text_data, test_labels = text_tokenizer.process_files(clean_test_split_path)

Processing decade: 1770
Processing decade: 1780
Processing decade: 1790
Processing decade: 1800
Processing decade: 1810
Processing decade: 1820
Processing decade: 1830
Processing decade: 1840
Processing decade: 1850
Processing decade: 1860
Processing decade: 1870
Processing decade: 1880
Processing decade: 1890


In [48]:
labels = sorted(set(train_labels + test_labels))
decade_to_label = {decade: i for i, decade in enumerate(labels)}
print(f"{decade_to_label}") 

{1770: 0, 1780: 1, 1790: 2, 1800: 3, 1810: 4, 1820: 5, 1830: 6, 1840: 7, 1850: 8, 1860: 9, 1870: 10, 1880: 11, 1890: 12}


In [49]:
print(f"number of train labels -> {len(train_labels)}")
print(f"length of train text(paragraphs) -> {len(train_text_data)}")
print()

print(f"number of test labels -> {len(test_labels)}")
print(f"length of test text -> {len(test_text_data)}")
print()

print(f"train text {train_text_data[0]}")
print(f"train label {train_labels[0]}")
print()

print(f"test text(paragraphs) {test_text_data[0]}")
print(f"test label {test_labels[0]}")

number of train labels -> 84860
length of train text(paragraphs) -> 84860

number of test labels -> 25538
length of test text -> 25538

train text Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes 

In [50]:
train_sample = train_text_data[0]
train_sample_label = train_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(train_sample)

print(f"train sample -> {train_sample}")
print(f"train sample labe -> {train_sample_label}")
print(f"tokenized train_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

train sample -> Produced by Gary R. Young THE SCHOOL FOR SCANDAL A COMEDY A PORTRAIT<1> BY R. B. SHERIDAN, ESQ. Transcriber's Comments on the preparation of this E-Text: SQUARE BRACKETS: The square brackets, i.e. [ ] are copied from the printed book, without change, except that a closing bracket "]" has been added to the stage directions. FOOTNOTES: For this E-Text version of the book, the footnotes have been consolidated at the end of the play. Numbering of the footnotes has been changed, and each footnote is given a unique identity in the form <X>. CHANGES TO THE TEXT: Character names have been expanded. For Example, SIR BENJAMIN was SIR BEN. THE TEXT OF THE SCHOOL FOR SCANDAL The text of THE SCHOOL FOR SCANDAL in this edition is taken, by Mr. Fraser Rae's generous permission, from his SHERIDAN'S PLAYS NOW PRINTED AS HE WROTE THEM. In his Prefatory Notes (xxxvii), Mr. Rae writes: "The manuscript of it [THE SCHOOL FOR SCANDAL] in Sheridan's own handwriting is preserved at Frampton Cou

In [51]:
test_sample = test_text_data[0]
test_sample_label = test_labels[0]
word_ids = text_tokenizer.tokenize_text_to_id(test_sample)

print(f"test sample -> {test_sample}")
print(f"test sample label -> {test_sample_label}")
print(f"tokenized test_sample -> {word_ids}")
print(f"length of tokenized word {len(word_ids)}")

test sample -> An Inquiry into the Nature and Causes of the Wealth of Nations by Adam Smith Contents INTRODUCTION AND PLAN OF THE WORK. BOOK I. OF THE CAUSES OF IMPROVEMENT IN THE PRODUCTIVE POWERS OF LABOUR, AND OF THE ORDER ACCORDING TO WHICH ITS PRODUCE IS NATURALLY DISTRIBUTED AMONG THE DIFFERENT RANKS OF THE PEOPLE. CHAPTER I. OF THE DIVISION OF LABOUR. CHAPTER II. OF THE PRINCIPLE WHICH GIVES OCCASION TO THE DIVISION OF LABOUR. CHAPTER III. THAT THE DIVISION OF LABOUR IS LIMITED BY THE EXTENT OF THE MARKET. CHAPTER IV. OF THE ORIGIN AND USE OF MONEY. CHAPTER V. OF THE REAL AND NOMINAL PRICE OF COMMODITIES, OR OF THEIR PRICE IN LABOUR, AND THEIR PRICE IN MONEY. CHAPTER VI. OF THE COMPONENT PART OF THE PRICE OF COMMODITIES. CHAPTER VII. OF THE NATURAL AND MARKET PRICE OF COMMODITIES. CHAPTER VIII. OF THE WAGES OF LABOUR. CHAPTER IX. OF THE PROFITS OF STOCK. CHAPTER X. OF WAGES AND PROFIT IN THE DIFFERENT EMPLOYMENTS OF LABOUR AND STOCK. CHAPTER XI. OF THE RENT OF LAND. BOOK II. OF 

In [52]:
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [53]:
print(f"create new tokenizer")
tokenizer = HistoricalTextTokenizer()

print(f"tokenize -> {clean_train_split_path}")
train_data, train_labels = tokenizer.process_files(clean_train_split_path)
print(f"succesfully tokenized <- {clean_train_split_path}")

print(f"tokenize -> {clean_test_split_path}")
test_data, test_labels = tokenizer.process_files(clean_test_split_path)
print(f"succesfully tokenized <- {clean_test_split_path}")

print(f"create decade labels")
decades = sorted(set(train_labels + test_labels))
decade_to_label = {decade: i for i, decade in enumerate(decades)}
label_to_decade = {i: decade for i, decade in enumerate(decades)}
print(f"successfully created decades labels")

UNKNOWN = "<unk>"  # Unknown char or unknown word
PADDING_WORD = "<pad>"
id_to_label = [f"decade_{decade}" for decade in decades]

def label_to_id(decade):
    return decade_to_label[decade]

create new tokenizer
tokenize -> ./Datasets/clean_train_split/
Processing decade: 1770
Processing decade: 1780
Processing decade: 1790
Processing decade: 1800
Processing decade: 1810
Processing decade: 1820
Processing decade: 1830
Processing decade: 1840
Processing decade: 1850
Processing decade: 1860
Processing decade: 1870
Processing decade: 1880
Processing decade: 1890
succesfully tokenized <- ./Datasets/clean_train_split/
tokenize -> ./Datasets/clean_test_split
Processing decade: 1770
Processing decade: 1780
Processing decade: 1790
Processing decade: 1800
Processing decade: 1810
Processing decade: 1820
Processing decade: 1830
Processing decade: 1840
Processing decade: 1850
Processing decade: 1860
Processing decade: 1870
Processing decade: 1880
Processing decade: 1890
succesfully tokenized <- ./Datasets/clean_test_split
create decade labels
successfully created decades labels


In [54]:
def load_glove_embeddings(
    embedding_file, padding_word=PADDING_WORD, unknown_word=UNKNOWN
):
    """
    Reads Glove embeddings from a file.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).
    """
    word_to_id = {}  # Dictionary to store word-to-ID mapping
    word_to_id[padding_word] = 0
    word_to_id[unknown_word] = 1
    embeddings = []
    with open(embedding_file, encoding="utf8") as f:
        for line in f:
            data = line.split()
            word = data[0]
            vec = [float(x) for x in data[1:]]
            embeddings.append(vec)
            word_to_id[word] = len(word_to_id)
    D = len(embeddings[0])

    embeddings.insert(
        word_to_id[padding_word], [0] * D
    )  # <PAD> has an embedding of just zeros
    embeddings.insert(
        word_to_id[unknown_word], [-1] * D
    )  # <UNK> has an embedding of just minus-ones

    return D, word_to_id, embeddings

In [55]:
class HistoricalTextDataset(Dataset):
    """
    A class loading NER dataset from a CSV file to be used as an input
    to PyTorch DataLoader.

    The CSV file has 4 fields: sentence number (only at the start of a new
    sentence), word, POS tag (ignored), and label.

    Datapoints are sentences + associated labels for each word. If the
    words have not been seen before (i.e, they are not found in the
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    """

    def __init__(self, texts, labels, word_to_id, decade_to_label):
        self.texts = texts 
        self.labels = labels 
        self.word_to_id = word_to_id
        self.decade_to_label = decade_to_label

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        decade = self.labels[idx]
        label_id = self.decade_to_label[decade]

        return text, label_id

In [56]:
# Let's check out some of these data structures
dim, word_to_id, embeddings = load_glove_embeddings(glove_6b_50_path)
tokenizer.word2id = word_to_id
tokenizer.id2word = {v: k for k, v in word_to_id.items()}

print("The embedding for the word 'good' looks like this:")
print(embeddings[word_to_id["good"]])
print()

# Read the data we are going to use for testing the model
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label
)
print("There are", len(test_set), "documents in the testset")
dp = 0
text, label = test_set[dp]
print("Document", dp, "starts with:", text[:100], "...")
print("It has the label", label, "which corresponds to decade", label_to_decade[label])

The embedding for the word 'good' looks like this:
[-0.35586, 0.5213, -0.6107, -0.30131, 0.94862, -0.31539, -0.59831, 0.12188, -0.031943, 0.55695, -0.10621, 0.63399, -0.4734, -0.075895, 0.38247, 0.081569, 0.82214, 0.2222, -0.0083764, -0.7662, -0.56253, 0.61759, 0.20292, -0.048598, 0.87815, -1.6549, -0.77418, 0.15435, 0.94823, -0.3952, 3.7302, 0.82855, -0.14104, 0.016395, 0.21115, -0.036085, -0.15587, 0.86583, 0.26309, -0.71015, -0.03677, 0.0018282, -0.17704, 0.27032, 0.11026, 0.14133, -0.057322, 0.27207, 0.31305, 0.92771]

There are 25538 documents in the testset
Document 0 starts with: An Inquiry into the Nature and Causes of the Wealth of Nations by Adam Smith Contents INTRODUCTION A ...
It has the label 0 which corresponds to decade 1770


In [57]:
def pad_sequence_documents(batch, padding_word=PADDING_WORD, max_length=500):
    batch_data, batch_labels = zip(*batch)

    # convert documents to word IDs
    padded_data = []
    for text in batch_data:
        word_ids = tokenizer.tokenize_text_to_id(text)

        # truncate if too long
        if len(word_ids) > max_length:
            word_ids = word_ids[:max_length]

        # pad if too short
        padding_id = word_to_id[padding_word]
        while len(word_ids) < max_length:
            word_ids.append(padding_id)

        padded_data.append(word_ids)

    padded_labels = list(batch_labels)
    return padded_data, padded_labels

In [58]:
x = [(train_data[0], train_labels[0])]
pad_sequence_documents(x)

([[1016,
   23,
   3447,
   3471,
   463,
   2,
   166,
   12,
   2676,
   9,
   2843,
   9,
   6035,
   19797,
   178,
   12259,
   23,
   3471,
   3270,
   16156,
   3,
   156451,
   4,
   175133,
   11,
   1941,
   15,
   2,
   4791,
   5,
   39,
   1,
   47,
   1251,
   21270,
   47,
   2,
   1251,
   21270,
   3,
   124647,
   4,
   2825,
   5282,
   34,
   15070,
   27,
   2,
   5605,
   541,
   3,
   298,
   513,
   3,
   2079,
   14,
   9,
   2321,
   14451,
   30,
   5282,
   29,
   33,
   53,
   297,
   6,
   2,
   914,
   6977,
   4,
   27072,
   47,
   12,
   39,
   1,
   1064,
   5,
   2,
   541,
   3,
   2,
   27072,
   35,
   53,
   8648,
   24,
   2,
   158,
   5,
   2,
   284,
   4,
   15056,
   5,
   2,
   27072,
   33,
   53,
   1167,
   3,
   7,
   238,
   24299,
   16,
   456,
   9,
   3008,
   3006,
   8,
   2,
   685,
   19797,
   1587,
   12259,
   4,
   1048,
   6,
   2,
   2831,
   47,
   1397,
   1471,
   35,
   53,
   2854,
   4,
   12,
   882,
   3,
   2701

In [59]:
class DocumentClassifier(nn.Module):
    def __init__(self, word_embeddings,  # Pre-trained word embeddings
                    word_to_id,             # Mapping from words to ids
                    num_classes,            # Number of decades to classify
                    word_hidden_size=128,   # Hidden size of the RNN (paper uses 128)
                    padding_word=PADDING_WORD,
                    unknown_word=UNKNOWN,
                    word_bidirectional=True,
                    device='cpu'
                ):
        super(DocumentClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.word_to_id = word_to_id
        self.word_hidden_size = word_hidden_size
        self.device = device
        self.num_classes = num_classes
        self.word_bidirectional = word_bidirectional

        # Create an embedding tensor for the words and import the Glove
        # embeddings. The embeddings are frozen (i.e., they will not be
        # updated during training).
        vocabulary_size = len(word_embeddings)
        self.word_emb_size = len(word_embeddings[0])

        self.word_emb = nn.Embedding(vocabulary_size, self.word_emb_size)
        self.word_emb.weight = nn.Parameter(
            torch.tensor(word_embeddings, dtype=torch.float), requires_grad=False
        )

        self.word_birnn = nn.GRU(
            self.word_emb_size,
            self.word_hidden_size,
            bidirectional=word_bidirectional, 
            batch_first = True
        )

        # Document Classification
        multiplier = 2 if self.word_bidirectional else 1
        self.final_pred = nn.Linear(multiplier * self.word_hidden_size, num_classes)

    def forward(self, x):
        # Shape: (batch_size, seq_length)
        batch_size, seq_length = x.shape
        word_embeddings = self.word_emb(x)
        word_output, word_hidden = self.word_birnn(word_embeddings)
        if self.word_bidirectional:
            forward_final = word_hidden[0]
            backward_final = word_hidden[1]
            complete_doc = torch.cat([forward_final, backward_final], dim=1)
        else:
            complete_doc = word_hidden.squeeze(0)


        logits = self.final_pred(complete_doc)

        return logits

In [60]:
# # ================== Hyper-parameters ==================== #

learning_rate = 0.001
epochs = 5  # Paper uses 5 epochs for RNN
# ======================= Training (First 50 documents) ======================= #

if torch.backends.mps.is_available():
    device = 'mps'
    print("Running on MGPU")
elif torch.cuda.is_available():
    device = 'cuda'
    print("Running on CUDA")
else:
    device = 'cpu'
    print("Running on CPU")

dim, word_to_id, embeddings = load_glove_embeddings(glove_6b_50_path)
tokenizer.word2id = word_to_id
tokenizer.id2word = {v: k for k, v in word_to_id.items()}

# Use only first 50 documents for testing
print("Using first 10000 documents for testing...")
train_data_small = train_data[:10000]
train_labels_small = train_labels[:10000]
test_data_small = test_data[:50]  # First 50 for testing
test_labels_small = test_labels[:50]

training_set = HistoricalTextDataset(train_data_small, train_labels_small, word_to_id, decade_to_label)
test_set = HistoricalTextDataset(test_data_small, test_labels_small, word_to_id, decade_to_label)

training_loader = DataLoader(training_set, batch_size=16, collate_fn=pad_sequence_documents)
test_loader = DataLoader(test_set, batch_size=16, collate_fn=pad_sequence_documents)

print(f"Training on {len(training_set)} documents")
print(f"Testing on {len(test_set)} documents")

birnn_classifier = DocumentClassifier(
    word_embeddings=embeddings,
    word_to_id=word_to_id,
    num_classes=len(decade_to_label),
    word_hidden_size=128, 
    word_bidirectional=True,
    device=device,
).to(device)

optimizer = optim.Adam(birnn_classifier.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
criterion = nn.CrossEntropyLoss()

birnn_classifier.train()
for epoch in range(epochs):
    epoch_loss = 0
    correct = 0 
    total = 0
    
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        
        optimizer.zero_grad()
        logits = birnn_classifier(x)
        loss = criterion(logits, y)
        loss.backward()
        clip_grad_norm_(birnn_classifier.parameters(), 5)
        optimizer.step()
        
        # Track training accuracy
        epoch_loss += loss.item()
        _, predicted = torch.max(logits.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
    
    scheduler.step()
    
    train_acc = 100 * correct / total
    avg_loss = epoch_loss / len(training_loader)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%")

Running on CUDA
Using first 10000 documents for testing...
Training on 10000 documents
Testing on 50 documents


Epoch 1: 100%|██████████| 625/625 [00:13<00:00, 44.94it/s]


Epoch [1/5], Loss: 0.0362, Train Acc: 99.67%


Epoch 2: 100%|██████████| 625/625 [00:13<00:00, 46.92it/s]


Epoch [2/5], Loss: 0.0000, Train Acc: 100.00%


Epoch 3: 100%|██████████| 625/625 [00:13<00:00, 46.97it/s]


Epoch [3/5], Loss: 0.0000, Train Acc: 100.00%


Epoch 4: 100%|██████████| 625/625 [00:13<00:00, 45.46it/s]


Epoch [4/5], Loss: 0.0000, Train Acc: 100.00%


Epoch 5: 100%|██████████| 625/625 [00:13<00:00, 45.75it/s]

Epoch [5/5], Loss: 0.0000, Train Acc: 100.00%





In [61]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

birnn_classifier.eval()

# Create test dataset and loader
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label)
test_loader = DataLoader(test_set, batch_size=16, collate_fn=pad_sequence_documents)

all_predictions = []
all_labels = []

with torch.no_grad():
    for x, y in test_loader:
        # Convert to tensors
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        pred = torch.argmax(birnn_classifier(x), dim=-1).detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()
        
        all_predictions.extend(pred)
        all_labels.extend(y_np)

# confusion matrix 
num_classes = len(decade_to_label)
confusion_matrix_manual = [[0 for _ in range(num_classes)] for _ in range(num_classes)]

for i in range(len(all_predictions)):
    actual = all_labels[i]
    predicted = all_predictions[i]
    confusion_matrix_manual[actual][predicted] += 1

# Print results
print("Confusion Matrix:")
print("Predicted ->", [f"D{label_to_decade[i]}" for i in range(num_classes)])
for i, row in enumerate(confusion_matrix_manual):
    print(f"Actual D{label_to_decade[i]}: {row}")

accuracy = (confusion_matrix_manual[0][0] + sum(confusion_matrix_manual[i][i] for i in range(num_classes))) / sum(sum(row) for row in confusion_matrix_manual)
print(f"Accuracy: {accuracy:.4f}")

Confusion Matrix:
Predicted -> ['D1770', 'D1780', 'D1790', 'D1800', 'D1810', 'D1820', 'D1830', 'D1840', 'D1850', 'D1860', 'D1870', 'D1880', 'D1890']
Actual D1770: [2100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1780: [810, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1790: [1128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1800: [922, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1810: [1469, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1820: [2462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1830: [1884, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1840: [3776, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1850: [2184, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1860: [2816, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1870: [3480, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1880: [1819, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Actual D1890: [688, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Accuracy: 0.1645


In [62]:
# ================== Hyper-parameters ==================== #
learning_rate = 0.001  
epochs = 5

# ======================= Training ======================= #
if torch.backends.mps.is_available():
    device = "mps"
    print("Running on MGPU")
elif torch.cuda.is_available():
    device = "cuda"
    print("Running on CUDA")
else:
    device = "cpu"
    print("Running on CPU")

dim, word_to_id, embeddings = load_glove_embeddings(glove_6b_300_path)
tokenizer.word2id = word_to_id
tokenizer.id2word = {v: k for k, v in word_to_id.items()}

training_set = HistoricalTextDataset(
    train_data, train_labels, word_to_id, decade_to_label
)
training_loader = DataLoader(
    training_set, batch_size=64, collate_fn=pad_sequence_documents
)

birnn_classifier = DocumentClassifier(
    word_embeddings=embeddings,
    word_to_id=word_to_id,
    num_classes=len(decade_to_label),
    word_hidden_size=128, 
    device=device,
).to(device)

optimizer = optim.Adam(birnn_classifier.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)
criterion = nn.CrossEntropyLoss()

birnn_classifier.train()
for epoch in range(epochs):
    epoch_loss = 0
    correct = 0 
    total = 0
    
    for x, y in tqdm(training_loader, desc="Epoch {}".format(epoch + 1)):
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        
        optimizer.zero_grad()
        logits = birnn_classifier(x)
        loss = criterion(logits, y)
        loss.backward()
        clip_grad_norm_(birnn_classifier.parameters(), 5)
        optimizer.step()
        
        # Track training accuracy
        epoch_loss += loss.item()
        _, predicted = torch.max(logits.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
    
    scheduler.step()
    
    train_acc = 100 * correct / total
    avg_loss = epoch_loss / len(training_loader)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%")

Running on CUDA


Epoch 1: 100%|██████████| 1326/1326 [01:41<00:00, 13.12it/s]


Epoch [1/5], Loss: 0.7107, Train Acc: 90.02%


Epoch 2: 100%|██████████| 1326/1326 [01:38<00:00, 13.40it/s]


Epoch [2/5], Loss: 0.5939, Train Acc: 90.34%


Epoch 3: 100%|██████████| 1326/1326 [01:40<00:00, 13.26it/s]


Epoch [3/5], Loss: 0.7996, Train Acc: 83.85%


Epoch 4: 100%|██████████| 1326/1326 [01:40<00:00, 13.19it/s]


Epoch [4/5], Loss: 0.7103, Train Acc: 83.70%


Epoch 5: 100%|██████████| 1326/1326 [01:40<00:00, 13.23it/s]

Epoch [5/5], Loss: 1.0095, Train Acc: 71.15%





In [63]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

birnn_classifier.eval()

# Create test dataset and loader
test_set = HistoricalTextDataset(test_data, test_labels, word_to_id, decade_to_label)
test_loader = DataLoader(test_set, batch_size=64, collate_fn=pad_sequence_documents)

all_predictions = []
all_labels = []

with torch.no_grad():
    for x, y in test_loader:
        # Convert to tensors
        x = torch.tensor(x, dtype=torch.long).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        pred = torch.argmax(birnn_classifier(x), dim=-1).detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()
        
        all_predictions.extend(pred)
        all_labels.extend(y_np)

# confusion matrix 
num_classes = len(decade_to_label)
confusion_matrix_manual = [[0 for _ in range(num_classes)] for _ in range(num_classes)]

for i in range(len(all_predictions)):
    actual = all_labels[i]
    predicted = all_predictions[i]
    confusion_matrix_manual[actual][predicted] += 1

# Print results
print("Confusion Matrix:")
print("Predicted ->", [f"D{label_to_decade[i]}" for i in range(num_classes)])
for i, row in enumerate(confusion_matrix_manual):
    print(f"Actual D{label_to_decade[i]}: {row}")

accuracy = (confusion_matrix_manual[0][0] + sum(confusion_matrix_manual[i][i] for i in range(num_classes))) / sum(sum(row) for row in confusion_matrix_manual)
print(f"Accuracy: {accuracy:.4f}")

Confusion Matrix:
Predicted -> ['D1770', 'D1780', 'D1790', 'D1800', 'D1810', 'D1820', 'D1830', 'D1840', 'D1850', 'D1860', 'D1870', 'D1880', 'D1890']
Actual D1770: [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 2095]
Actual D1780: [1, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 2, 802]
Actual D1790: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1126]
Actual D1800: [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 4, 914]
Actual D1810: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 6, 1460]
Actual D1820: [6, 1, 2, 0, 0, 0, 3, 0, 11, 12, 15, 15, 2397]
Actual D1830: [2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4, 1875]
Actual D1840: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 5, 3766]
Actual D1850: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 3, 2179]
Actual D1860: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 3, 2810]
Actual D1870: [2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 8, 3464]
Actual D1880: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 1, 1812]
Actual D1890: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 686]
Accuracy: 0.0271
