In [1]:
# First run this cell
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [2]:
import nltk
# Download required NLTK data.
# Each tokenizer resource is looked up first and only downloaded when the
# lookup fails, so re-running this cell does not download anything twice.
for resource in ('punkt_tab', 'punkt'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        print(f"Downloading {resource}...")
        nltk.download(resource)

from nltk.tokenize import word_tokenize, sent_tokenize

Downloading punkt_tab...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Downloading punkt...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Non-alphanumeric tokens that survive filtering: English clitics produced by
# word_tokenize plus a small set of punctuation marks.
_KEPT_TOKENS = frozenset([
    "'s", "'t", "'re", "'ve", "'ll", "'d", "n't",
    '!', '?', '.', ',', ';', ':', '--', '—',
])


def advanced_tokenize(text):
    """
    Splits `text` into sentences, then into lower-cased word tokens.

    A token is kept when it is purely alphanumeric or when it is one of the
    clitics / punctuation marks in `_KEPT_TOKENS`; every other token is
    dropped. The tokens of all sentences are returned as one flat list, in
    order of appearance.

    Any exception raised by the NLTK tokenizers propagates unchanged to the
    caller.

    :param      text:  The passage to tokenize
    :type       text:  str
    :returns:   The filtered tokens
    :rtype:     list of str
    """
    all_tokens = []
    for sentence in sent_tokenize(text):
        all_tokens.extend(
            token for token in word_tokenize(sentence.lower())
            if token.isalnum() or token in _KEPT_TOKENS
        )
    return all_tokens


In [4]:
# Labels for each text
start_year = 1700   # first year of the decade with class index 0
nb_decades = 20     # number of consecutive decades used as classes

def int_to_decades(t):
    """Returns the first year of the decade whose class index is `t`."""
    years_after_start = 10 * t
    return start_year + years_after_start

In [5]:
from google.colab import drive
# Mount Google Drive; every path below points inside the mounted folder.
drive.mount('/content/drive')

# The Drive prefix is spelled out once so that moving the project only
# requires editing these two lines.
project_dir = '/content/drive/MyDrive/Assignment 4/Project'
dataset_dir = project_dir + '/dd2417-dating-historical-texts'

glove_path = project_dir + '/glove.6B.50d.txt'
classifier_train_path = dataset_dir + '/snippets_train_dataset.csv'
classifier_test_path = dataset_dir + '/snippets_test_dataset.csv'

Mounted at /content/drive


In [6]:
# Run this cell to init mappings from characters to IDs and back again,
# from words to IDs and back again, and from labels to IDs and back again

UNKNOWN = '<unk>'  # Unknown char or unknown word
PADDING_WORD = '<pad>'

# Character vocabulary. The unknown marker comes first, so it receives ID 0;
# it is followed by the typographic apostrophe and the em dash, then ASCII
# punctuation, letters and digits.
CHARS = [UNKNOWN, '’', '—']
CHARS += list(string.punctuation)
CHARS += list(string.ascii_letters)
CHARS += list(string.digits)
char_to_id = dict(zip(CHARS, range(len(CHARS))))

# Decade labels as strings, indexed by class ID
id_to_label = [str(int_to_decades(decade)) for decade in range(nb_decades)]

def label_to_id(label):
    """Converts a year label (string or int) into the class ID of its decade."""
    years_after_start = int(label) - start_year
    return years_after_start // 10

print(CHARS)
print(id_to_label)

['<unk>', '’', '—', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['1700', '1710', '1720', '1730', '1740', '1750', '1760', '1770', '1780', '1790', '1800', '1810', '1820', '1830', '1840', '1850', '1860', '1870', '1880', '1890']


In [7]:
def load_glove_embeddings(embedding_file,
                          padding_word=PADDING_WORD,
                          unknown_word=UNKNOWN):
    """
    Reads Glove embeddings from a file.

    Each non-blank line is expected to hold a word followed by the
    whitespace-separated components of its vector. The padding word always
    gets ID 0 (an all-zeros vector) and the unknown word ID 1 (an all-minus-
    ones vector); the words read from the file are numbered from 2 upwards in
    file order, so `embeddings[word_to_id[w]]` is always the vector of `w`.

    Lines that are blank or carry no vector are skipped. A word that was
    already registered (a duplicate line, or a file entry spelled like the
    padding/unknown word) is skipped as well: registering it again would not
    grow the mapping and would shift every later ID away from its vector.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).

    :raises ValueError: if the file contains no usable embedding line
    """
    reserved_vectors = 2  # <PAD> and <UNK> occupy IDs 0 and 1
    word_to_id = {padding_word: 0, unknown_word: 1}
    file_vectors = []
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            if len(data) < 2:
                continue  # blank line, or a word without any vector
            word = data[0]
            if word in word_to_id:
                continue  # keep the first occurrence; see docstring
            word_to_id[word] = reserved_vectors + len(file_vectors)
            file_vectors.append([float(x) for x in data[1:]])

    if not file_vectors:
        raise ValueError("No embeddings found in {}".format(embedding_file))
    D = len(file_vectors[0])

    # <PAD> has an embedding of just zeros, <UNK> of just minus-ones
    embeddings = [[0]*D, [-1]*D] + file_vectors

    return D, word_to_id, embeddings

In [57]:
class HistDataset(Dataset):
    """
    A class loading a dataset from a CSV file to be used as an input
    to PyTorch DataLoader.

    The file is read as tab-separated values with (at least) 2 fields per
    row: a chunk of text and its label. Rows with fewer than 2 fields are
    skipped, and so are rows whose text yields no tokens.

    A datapoint is a pair (tokens, label): the token list produced by
    `advanced_tokenize` for the lower-cased text, and the class ID produced
    by `label_to_id` for the label field. Tokens are stored as strings; this
    class does not convert them to IDs.
    """

    def __init__(self, filename, word_to_id):
        """
        :param      filename:    Path to the tab-separated dataset file
        :param      word_to_id:  Accepted for interface compatibility; it is
                                 not used by this class
        """
        self.passages = []
        self.labels = []

        # The context manager closes the file as soon as all rows are read.
        # Undecodable bytes are dropped rather than aborting the load.
        with codecs.open(filename, encoding='utf8', errors='ignore') as csv_file:
            reader = csv.reader(csv_file, delimiter='\t')
            for row in reader:
                if len(row) < 2:
                    continue
                text = row[0].strip().lower()
                # Tokenize using NLTK
                tokens = advanced_tokenize(text)
                if len(tokens) > 0:  # Only add non-empty passages
                    self.passages.append(tokens)
                    self.labels.append(label_to_id(row[1].strip()))

    def __len__(self):
        """Returns the number of stored passages."""
        return len(self.passages)

    def __getitem__(self, idx):
        """Returns the (tokens, label) pair stored at position `idx`."""
        return self.passages[idx], self.labels[idx]

In [9]:
# Load the GloVe vectors; later cells use `dim`, `word_to_id` and `embeddings`.
dim, word_to_id, embeddings = load_glove_embeddings(glove_path)

# Sanity check: show the vector stored for one common word
print("The embedding for the word 'good' looks like this:")
good_id = word_to_id['good']
print(embeddings[good_id])
print()

The embedding for the word 'good' looks like this:
[-0.35586, 0.5213, -0.6107, -0.30131, 0.94862, -0.31539, -0.59831, 0.12188, -0.031943, 0.55695, -0.10621, 0.63399, -0.4734, -0.075895, 0.38247, 0.081569, 0.82214, 0.2222, -0.0083764, -0.7662, -0.56253, 0.61759, 0.20292, -0.048598, 0.87815, -1.6549, -0.77418, 0.15435, 0.94823, -0.3952, 3.7302, 0.82855, -0.14104, 0.016395, 0.21115, -0.036085, -0.15587, 0.86583, 0.26309, -0.71015, -0.03677, 0.0018282, -0.17704, 0.27032, 0.11026, 0.14133, -0.057322, 0.27207, 0.31305, 0.92771]



In [27]:
# Load a dataset and show its first two data points.
# NOTE(review): the variable name and the messages say "test", but the file
# read here is classifier_train_path — confirm which split is intended.
test_set = HistDataset(classifier_train_path, word_to_id)
print("There are", len(test_set), "data points in the test set")

for dp in (0, 1):
    sentence, label = test_set[dp]
    print("Data point", dp, "is", sentence)
    print("It has the label", label)

There are 8200 data points in the test set
Data point 0 is ['the', 'professor', 'laid', 'his', 'hand', 'tenderly', 'on', 'his', 'shoulder', 'as', 'he', 'spoke', ':', '--', 'ah', ',', 'my', 'child', ',', 'i', 'will', 'be', 'plain', '.', 'do', 'you', 'not', 'see', 'how', ',', 'of', 'late', ',', 'this', 'monster', 'has', 'been', 'creeping', 'into', 'knowledge', 'experimentally', '.', 'how', 'he', 'has', 'been', 'making', 'use', 'of', 'the', 'zoöphagous', 'patient', 'to', 'effect', 'his', 'entry', 'into', 'friend', 'john', 's', 'home', ';', 'for', 'your', 'vampire', ',', 'though', 'in', 'all', 'afterwards', 'he', 'can', 'come', 'when', 'and', 'how', 'he', 'will', ',', 'must', 'at', 'the', 'first', 'make', 'entry', 'only', 'when', 'asked', 'thereto', 'by', 'an', 'inmate', '.', 'but', 'these', 'are', 'not', 'his', 'most', 'important', 'experiments', '.', 'do', 'we', 'not', 'see', 'how', 'at', 'the', 'first', 'all', 'these', 'so', 'great', 'boxes', 'were']
It has the label 19
Data point 1 is 

In [11]:
# Run this cell. The function below will take care of the case of
# sequences of unequal lengths.

def pad_sequence(batch, padding_word=PADDING_WORD):
    """
    Collates a batch of (sequence, label) pairs.

    Every sequence is right-padded with `padding_word` up to the length of
    the longest sequence in the batch.

    Returns the padded sequences (a list of lists) and the labels (a tuple).
    """
    sequences, labels = zip(*batch)
    longest = max(len(seq) for seq in sequences)
    padded = [
        list(seq) + [padding_word] * (longest - len(seq))
        for seq in sequences
    ]
    return padded, labels

In [12]:
# Demonstration: three (sequence, label) pairs of different lengths are padded
# to the length of the longest one.
x = [
    ([1, 2, 3], 1750),
    ([4, 5], 1890),
    ([6, 7, 8, 9], 1900),
]
pad_sequence(x)

([[1, 2, 3, '<pad>'], [4, 5, '<pad>', '<pad>'], [6, 7, 8, 9]],
 (1750, 1890, 1900))

In [79]:
class HistClassifier(nn.Module):
    """
    Classifies a tokenized passage into one of `class_size` classes.

    Each word is represented by its frozen pre-trained embedding,
    concatenated (when `char_emb_size > 0`) with the final hidden state(s)
    of a character-level LSTM run over the word's characters. A two-layer
    word-level LSTM reads the sentence, and its last-layer final hidden
    state(s) are passed through a small feed-forward classifier.
    """

    def __init__(self, word_embeddings,  # Pre-trained word embeddings
                 char_to_id,             # Mapping from chars to ids
                 word_to_id,             # Mapping from words to ids
                 char_emb_size=16,
                 char_hidden_size=25,    # Hidden size of the character-level biRNN
                 word_hidden_size=50,   # Hidden size of the word-level biRNN
                 class_size=nb_decades,
                 padding_word=PADDING_WORD,
                 unknown_word=UNKNOWN,
                 char_bidirectional=True,
                 word_bidirectional=True,
                 device='cpu'
            ):

        super(HistClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.char_to_id = char_to_id
        self.word_to_id = word_to_id
        self.char_emb_size = char_emb_size
        self.char_hidden_size = char_hidden_size
        self.word_hidden_size = word_hidden_size
        self.class_size = class_size
        self.char_bidirectional = char_bidirectional
        self.word_bidirectional = word_bidirectional
        # Kept for callers that read it; forward() places its inputs on the
        # device of the model weights instead, so the two cannot disagree.
        self.device = device

        # ID used for out-of-vocabulary words (1 when the mapping has no
        # entry for the unknown word)
        self.unknown_word_id = word_to_id.get(unknown_word, 1)

        self.dropout_emb = nn.Dropout(0.3)

        # Create an embedding tensor for the words and import the Glove
        # embeddings. The embeddings are frozen (i.e., they will not be
        # updated during training).
        embedding_matrix = torch.tensor(word_embeddings, dtype=torch.float)
        self.word_emb_size = embedding_matrix.shape[1]
        self.word_emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

        # Create an embedding tensor for character embeddings. These embeddings
        # are learnt from scratch (i.e., they are not frozen).
        if self.char_emb_size > 0:
            self.char_emb = nn.Embedding(len(char_to_id), char_emb_size)
            self.char_lstm = nn.LSTM(
                self.char_emb_size,
                self.char_hidden_size,
                bidirectional=char_bidirectional,
                batch_first=True
            )
        else:
            # No character branch: it contributes no features to the word LSTM
            self.char_hidden_size = 0

        # Calculate multipliers for concatenation
        char_multiplier = 2 if self.char_bidirectional else 1
        word_multiplier = 2 if self.word_bidirectional else 1

        self.word_lstm = nn.LSTM(
            self.word_emb_size + char_multiplier*self.char_hidden_size, # input size
            self.word_hidden_size,
            bidirectional=self.word_bidirectional,
            batch_first=True,
            num_layers=2,
            dropout=0.3
        )

        # The classifier input is the concatenation of the final hidden
        # states of all directions of the last word-LSTM layer.
        self.classifier = nn.Sequential(
            nn.Linear(word_multiplier * word_hidden_size, word_hidden_size),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(word_hidden_size, word_hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(word_hidden_size // 2, self.class_size)
        )

    def _char_features(self, x, batch_size, len_sentence, device):
        """
        Runs the character-level LSTM over every word of the batch.

        Words are right-padded with character ID 0 up to the longest word in
        the batch. NOTE: the LSTM is run over the padded positions as well
        (the sequences are not packed).

        Returns a tensor of shape (batch_size, len_sentence, F), where F is
        `char_hidden_size` times the number of directions.
        """
        char_ids = [[[self.char_to_id.get(char, 0) for char in word] for word in sentence] for sentence in x]

        # At least one character position is needed, even if every word is empty
        max_word_len = max((len(word) for sentence in char_ids for word in sentence), default=0)
        max_word_len = max(max_word_len, 1)

        # Pad character sequences
        padded_ids = [[word + [0] * (max_word_len - len(word)) for word in sentence] for sentence in char_ids]
        char_tensor = torch.tensor(padded_ids, device=device)

        E_c = self.char_emb(char_tensor)
        E_c = E_c.reshape(batch_size * len_sentence, max_word_len, self.char_emb_size)

        _, (hidden_char, _) = self.char_lstm(E_c)

        if self.char_bidirectional:
            # hidden_char shape: (2, batch_size*len_sentence, char_hidden_size)
            H_c = torch.cat([hidden_char[0], hidden_char[1]], dim=1)
        else:
            # hidden_char shape: (1, batch_size*len_sentence, char_hidden_size)
            H_c = hidden_char[0]

        # Reshape back to (batch_size, len_sentence, char_features)
        char_feature_size = 2 * self.char_hidden_size if self.char_bidirectional else self.char_hidden_size
        return H_c.reshape(batch_size, len_sentence, char_feature_size)

    def forward(self, x):
        """
        Performs a forward pass of a historical text classifier
        Takes as input a 2D list `x` of dimensionality (B, T),
        where B is the batch size;
              T is the max sentence length in the batch (shorter sentences
              are already padded with the special token <PAD>)

        Every sentence must contain at least one non-padding word, since the
        word sequences are packed by their number of non-padding words.

        Returns logits, i.e. the output of the last linear layer before applying softmax.

        :param      x:    A batch of sentences
        :type       x:    list of lists of strings
        """
        # Place the inputs wherever the weights currently live
        device = self.word_emb.weight.device

        # Number of real (non-padding) words per sentence; pack_padded_sequence
        # takes the lengths on the CPU, so they never need to leave it
        lengths = torch.tensor([sum(1 for w in sent if w != self.padding_word) for sent in x])

        # Convert words to IDs
        word_ids = [[self.word_to_id.get(word.lower(), self.unknown_word_id) for word in sentence] for sentence in x]
        word_tensor = torch.tensor(word_ids, device=device)

        # Dataset parameters
        batch_size, len_sentence = word_tensor.shape

        # Embedding layer
        E_w = self.dropout_emb(self.word_emb(word_tensor))

        # Combine word and character features; the character branch only
        # exists when char_emb_size > 0
        if self.char_emb_size > 0:
            H_c = self._char_features(x, batch_size, len_sentence, device)
            E_final = torch.cat([E_w, H_c], dim=2)
        else:
            E_final = E_w
        packed_input = nn.utils.rnn.pack_padded_sequence(E_final, lengths, batch_first=True, enforce_sorted=False)

        # Word-level LSTM
        _, (last_hidden, _) = self.word_lstm(packed_input)

        if self.word_bidirectional:
            # For bidirectional LSTM with 2 layers, last_hidden shape: (4, batch_size, word_hidden_size)
            # Take the last layer's hidden states from both directions
            output = torch.cat([last_hidden[-2], last_hidden[-1]], dim=1)
        else:
            # For unidirectional LSTM with 2 layers, take the last layer's hidden state
            output = last_hidden[-1]

        # Classification
        return self.classifier(output)






In [81]:
# ================== Hyper-parameters ==================== #

learning_rate = 0.001
epochs = 100

# ======================= Training ======================= #

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print( "Running on", device )

# `word_to_id` and `embeddings` come from the cell that calls
# load_glove_embeddings; they are reused here rather than reloaded.
training_set = HistDataset(classifier_train_path, word_to_id)
training_loader = DataLoader(
    training_set,
    batch_size=64,
    shuffle=True,
    collate_fn=pad_sequence,  # pads every batch to its longest passage
)

classifier = HistClassifier(embeddings, char_to_id, word_to_id, device=device)
classifier = classifier.to(device)

Running on cuda


In [82]:
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Put the model in training mode before looping over the data
classifier.train()
for epoch in range(epochs):
    # Sum (not mean) of the batch losses seen during this epoch
    total_loss = 0
    progress = tqdm(training_loader, desc="Epoch {}".format(epoch + 1))
    for x, y in progress:
        targets = torch.tensor(y).to(device).reshape(-1,)

        optimizer.zero_grad()
        logits = classifier(x)
        loss = criterion(logits, targets)
        loss.backward()

        # Cap the gradient norm at 5 before the parameter update
        clip_grad_norm_(classifier.parameters(), 5)
        optimizer.step()

        total_loss += loss.item()
    print(f"Loss: {total_loss:.4f}")

Epoch 1: 100%|██████████| 129/129 [00:15<00:00,  8.46it/s]


Loss: 386.2519


Epoch 2: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 385.0543


Epoch 3: 100%|██████████| 129/129 [00:15<00:00,  8.57it/s]


Loss: 384.9170


Epoch 4: 100%|██████████| 129/129 [00:15<00:00,  8.27it/s]


Loss: 384.0536


Epoch 5: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 382.3711


Epoch 6: 100%|██████████| 129/129 [00:15<00:00,  8.54it/s]


Loss: 378.2294


Epoch 7: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 374.6010


Epoch 8: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 371.2416


Epoch 9: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 366.9270


Epoch 10: 100%|██████████| 129/129 [00:15<00:00,  8.49it/s]


Loss: 364.3125


Epoch 11: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 359.5554


Epoch 12: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 355.8918


Epoch 13: 100%|██████████| 129/129 [00:15<00:00,  8.25it/s]


Loss: 353.7472


Epoch 14: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 348.1777


Epoch 15: 100%|██████████| 129/129 [00:15<00:00,  8.55it/s]


Loss: 345.4296


Epoch 16: 100%|██████████| 129/129 [00:15<00:00,  8.22it/s]


Loss: 342.0737


Epoch 17: 100%|██████████| 129/129 [00:15<00:00,  8.49it/s]


Loss: 338.4531


Epoch 18: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 336.3966


Epoch 19: 100%|██████████| 129/129 [00:15<00:00,  8.47it/s]


Loss: 332.1182


Epoch 20: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 331.3256


Epoch 21: 100%|██████████| 129/129 [00:15<00:00,  8.22it/s]


Loss: 327.5611


Epoch 22: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 325.4431


Epoch 23: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 321.3551


Epoch 24: 100%|██████████| 129/129 [00:15<00:00,  8.53it/s]


Loss: 318.6644


Epoch 25: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 317.0971


Epoch 26: 100%|██████████| 129/129 [00:15<00:00,  8.22it/s]


Loss: 312.9547


Epoch 27: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 310.8117


Epoch 28: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 307.1870


Epoch 29: 100%|██████████| 129/129 [00:15<00:00,  8.47it/s]


Loss: 306.2448


Epoch 30: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 305.1751


Epoch 31: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 299.6174


Epoch 32: 100%|██████████| 129/129 [00:15<00:00,  8.49it/s]


Loss: 297.4863


Epoch 33: 100%|██████████| 129/129 [00:15<00:00,  8.23it/s]


Loss: 297.0125


Epoch 34: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 294.7358


Epoch 35: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 290.0112


Epoch 36: 100%|██████████| 129/129 [00:15<00:00,  8.24it/s]


Loss: 285.9037


Epoch 37: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 284.0209


Epoch 38: 100%|██████████| 129/129 [00:15<00:00,  8.48it/s]


Loss: 282.7312


Epoch 39: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 279.8940


Epoch 40: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 279.1130


Epoch 41: 100%|██████████| 129/129 [00:15<00:00,  8.24it/s]


Loss: 277.2122


Epoch 42: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 273.0657


Epoch 43: 100%|██████████| 129/129 [00:15<00:00,  8.23it/s]


Loss: 272.4010


Epoch 44: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 268.7295


Epoch 45: 100%|██████████| 129/129 [00:15<00:00,  8.48it/s]


Loss: 267.7940


Epoch 46: 100%|██████████| 129/129 [00:15<00:00,  8.18it/s]


Loss: 263.7736


Epoch 47: 100%|██████████| 129/129 [00:15<00:00,  8.54it/s]


Loss: 261.7142


Epoch 48: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 260.9500


Epoch 49: 100%|██████████| 129/129 [00:15<00:00,  8.48it/s]


Loss: 259.2012


Epoch 50: 100%|██████████| 129/129 [00:15<00:00,  8.25it/s]


Loss: 257.6468


Epoch 51: 100%|██████████| 129/129 [00:15<00:00,  8.56it/s]


Loss: 257.3569


Epoch 52: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 252.7549


Epoch 53: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 251.9414


Epoch 54: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 248.8932


Epoch 55: 100%|██████████| 129/129 [00:15<00:00,  8.23it/s]


Loss: 245.1404


Epoch 56: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 245.1230


Epoch 57: 100%|██████████| 129/129 [00:15<00:00,  8.53it/s]


Loss: 243.7072


Epoch 58: 100%|██████████| 129/129 [00:15<00:00,  8.22it/s]


Loss: 242.4360


Epoch 59: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 239.6443


Epoch 60: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 237.5423


Epoch 61: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 235.1635


Epoch 62: 100%|██████████| 129/129 [00:15<00:00,  8.22it/s]


Loss: 233.0146


Epoch 63: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 230.7299


Epoch 64: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 229.8914


Epoch 65: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 229.5525


Epoch 66: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 227.6091


Epoch 67: 100%|██████████| 129/129 [00:15<00:00,  8.16it/s]


Loss: 226.9641


Epoch 68: 100%|██████████| 129/129 [00:15<00:00,  8.52it/s]


Loss: 224.5944


Epoch 69: 100%|██████████| 129/129 [00:15<00:00,  8.48it/s]


Loss: 222.5255


Epoch 70: 100%|██████████| 129/129 [00:15<00:00,  8.23it/s]


Loss: 221.5047


Epoch 71: 100%|██████████| 129/129 [00:15<00:00,  8.55it/s]


Loss: 219.0213


Epoch 72: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 217.2341


Epoch 73: 100%|██████████| 129/129 [00:15<00:00,  8.54it/s]


Loss: 218.7350


Epoch 74: 100%|██████████| 129/129 [00:15<00:00,  8.55it/s]


Loss: 216.3097


Epoch 75: 100%|██████████| 129/129 [00:15<00:00,  8.24it/s]


Loss: 214.0941


Epoch 76: 100%|██████████| 129/129 [00:15<00:00,  8.54it/s]


Loss: 210.8741


Epoch 77: 100%|██████████| 129/129 [00:15<00:00,  8.26it/s]


Loss: 211.9816


Epoch 78: 100%|██████████| 129/129 [00:15<00:00,  8.57it/s]


Loss: 211.8122


Epoch 79: 100%|██████████| 129/129 [00:15<00:00,  8.57it/s]


Loss: 205.2301


Epoch 80: 100%|██████████| 129/129 [00:15<00:00,  8.27it/s]


Loss: 205.9664


Epoch 81: 100%|██████████| 129/129 [00:15<00:00,  8.56it/s]


Loss: 203.4029


Epoch 82: 100%|██████████| 129/129 [00:15<00:00,  8.31it/s]


Loss: 204.1882


Epoch 83: 100%|██████████| 129/129 [00:14<00:00,  8.62it/s]


Loss: 202.0442


Epoch 84: 100%|██████████| 129/129 [00:15<00:00,  8.57it/s]


Loss: 201.0340


Epoch 85: 100%|██████████| 129/129 [00:15<00:00,  8.21it/s]


Loss: 201.2506


Epoch 86: 100%|██████████| 129/129 [00:15<00:00,  8.59it/s]


Loss: 198.7230


Epoch 87: 100%|██████████| 129/129 [00:15<00:00,  8.28it/s]


Loss: 195.7750


Epoch 88: 100%|██████████| 129/129 [00:15<00:00,  8.56it/s]


Loss: 192.5218


Epoch 89: 100%|██████████| 129/129 [00:15<00:00,  8.57it/s]


Loss: 196.2773


Epoch 90: 100%|██████████| 129/129 [00:15<00:00,  8.24it/s]


Loss: 191.4359


Epoch 91: 100%|██████████| 129/129 [00:15<00:00,  8.53it/s]


Loss: 193.9970


Epoch 92: 100%|██████████| 129/129 [00:15<00:00,  8.51it/s]


Loss: 189.3530


Epoch 93: 100%|██████████| 129/129 [00:15<00:00,  8.25it/s]


Loss: 189.5010


Epoch 94: 100%|██████████| 129/129 [00:15<00:00,  8.59it/s]


Loss: 187.7943


Epoch 95: 100%|██████████| 129/129 [00:15<00:00,  8.28it/s]


Loss: 188.0697


Epoch 96: 100%|██████████| 129/129 [00:15<00:00,  8.49it/s]


Loss: 184.0514


Epoch 97: 100%|██████████| 129/129 [00:15<00:00,  8.23it/s]


Loss: 183.5609


Epoch 98: 100%|██████████| 129/129 [00:15<00:00,  8.50it/s]


Loss: 182.1858


Epoch 99: 100%|██████████| 129/129 [00:15<00:00,  8.19it/s]


Loss: 181.6327


Epoch 100: 100%|██████████| 129/129 [00:15<00:00,  8.55it/s]

Loss: 182.0184





In [83]:
# Save the model: only the weights (state_dict) are written, not the module
save_path = "/content/drive/MyDrive/Assignment 4/Project/dd2417-dating-historical-texts/hist_classifier-100-epochs.pt"
trained_weights = classifier.state_dict()
torch.save(trained_weights, save_path)

In [84]:
# Re-create the model with the same constructor arguments, then restore the
# saved weights into it.
classifier2 = HistClassifier(embeddings, char_to_id, word_to_id, device=device).to(device)
load_path = "/content/drive/MyDrive/Assignment 4/Project/dd2417-dating-historical-texts/hist_classifier-100-epochs.pt"
# The file holds a plain state_dict, so restrict unpickling to tensors and
# primitive containers instead of allowing arbitrary objects.
classifier2.load_state_dict(torch.load(load_path, map_location=device, weights_only=True))

<All keys matched successfully>

In [87]:
# Evaluation
import numpy as np
!pip install terminaltables
from terminaltables import AsciiTable

# Switch to evaluation mode for inference
classifier.eval()

# confusion_matrix[true_id, predicted_id] counts the data points
confusion_matrix = np.zeros((nb_decades,nb_decades))
# NOTE(review): this reads the TRAINING file, so the figures computed from it
# are training-set figures. Use classifier_test_path for a held-out
# evaluation (assuming that file carries labels — TODO confirm).
test_set = HistDataset(classifier_train_path, word_to_id)
# No gradients are needed for prediction; skipping them saves memory and time
with torch.no_grad():
    for x, y in test_set:
        # The model expects a batch, so wrap the single passage in a list
        pred = torch.argmax(classifier([x]), dim=-1).item()
        confusion_matrix[y,pred]+=1



In [88]:
# Header row: an empty corner cell followed by one column per predicted class
header = ['']
header.extend(f'Predicted {i}' for i in range(nb_decades))

# One row per true class: its name followed by the counts for each prediction
table = [header]
table.extend(
    [f'Real {i}'] + list(confusion_matrix[i,:])
    for i in range(nb_decades)
)

t = AsciiTable(table)
print(t.table)

# Accuracy = correctly classified points (the diagonal) over all points
correct = np.trace(confusion_matrix)
total = np.sum(confusion_matrix)
accuracy = round(correct / total, 4)

print("Accuracy: {}".format(accuracy))

+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
|         | Predicted 0 | Predicted 1 | Predicted 2 | Predicted 3 | Predicted 4 | Predicted 5 | Predicted 6 | Predicted 7 | Predicted 8 | Predicted 9 | Predicted 10 | Predicted 11 | Predicted 12 | Predicted 13 | Predicted 14 | Predicted 15 | Predicted 16 | Predicted 17 | Predicted 18 | Predicted 19 |
+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Real 0  | 189.0       | 83.0        | 8.0         | 84.0        | 8.0         | 2.0         

300.0
