# **1**

# 1.1: A Baseline Neural Netowrk Tagger

In [2]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torch.nn as nn
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np

In [3]:
def parse_file(file):
    """
    Parse the TSV file for the POS tagging task.
    Args:
    file (file object): The file object to be read.
    Returns:
    list: A list of sentences, where each sentence is represented as a list of (word, tag) tuples.
    """
    sentences = []
    sentence = []
    for line in file:
        line = line.strip()
        if line:
            word, pos = line.split('\t')
            sentence.append((word, pos))
        else:
            sentences.append(sentence)
            sentence = []

    if sentence:
        sentences.append(sentence)
    return sentences

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

def load_data(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        sentences = parse_file(file)
    return sentences

In [5]:
train_sentences = load_data("twpos-train.tsv")
dev_sentences = load_data("twpos-dev.tsv")
devtest_sentences = load_data("twpos-devtest.tsv")

In [6]:
def prepare_data():
    train_data = train_sentences

    word_to_ix = {"UUUNKKK": 0, '<s>':1, '</s>':2}
    tag_to_ix = {}

    # Iterate over each sentence (which is a list of (word, tag) tuples)
    for sentence in train_data:
        for word, tag in sentence:
            if word not in word_to_ix:
                # Add the word to the dictionary if it's not already there
                word_to_ix[word] = len(word_to_ix)

            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    return word_to_ix, tag_to_ix

# Usage
word_to_ix, tag_to_ix = prepare_data()
target_size = len(tag_to_ix)

In [7]:
def transfer_sentence(data, word_to_ix, tag_to_ix, w):
  concat_data = []
  tags = []
  for sentence in data:
      for i in range(len(sentence)):
          current = []
          for j in range(i - w, i + w + 1):
              if j < 0:
                word = '</s>'
              elif j >= len(sentence):
                word = '</s>'
              else:
                word = sentence[j][0]
              if word not in word_to_ix:
                word = 'UUUNKKK'
              current.append(word_to_ix[word])
              if i == j:
                tag = tag_to_ix[sentence[j][1]]
          concat_data.append(current)
          tags.append(tag)
  return np.array(concat_data), np.array(tags)

In [8]:
class POSTagger:
  def __init__(self, w, vocab_size):
    self.w = w
    self.dim_e = 50
    self.dim_h = 128
    self.dim_s = len(tag_to_ix)
    self.vocab_size = len(word_to_ix)
    self.batch_size = 32
    self.epochs = 20

In [9]:
class NN(nn.Module):
    def __init__(self, POSTagger):
        super().__init__()
        self.POSTagger = POSTagger
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)

        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        self.init_weights()

    def init_weights(self):
        initrange = 0.01
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)
        #print(embeds.shape)
        hidden_out = self.hidden(embeds)
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, word_to_ix, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()


        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
        print('\nTesting Result')
        devtest_loss = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss

The experiment demonstrated a clear performance improvement when the context window size was increased from w=0 to w=1, highlighting the model's enhanced ability to leverage additional contextual information for more accurate predictions. This suggests that a larger context window is crucial for capturing necessary context and nuances in language processing tasks.

In [10]:
if __name__ == "__main__":
    window_sizes = [0, 1]

    for w in window_sizes:
        print(f"\nTraining with context window size w={w}")
        POST = POSTagger(w=w, vocab_size = len(word_to_ix))
        model = NN(POST)
        model.load_data(train_sentences, dev_sentences, devtest_sentences, word_to_ix, tag_to_ix)
        model.fit()


Training with context window size w=0
Epoch 0, Loss: 2.5841
Loss: 1.9774, Accuracy: 35.4491%
Epoch 1, Loss: 1.5853
Loss: 1.3387, Accuracy: 66.4592%
Epoch 2, Loss: 1.0291
Loss: 1.0984, Accuracy: 74.0095%
Epoch 3, Loss: 0.7097
Loss: 1.1080, Accuracy: 75.2541%
Epoch 4, Loss: 0.5257
Loss: 1.1066, Accuracy: 76.0216%
Epoch 5, Loss: 0.4369
Loss: 1.1484, Accuracy: 76.9965%
Epoch 6, Loss: 0.3771
Loss: 1.1062, Accuracy: 77.2869%
Epoch 7, Loss: 0.3444
Loss: 1.0896, Accuracy: 77.0794%
Epoch 8, Loss: 0.3209
Loss: 1.2372, Accuracy: 76.9342%
Epoch 9, Loss: 0.2997
Loss: 1.2117, Accuracy: 77.1624%
Epoch 10, Loss: 0.2864
Loss: 1.2560, Accuracy: 76.1253%
Epoch 11, Loss: 0.2770
Loss: 1.2357, Accuracy: 77.4321%
Epoch 12, Loss: 0.2673
Loss: 1.1994, Accuracy: 77.4528%
Early stopping due to no improvement.

Testing Result
Loss: 1.0739, Accuracy: 79.4352%

Training with context window size w=1
Epoch 0, Loss: 2.4746
Loss: 1.4940, Accuracy: 55.4242%
Epoch 1, Loss: 1.0119
Loss: 0.9146, Accuracy: 73.7191%
Epoch 2

# 1.2 Feature Engineering

Upon reviewing misclassified center words from section 1.1, it was observed that inaccuracies often involved words beginning with capital letters, containing special characters, or including numbers. To address this, new features were designed to precisely identify such traits within the words. Incorporating these nuanced features enhanced the model's understanding of the data, leading to a noticeable improvement in accuracy compared to the results in section 1.1.

In [11]:
import string
def extract_features(word):
    features = [
        float(any(char.isdigit() for char in word)),  # contains digits
        float(word.isdigit()),
        float(any(char in string.punctuation for char in word)),  # contains punctuation
        word[0] in string.punctuation,
        len(word)
    ]
    return features

In [12]:
def transfer_sentence_features(sentences, w):
  feature_matrix = []
  for sentence in sentences:
      for i in range(len(sentence)):
          for j in range(i - w, i + w + 1):
                if i == j:
                  features = extract_features(sentence[j][0])
                  feature_matrix.append(features)
  return np.array(feature_matrix)

In [13]:
class NN_feature(nn.Module):
    def __init__(self, POSTagger):
        super().__init__()
        self.POSTagger = POSTagger
        self.num_features = 5
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e  + self.num_features, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)
        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)
        self.init_weights()

    def init_weights(self):
        initrange = 0.01
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        inputs, features = inputs
        embeds = self.embeddings(inputs).view(-1, ((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e))
        combined = torch.cat((embeds, features), 1)
        hidden_out = self.hidden(combined)
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, word_to_ix, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.train_features = transfer_sentence_features(train_sentence, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.dev_features = transfer_sentence_features(dev_sentence, self.POSTagger.w)

        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.devtest_features = transfer_sentence_features(devtest_sentence, self.POSTagger.w)

        # Convert the feature data from NumPy arrays to tensors
        self.train_features = torch.tensor(self.train_features, dtype=torch.float)
        self.dev_features = torch.tensor(self.dev_features, dtype=torch.float)
        self.devtest_features = torch.tensor(self.devtest_features, dtype=torch.float)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label, features):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]
        feature_s = features[idx]
        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            feature_batch = feature_s[i:i + self.POSTagger.batch_size].clone().detach()

            loss = self.run_grad((x, feature_batch), y)
            train_loss += loss.item()

        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label, features):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            feature_batch = features.clone().detach()
            loss = self.get_loss((x, feature_batch), y)
            test_loss += loss.item()

            log_probs = self.forward((x,feature_batch))
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

        test_loss /= n
        accuracy = (correct / n) * 100
        print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
        return test_loss

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            train_loss = self.one_epoch(i, self.train_data, self.train_label, self.train_features)
            dev_loss = self.test(self.dev_data, self.dev_label, self.dev_features)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
        print('\nTesting Result')
        devtest_loss = self.test(self.devtest_data, self.devtest_label, self.devtest_features)
        return train_loss, dev_loss, devtest_loss

In [14]:
if __name__ == "__main__":
    window_sizes = [0, 1]

    for w in window_sizes:
        print(f"\nTraining with context window size w={w}")
        POST = POSTagger(w=w, vocab_size = len(word_to_ix))
        model = NN_feature(POST)
        model.load_data(train_sentences, dev_sentences, devtest_sentences, word_to_ix, tag_to_ix)
        model.fit()


Training with context window size w=0
Epoch 0, Loss: 1.9440
Loss: 1.4169, Accuracy: 51.2342%
Epoch 1, Loss: 1.2304
Loss: 1.0155, Accuracy: 66.8741%
Epoch 2, Loss: 0.9468
Loss: 0.8914, Accuracy: 74.5281%
Epoch 3, Loss: 0.7549
Loss: 0.9036, Accuracy: 72.6613%
Epoch 4, Loss: 0.6064
Loss: 0.9844, Accuracy: 76.6646%
Epoch 5, Loss: 0.5284
Loss: 0.9027, Accuracy: 77.9091%
Epoch 6, Loss: 0.4545
Loss: 1.0288, Accuracy: 75.4615%
Epoch 7, Loss: 0.3969
Loss: 0.8957, Accuracy: 79.1122%
Early stopping due to no improvement.

Testing Result
Loss: 0.8610, Accuracy: 79.3706%

Training with context window size w=1
Epoch 0, Loss: 1.9276
Loss: 1.2637, Accuracy: 60.4854%
Epoch 1, Loss: 1.1189
Loss: 1.0785, Accuracy: 67.2682%
Epoch 2, Loss: 0.8164
Loss: 1.1181, Accuracy: 69.8818%
Epoch 3, Loss: 0.6465
Loss: 0.7750, Accuracy: 78.5522%
Epoch 4, Loss: 0.5083
Loss: 0.7405, Accuracy: 80.5227%
Epoch 5, Loss: 0.4136
Loss: 0.7404, Accuracy: 80.6472%
Epoch 6, Loss: 0.3367
Loss: 0.7324, Accuracy: 80.7716%
Epoch 7, L

# 1.3 Pretrained Embeddings

1) Experiment with updating the pretrained embeddings for w = 0 and w = 1

* Setting up the train an embedding for '&lt;s&gt;' by using the embedding for '&lt;s&gt;'

In [15]:
import pandas as pd
def load_embeddings(embedding_file):
    embeddings = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
      lines = f.readlines()
      for line in lines:
        words = line.split()
        embeddings[words[0]] = words[1:]
        if words[0] == '<\s>':
          embeddings['<s>'] = words[1:]
    return pd.DataFrame.from_dict(embeddings, orient = 'index').astype('float32')

In [16]:
embedding_df = load_embeddings('/content/twitter-embeddings.txt')

In [17]:
twitter_to_index = {word: i for i, word in enumerate(embedding_df.index)}

In [18]:
class POSTagger_embed:
  def __init__(self, w, vocab_size):
    self.w = w
    self.dim_e = 50
    self.dim_h = 128
    self.dim_s = len(tag_to_ix)
    self.vocab_size = len(twitter_to_index)
    self.batch_size = 32
    self.epochs = 20

In [19]:
class NN_embed(nn.Module):
    def __init__(self, POSTagger, pertrained_embeddings, twitter_to_index):
        super().__init__()
        self.POSTagger = POSTagger
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)

        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings, twitter_to_index)

    def init_embeddings(self, pretrained_embeddings, twitter_to_index):
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)
        #print(embeds.shape)
        hidden_out = self.hidden(embeds)
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, twitter_to_index, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()
        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss, accuracy

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss, dev_accuracy = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
            print('------------------------------------')
        print('\nTesting Result')
        devtest_loss, devtest_accuracy = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss, dev_accuracy, devtest_accuracy

The experiment demonstrated accuracy improvements for both w=0 and w=1 settings when using pre-trained embeddings, as opposed to randomly-initialized word embeddings. This underscores the superiority of adopting pre-trained embeddings for enhanced model performance.

In [None]:
if __name__ == "__main__":
    window_sizes = [0, 1]
    pretrained_embeddings = torch.tensor(embedding_df.values)
    for w in window_sizes:
        print(f"\nTraining with context window size w={w}")
        POST = POSTagger_embed(w=w, vocab_size = len(embedding_df))
        model = NN_embed(POST, pretrained_embeddings, twitter_to_index)
        model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)
        model.fit()


Training with context window size w=0
Epoch 0
Epoch 0, Loss: 1.0297
Loss: 0.6024, Accuracy: 83.0741%
------------------------------------
Epoch 1
Epoch 1, Loss: 0.5149
Loss: 0.5584, Accuracy: 83.2607%
------------------------------------
Epoch 2
Epoch 2, Loss: 0.4381
Loss: 0.5671, Accuracy: 82.9081%
------------------------------------
Epoch 3
Epoch 3, Loss: 0.4112
Loss: 0.5521, Accuracy: 83.9038%
------------------------------------
Epoch 4
Epoch 4, Loss: 0.3913
Loss: 0.5605, Accuracy: 82.8666%
------------------------------------
Epoch 5
Epoch 5, Loss: 0.3755
Loss: 0.5554, Accuracy: 82.0369%
------------------------------------
Epoch 6
Epoch 6, Loss: 0.3674
Loss: 0.5815, Accuracy: 82.9911%
------------------------------------
Epoch 7
Epoch 7, Loss: 0.3611
Loss: 0.5542, Accuracy: 84.0075%
------------------------------------
Epoch 8
Epoch 8, Loss: 0.3547
Loss: 0.5628, Accuracy: 83.3230%
Early stopping due to no improvement.

Testing Result
Loss: 0.5226, Accuracy: 83.9405%

Training w

2) Comparing the result for updating the pretrained word embeddings during training and keeping them fixed

In [None]:
class NN_embed_freeze(nn.Module):
    def __init__(self, POSTagger, pretrained_embeddings, twitter_to_index, freeze_embeddings):
        super().__init__()
        self.POSTagger = POSTagger
        self.freeze_embeddings = freeze_embeddings
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)

        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings)

    def init_embeddings(self, pretrained_embeddings):
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
        self.embeddings.weight.requires_grad = not self.freeze_embeddings
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)

        hidden_out = self.hidden(embeds)
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, twitter_to_index, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()
        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break

            print('------------------------------------------')

        print('\nTesting Result')
        devtest_loss = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss

During training with a context window of w=1, updating pre-trained word embeddings has shown distinct benefits, evidenced by a modest yet noteworthy improvement of approximately 2% in accuracy compared to keeping them fixed. This enhancement underscores the effectiveness of allowing the embeddings to evolve with training.

In [None]:
if __name__ == "__main__":
    freeze = [True, False]
    pretrained_embeddings = torch.tensor(embedding_df.values)
    for f in freeze:
        print(f"\nParameter Freeze = {f}")
        POST = POSTagger_embed(w = 1, vocab_size = len(embedding_df))
        model = NN_embed_freeze(POST, pretrained_embeddings, twitter_to_index, f)
        model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)
        model.fit()


Parameter Freeze = True
Epoch 0
Epoch 0, Loss: 1.0185
Loss: 0.6198, Accuracy: 82.5762%
------------------------------------------
Epoch 1
Epoch 1, Loss: 0.5551
Loss: 0.5629, Accuracy: 84.4638%
------------------------------------------
Epoch 2
Epoch 2, Loss: 0.5009
Loss: 0.5363, Accuracy: 84.6712%
------------------------------------------
Epoch 3
Epoch 3, Loss: 0.4689
Loss: 0.5310, Accuracy: 84.6505%
------------------------------------------
Epoch 4
Epoch 4, Loss: 0.4453
Loss: 0.4982, Accuracy: 85.3350%
------------------------------------------
Epoch 5
Epoch 5, Loss: 0.4238
Loss: 0.5020, Accuracy: 86.2892%
------------------------------------------
Epoch 6
Epoch 6, Loss: 0.4062
Loss: 0.5099, Accuracy: 84.8164%
------------------------------------------
Epoch 7
Epoch 7, Loss: 0.3886
Loss: 0.4973, Accuracy: 85.7084%
------------------------------------------
Epoch 8
Epoch 8, Loss: 0.3689
Loss: 0.4916, Accuracy: 86.2684%
------------------------------------------
Epoch 9
Epoch 9, Loss

3) Feature Engineering for pretrained embeddings model

In [None]:
class NN_embed_feature(nn.Module):
    def __init__(self, POSTagger, pertrained_embeddings, twitter_to_index):
        super().__init__()
        self.POSTagger = POSTagger
        self.num_features = 5
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e  + self.num_features, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)
        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings, twitter_to_index)

    def init_embeddings(self, pretrained_embeddings, twitter_to_index):
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)
    def forward(self, inputs):
        inputs, features = inputs
        embeds = self.embeddings(inputs).view(-1, ((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e))
        combined = torch.cat((embeds, features), 1)
        hidden_out = self.hidden(combined)
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, word_to_ix, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.train_features = transfer_sentence_features(train_sentence, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.dev_features = transfer_sentence_features(dev_sentence, self.POSTagger.w)

        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, word_to_ix, tag_to_ix, self.POSTagger.w)
        self.devtest_features = transfer_sentence_features(devtest_sentence, self.POSTagger.w)

        # Convert the feature data from NumPy arrays to tensors
        self.train_features = torch.tensor(self.train_features, dtype=torch.float)
        self.dev_features = torch.tensor(self.dev_features, dtype=torch.float)
        self.devtest_features = torch.tensor(self.devtest_features, dtype=torch.float)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label, features):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]
        feature_s = features[idx]
        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            feature_batch = feature_s[i:i + self.POSTagger.batch_size].clone().detach()
            loss = self.run_grad((x, feature_batch), y)
            train_loss += loss.item()

        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label, features):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            feature_batch = features.clone().detach()
            loss = self.get_loss((x, feature_batch), y)
            test_loss += loss.item()

            log_probs = self.forward((x,feature_batch))
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

        test_loss /= n
        accuracy = (correct / n) * 100
        print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
        return test_loss

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label, self.train_features)
            dev_loss = self.test(self.dev_data, self.dev_label, self.dev_features)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
            print('------------------------')
        print('\nTesting Result')
        devtest_loss = self.test(self.devtest_data, self.devtest_label, self.devtest_features)
        return train_loss, dev_loss, devtest_loss

Combining the custom features from Section 1.2 with pre-trained embeddings reveals that these features still offer assistance. Although the improvement is subtle, it highlights that features can contribute additional context that pretraining might not capture, thereby providing a slight edge in the model's performance.

In [None]:
if __name__ == "__main__":
    window_sizes = [0, 1]
    pretrained_embeddings = torch.tensor(embedding_df.values)
    for w in window_sizes:
        print(f"\nTraining with context window size w={w}")
        POST = POSTagger_embed(w=w, vocab_size = len(embedding_df))
        model = NN_embed_feature(POST, pretrained_embeddings, twitter_to_index)
        model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)
        model.fit()


Training with context window size w=0
Epoch 0
Epoch 0, Loss: 1.2516
Loss: 0.6053, Accuracy: 83.0533%
------------------------
Epoch 1
Epoch 1, Loss: 0.6132
Loss: 0.6372, Accuracy: 82.0577%
------------------------
Epoch 2
Epoch 2, Loss: 0.5005
Loss: 0.5374, Accuracy: 85.2520%
------------------------
Epoch 3
Epoch 3, Loss: 0.4427
Loss: 0.6016, Accuracy: 83.0533%
------------------------
Epoch 4
Epoch 4, Loss: 0.3948
Loss: 0.5673, Accuracy: 84.0904%
------------------------
Epoch 5
Epoch 5, Loss: 0.3778
Loss: 0.5583, Accuracy: 83.7586%
------------------------
Epoch 6
Epoch 6, Loss: 0.3566
Loss: 0.6181, Accuracy: 83.5096%
------------------------
Epoch 7
Epoch 7, Loss: 0.3525
Loss: 0.5672, Accuracy: 84.3808%
Early stopping due to no improvement.

Testing Result
Loss: 0.5316, Accuracy: 84.4794%

Training with context window size w=1
Epoch 0
Epoch 0, Loss: 0.7329
Loss: 0.5072, Accuracy: 86.6833%
------------------------
Epoch 1
Epoch 1, Loss: 0.3052
Loss: 0.5249, Accuracy: 86.1854%
-----

# 1.4 Architecture Engineering

**I got the best test accuracy when window size = 1 and no hidden layer with tanh function.**

1) Compare the use of 0, 1, and 2 hidden layers

In [None]:
class NN_layers(nn.Module):
    def __init__(self, POSTagger, pertrained_embeddings, twitter_to_index, hidden_dim1=None, hidden_dim2=None):
        super().__init__()
        self.POSTagger = POSTagger
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)
        self.hidden1 = None
        self.hidden2 = None

        input_dim = (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e

        if hidden_dim1:
            self.hidden1 = nn.Linear(input_dim, hidden_dim1)
            output_dim = hidden_dim1

            if hidden_dim2:
                self.hidden2 = nn.Linear(hidden_dim1, hidden_dim2)
                output_dim = hidden_dim2
        else:
            output_dim = input_dim
        self.output = nn.Linear(output_dim, self.POSTagger.dim_s)
        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings, twitter_to_index)

    def init_embeddings(self, pretrained_embeddings, twitter_to_index):
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
        if self.hidden1 is not None:
          self.hidden1.weight.data.uniform_(-initrange, initrange)
        if self.hidden2 is not None:
          self.hidden2.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)
        if self.hidden1 is not None:
          hidden_out = self.hidden1(embeds)
          if self.hidden2 is not None:
            hidden_out = self.hidden2(hidden_out)
        else:
          hidden_out = embeds
        hidden_activated = torch.tanh(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, twitter_to_index, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()

        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss, accuracy

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss, dev_accuracy = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
            print('---------------------------------------------')
        print('\n Testing Result')
        devtest_loss, devtest_accuracy = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss, dev_accuracy, devtest_accuracy

In [None]:
configs = [
    # 0 hidden layers
   {'hidden_layers': 0, 'hidden_dim1': None, 'hidden_dim2': None},

    # 1 hidden layer, two setups
    {'hidden_layers': 1, 'hidden_dim1': 256, 'hidden_dim2': None},
    {'hidden_layers': 1, 'hidden_dim1': 512, 'hidden_dim2': None},

    # 2 hidden layers, two setups
    {'hidden_layers': 2, 'hidden_dim1': 256, 'hidden_dim2': 128},
    {'hidden_layers': 2, 'hidden_dim1': 512, 'hidden_dim2': 256}
]

# Adding window sizes to test
window_sizes = [0, 1]

# This will be used to collect all results
all_results = []

for window in window_sizes:
    results = []  # This collects results for the current window size

    for config in configs:
        print(f"Training configuration: {config}, Window size: {window}")

        # Initialize the model with the current configuration
        pretrained_embeddings = torch.tensor(embedding_df.values)
        POST = POSTagger_embed(w=window, vocab_size=len(embedding_df))
        model = NN_layers(POST, pretrained_embeddings, twitter_to_index,
                          hidden_dim1=config['hidden_dim1'],
                          hidden_dim2=config['hidden_dim2'])
        model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)

        result = model.fit()

        if isinstance(result, tuple):
            result = {
                'train_loss': result[0],
                'dev_loss': result[1],
                'devtest_loss': result[2],
                'dev_accuracy': result[3],
                'devtest_accuracy': result[4]
            }

        result['window_size'] = window
        result.update(config)
        results.append(result)
    results_df = pd.DataFrame(results)
    print(f"Results for window size {window}:\n", results_df)
    all_results.extend(results)
all_results_df = pd.DataFrame(all_results)
print("Complete results:\n", all_results_df)

Training configuration: {'hidden_layers': 0, 'hidden_dim1': None, 'hidden_dim2': None}, Window size: 0
Epoch 0
Epoch 0, Loss: 0.9956
Loss: 0.6288, Accuracy: 82.4933%
---------------------------------------------
Epoch 1
Epoch 1, Loss: 0.5289
Loss: 0.5461, Accuracy: 83.7793%
---------------------------------------------
Epoch 2
Epoch 2, Loss: 0.4469
Loss: 0.5349, Accuracy: 83.9038%
---------------------------------------------
Epoch 3
Epoch 3, Loss: 0.4000
Loss: 0.5340, Accuracy: 84.4638%
---------------------------------------------
Epoch 4
Epoch 4, Loss: 0.3757
Loss: 0.5239, Accuracy: 84.0697%
---------------------------------------------
Epoch 5
Epoch 5, Loss: 0.3556
Loss: 0.5308, Accuracy: 83.8415%
---------------------------------------------
Epoch 6
Epoch 6, Loss: 0.3457
Loss: 0.5314, Accuracy: 83.9660%
---------------------------------------------
Epoch 7
Epoch 7, Loss: 0.3396
Loss: 0.5453, Accuracy: 84.2771%
---------------------------------------------
Epoch 8
Epoch 8, Loss: 0.

Both for the window sizes 0 and 1, the test accuracy decreased as the model increased the number of layers and neurons in the layers.

In [None]:
all_results_df

Unnamed: 0,train_loss,dev_loss,devtest_loss,dev_accuracy,devtest_accuracy,window_size,hidden_layers,hidden_dim1,hidden_dim2
0,0.330638,0.547994,0.500398,83.488903,83.811166,0,0,,
1,0.355798,0.572516,0.526975,83.322962,83.746497,0,1,256.0,
2,0.341285,0.560836,0.511553,83.281477,83.681828,0,1,512.0,
3,0.333919,0.681067,0.629749,81.953951,82.884242,0,2,256.0,128.0
4,0.381044,0.615315,0.550612,81.352417,82.798017,0,2,512.0,256.0
5,0.16339,0.45063,0.408758,87.699647,88.445786,1,0,,
6,0.146835,0.585115,0.537446,86.455092,87.001509,1,1,256.0,
7,0.148623,0.614594,0.551444,86.206181,86.893727,1,1,512.0,
8,0.196667,0.687148,0.648792,84.546775,85.708127,1,2,256.0,128.0
9,0.179426,0.687404,0.660889,84.422319,85.471007,1,2,512.0,256.0


2) Experiment with different nonlinearities(Identity, tanh, ReLU, Sigmoid) with number of hidden layers = 1, hidden layer of width = 128

In [None]:
class NN_activation(nn.Module):
    def __init__(self, POSTagger, pertrained_embeddings, twitter_to_index, activation_function):
        super().__init__()
        self.POSTagger = POSTagger
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e, self.POSTagger.dim_h)
        self.output = nn.Linear(self.POSTagger.dim_h, self.POSTagger.dim_s)

        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings)

        # Set the activation function
        if activation_function == 'relu':
            self.activation = nn.ReLU()
        elif activation_function == 'sigmoid':
            self.activation = nn.Sigmoid()
        elif activation_function == 'identity':
            self.activation = lambda x: x
        else:
            self.activation = nn.Tanh()  # Default is tanh

    def init_embeddings(self, pretrained_embeddings):
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
        self.hidden.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)
        hidden_out = self.hidden(embeds)
        hidden_activated = self.activation(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, twitter_to_index, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()

        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss, accuracy

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss, dev_accuracy = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
            print('------------------------------------')
        print('\nTesting Result')
        devtest_loss, devtest_accuracy = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss, dev_accuracy, devtest_accuracy

In [None]:
configs = []

# Define configurations based on window sizes and activation functions
window_sizes = [0, 1]
activation_functions = ['identity', 'tanh', 'relu', 'sigmoid']

for w in window_sizes:
    for activation in activation_functions:
        configs.append({
            'window_size': w,
            'activation_function': activation
        })

results = []

for config in configs:
    print(f"\nTraining with context window size w={config['window_size']} and {config['activation_function']} activation function")

    pretrained_embeddings = torch.tensor(embedding_df.values)

    POST = POSTagger_embed(w=config['window_size'], vocab_size=len(embedding_df))
    model = NN_activation(POST, pretrained_embeddings, twitter_to_index, activation_function=config['activation_function'])

    model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)

    result = model.fit()

    if isinstance(result, tuple):
        result = {
            'train_loss': result[0],
            'dev_loss': result[1],
            'devtest_loss': result[2],
            'dev_accuracy': result[3],
            'devtest_accuracy': result[4]
        }

    result.update(config)

    results.append(result)

results_df = pd.DataFrame(results)

print(results_df)



Training with context window size w=0 and identity activation function
Epoch 0
Epoch 0, Loss: 1.0206
Loss: 0.6005, Accuracy: 82.7837%
------------------------------------
Epoch 1
Epoch 1, Loss: 0.5117
Loss: 0.6419, Accuracy: 82.8459%
------------------------------------
Epoch 2
Epoch 2, Loss: 0.4417
Loss: 0.5462, Accuracy: 84.2771%
------------------------------------
Epoch 3
Epoch 3, Loss: 0.4072
Loss: 0.5897, Accuracy: 82.0784%
------------------------------------
Epoch 4
Epoch 4, Loss: 0.3897
Loss: 0.6058, Accuracy: 83.9245%
------------------------------------
Epoch 5
Epoch 5, Loss: 0.3784
Loss: 0.6099, Accuracy: 80.8961%
------------------------------------
Epoch 6
Epoch 6, Loss: 0.3696
Loss: 0.5690, Accuracy: 82.6177%
------------------------------------
Epoch 7
Epoch 7, Loss: 0.3658
Loss: 0.5713, Accuracy: 82.6177%
Early stopping due to no improvement.

Testing Result
Loss: 0.5394, Accuracy: 82.7980%

Training with context window size w=0 and tanh activation function
Epoch 0
Ep

Overly, ReLU or Sigmoid produce the highest test accuracy, but it doesn't differ that much.

In [None]:
results_df

Unnamed: 0,train_loss,dev_loss,devtest_loss,dev_accuracy,devtest_accuracy,window_size,activation_function
0,0.365771,0.571287,0.53936,82.617714,82.798017,0,identity
1,0.352529,0.570157,0.515562,83.405932,84.220737,0,tanh
2,0.329943,0.584074,0.540747,83.737814,83.423152,0,relu
3,0.32341,0.601395,0.549057,83.654843,84.350075,0,sigmoid
4,0.15998,0.599824,0.557334,86.185439,86.829058,1,identity
5,0.124933,0.629154,0.586767,86.372122,86.527269,1,tanh
6,0.122173,0.591303,0.547208,86.309894,87.087734,1,relu
7,0.161495,0.573327,0.51684,86.538063,87.777538,1,sigmoid


3) Experiment with w = 2 and compare the results to w = 0 and 1

In [None]:
configs = [{'window_size': w} for w in [0, 1, 2]]

results = []

for config in configs:
    print(f"\nTraining with context window size w={config['window_size']}")
    pretrained_embeddings = torch.tensor(embedding_df.values)
    POST = POSTagger_embed(w=config['window_size'], vocab_size=len(embedding_df))
    model = NN_embed(POST, pretrained_embeddings, twitter_to_index)
    model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)
    result = model.fit()

    if isinstance(result, tuple):
        result = {
            'train_loss': result[0],
            'dev_loss': result[1],
            'devtest_loss': result[2],
            'dev_accuracy': result[3],
            'devtest_accuracy': result[4]
        }

    result.update(config)

    results.append(result)

results_df = pd.DataFrame(results)

results_df



Training with context window size w=0
Epoch 0
Epoch 0, Loss: 1.0211
Loss: 0.5987, Accuracy: 83.0948%
------------------------------------
Epoch 1
Epoch 1, Loss: 0.5131
Loss: 0.5488, Accuracy: 82.9496%
------------------------------------
Epoch 2
Epoch 2, Loss: 0.4440
Loss: 0.5785, Accuracy: 83.2192%
------------------------------------
Epoch 3
Epoch 3, Loss: 0.4084
Loss: 0.5443, Accuracy: 83.3437%
------------------------------------
Epoch 4
Epoch 4, Loss: 0.3885
Loss: 0.6181, Accuracy: 81.9747%
------------------------------------
Epoch 5
Epoch 5, Loss: 0.3764
Loss: 0.5634, Accuracy: 84.0904%
------------------------------------
Epoch 6
Epoch 6, Loss: 0.3690
Loss: 0.5886, Accuracy: 83.7793%
------------------------------------
Epoch 7
Epoch 7, Loss: 0.3647
Loss: 0.5692, Accuracy: 83.4267%
------------------------------------
Epoch 8
Epoch 8, Loss: 0.3579
Loss: 0.5508, Accuracy: 83.7586%
Early stopping due to no improvement.

Testing Result
Loss: 0.5141, Accuracy: 84.1345%

Training w

Unnamed: 0,train_loss,dev_loss,devtest_loss,dev_accuracy,devtest_accuracy,window_size
0,0.357889,0.550782,0.514058,83.758556,84.134512,0
1,0.147316,0.565908,0.528085,86.558805,87.238629,1
2,0.092491,0.676776,0.611927,85.563161,86.69972,2


Window size 1 produces the best test accuracy, and we can also see that the bigger window size does not make the test accuracy higher.


In [None]:
results_df

Unnamed: 0,train_loss,dev_loss,devtest_loss,dev_accuracy,devtest_accuracy,window_size
0,0.357889,0.550782,0.514058,83.758556,84.134512,0
1,0.147316,0.565908,0.528085,86.558805,87.238629,1
2,0.092491,0.676776,0.611927,85.563161,86.69972,2


# 1.5 RNN Taggers

In [None]:
class POSTagger_Additional:
  def __init__(self, w, vocab_size):
    self.w = w
    self.dim_e = 50
    self.dim_h = 512
    self.dim_s = len(tag_to_ix)
    self.vocab_size = len(twitter_to_index)
    self.batch_size = 32
    self.epochs = 20

In [None]:
class NN_Additional(nn.Module):
    def __init__(self, POSTagger, pertrained_embeddings, twitter_to_index, rnn_type, bidirectional):
        super().__init__()
        self.POSTagger = POSTagger
        self.embeddings = nn.Embedding(self.POSTagger.vocab_size, self.POSTagger.dim_e)
        self.hidden = nn.Linear((2 * self.POSTagger.w + 1) * self.POSTagger.dim_e, self.POSTagger.dim_h)
        direction_factor = 2 if bidirectional else 1
        self.output = nn.Linear(self.POSTagger.dim_h * direction_factor, self.POSTagger.dim_s)
        rnn_class = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.rnn = rnn_class(self.POSTagger.dim_e * (2 * self.POSTagger.w + 1), self.POSTagger.dim_h, batch_first=True, bidirectional=bidirectional)
        # Output layer

        self.dropout_first = nn.Dropout(0.5)

        self.loss_function = F.cross_entropy
        self.optimizer = optim.SGD(self.parameters(), lr=0.02)

        # Initialize embeddings with pretrained vectors
        self.init_embeddings(pretrained_embeddings, twitter_to_index)

    def init_embeddings(self, pretrained_embeddings, twitter_to_index):
        """
        Initialize the embedding layer with pretrained embeddings.
        Words not found in the pretrained list will be initialized with the UUUNKKK vector.
        """
        initrange = 0.01
        self.embeddings.weight.data = pretrained_embeddings
     #   self.rnn.weight.data.uniform_(-initrange, initrange)
        self.output.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(-1, (2 * self.POSTagger.w + 1) * self.POSTagger.dim_e)
        embeds = self.dropout_first(embeds)
        hidden_out = self.rnn(embeds)
        hidden_out, _ = self.rnn(embeds)
        hidden_activated = F.relu(hidden_out)
        tag_space = self.output(hidden_activated)
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

    def load_data(self, train_sentence, dev_sentence, devtest_sentence, twitter_to_index, tag_to_ix):
        self.train_data, self.train_label = transfer_sentence(train_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.dev_data, self.dev_label = transfer_sentence(dev_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)
        self.devtest_data, self.devtest_label = transfer_sentence(devtest_sentence, twitter_to_index, tag_to_ix, self.POSTagger.w)

    def get_loss(self, x, y):
        log_prob = self.forward(x)
        loss = self.loss_function(log_prob, y, reduction='sum')
        return loss

    def run_grad(self, x, y):
        loss = self.get_loss(x, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def one_epoch(self, epoch, sentence, label):
        n = sentence.shape[0]
        idx = np.arange(0, n)
        np.random.shuffle(idx)

        sentence_s = sentence[idx]
        label_s = label[idx]

        train_loss = 0
        for i in range(0, n, self.POSTagger.batch_size):
            x = torch.tensor(sentence_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            y = torch.tensor(label_s[i:i + self.POSTagger.batch_size], dtype=torch.long)
            loss = self.run_grad(x, y)
            train_loss += loss.item()
        train_loss /= n
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}')
        return train_loss

    def test(self, sentence, label):
        self.eval()
        n = sentence.shape[0]
        correct = 0

        with torch.no_grad():
            test_loss = 0
            x = torch.tensor(sentence, dtype=torch.long)
            y = torch.tensor(label, dtype=torch.long)
            loss = self.get_loss(x, y)
            test_loss += loss.item()

            log_probs = self.forward(x)
            _, predicted = torch.max(log_probs.data, 1)
            correct += (y == predicted).sum().item()

            test_loss /= n
            accuracy = (correct / n) * 100
            print(f"Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}%")
            return test_loss, accuracy

    def fit(self):
        best_dev_loss = np.inf
        epochs_without_improvement = 0
        patience = 5

        for i in range(self.POSTagger.epochs):
            print('Epoch', i)
            train_loss = self.one_epoch(i, self.train_data, self.train_label)
            dev_loss, dev_accuracy = self.test(self.dev_data, self.dev_label)

            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= patience:
                    print("Early stopping due to no improvement.")
                    break
            print('------------------------------------')
        print('\nTesting Result')
        devtest_loss, devtest_accuracy = self.test(self.devtest_data, self.devtest_label)
        return train_loss, dev_loss, devtest_loss, dev_accuracy, devtest_accuracy

In [None]:
POST = POSTagger_Additional(w=2, vocab_size = len(embedding_df))
def train_and_evaluate_models(configurations):
    results = []

    for config in configurations:
        rnn_type, bidirectional = config['rnn_type'], config['bidirectional']
        print(f"\nTraining model with RNN type: {rnn_type} | Bidirectional: {str(bidirectional)}")

        model = NN_Additional(POST, pretrained_embeddings, twitter_to_index, rnn_type=rnn_type, bidirectional=bidirectional)

        model.load_data(train_sentences, dev_sentences, devtest_sentences, twitter_to_index, tag_to_ix)

        train_loss, dev_loss, devtest_loss, dev_accuracy, devtest_accuracy = model.fit()

        results.append({
            'rnn_type': rnn_type,
            'bidirectional': bidirectional,
            'train_loss': train_loss,
            'dev_loss': dev_loss,
            'devtest_loss': devtest_loss,
            'dev_accuracy': dev_accuracy,
            'devtest_accuracy': devtest_accuracy
        })

    return pd.DataFrame(results)

configurations = [
    {'rnn_type': 'LSTM', 'bidirectional': True},
    {'rnn_type': 'RNN', 'bidirectional': True},
    {'rnn_type': 'GRU', 'bidirectional': True},
    {'rnn_type': 'LSTM', 'bidirectional': False},
    {'rnn_type': 'RNN', 'bidirectional': False},
    {'rnn_type': 'GRU', 'bidirectional': False}
]

results_df = train_and_evaluate_models(configurations)

results_df


Training model with RNN type: LSTM | Bidirectional: True
Epoch 0
Epoch 0, Loss: 2.5168
Loss: 2.4098, Accuracy: 32.2755%
------------------------------------
Epoch 1
Epoch 1, Loss: 2.3367
Loss: 2.3991, Accuracy: 37.8967%
------------------------------------
Epoch 2
Epoch 2, Loss: 2.2384
Loss: 2.4541, Accuracy: 34.3912%
------------------------------------
Epoch 3
Epoch 3, Loss: 2.2067
Loss: 2.3564, Accuracy: 35.3868%
------------------------------------
Epoch 4
Epoch 4, Loss: 2.2297
Loss: 2.4090, Accuracy: 34.1008%
------------------------------------
Epoch 5


KeyboardInterrupt: ignored