In [1]:
import numpy as np
import pandas as pd
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import pandas as pd 
import random 
import warnings
import os 

from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')

In [2]:
# Loading data
data_path = "./data"

train_df = pd.read_csv(os.path.join(data_path, "train_data.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test_data.csv"))

train_texts, train_labels = train_df.text.tolist(), train_df.label.tolist()
test_texts, test_labels = test_df.text.tolist(), test_df.label.tolist()

# Simple whitespace split
tok_train_texts = [[w for w in txt.split() if w != ""] for txt in train_texts]
tok_test_texts = [[w for w in txt.split() if w != ""] for txt in test_texts]

In [None]:
# Courtesy for the code to https://github.com/bentrevett/pytorch-sentiment-analysis?tab=readme-ov-file

# Hyperparameters for dataset
max_length = 200
min_freq = 5
batch_size = 16

special_tokens = ["<unk>", "<pad>"]
mapping = {0: 0, 1: 1, -1: 2}

def tokenize(example, max_length):
    # Simple whitespace-tokenization
    tokens = [word for word in example['text'].split() if word != ""][:max_length]
    length = len(tokens)
    return {"tokens": tokens, "length": length}

def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

# Transform datasets
train_df = pd.DataFrame({"text": train_texts, "label": train_labels}) 
test_df = pd.DataFrame({"text": test_texts, "label": test_labels}) 

# Re-map labels to avoid errors
train_df['label'] = train_df['label'].map(mapping)
test_df['label'] = test_df['label'].map(mapping)

# Obtain HF datasets
train_ds = datasets.Dataset.from_pandas(train_df)
test_ds = datasets.Dataset.from_dict(test_df)

# Map tokenization
train_ds = train_ds.map(
        tokenize, fn_kwargs={"max_length": max_length}
)

test_ds = test_ds.map(
        tokenize, fn_kwargs={"max_length": max_length}
)

vocab = torchtext.vocab.build_vocab_from_iterator(
    train_ds["tokens"],
    min_freq=min_freq,
    specials=special_tokens
)

unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]
vocab.set_default_index(unk_index)

train_ds = train_ds.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_ds = test_ds.map(numericalize_example, fn_kwargs={"vocab": vocab})

train_data = train_ds.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_ds.with_format(type="torch", columns=["ids", "label", "length"])

In [182]:
# Craete nested function to avoid writing a class
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = []
        batch_length = []
        batch_label = []

        for sample in batch:
            batch_ids.append(sample["ids"])
            batch_length.append(sample["length"])
            batch_label.append(sample['label'])
        
        # Padding to the size of largest sequence in the batch
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "length": batch_length, "label": batch_label}
        return batch
    return collate_fn


def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )
    return data_loader       

In [183]:
train_data_loader = get_data_loader(train_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [184]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, bidir, dropout, pad_index,):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_index)
        # Straightforward implementation of an RNN 
        self.rnn = nn.RNN(
            embed_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidir,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidir else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, ids, length):
        embedded = self.dropout(self.embedding(ids))
        
        # Padded/packed sequences for better efficiency
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed_embedded)  
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
        else:
            hidden = self.dropout(hidden[-1])
            
        prediction = self.fc(hidden)

        return prediction

In [185]:
def get_f1_scores(prediction, golden_label):
    predicted_classes = prediction.argmax(dim=-1)
    report = classification_report(golden_label.cpu(), predicted_classes.cpu(), output_dict=True)
    macro_f1 = report['macro avg']['f1-score']
    micro_f1 = report['weighted avg']['f1-score']
    return macro_f1, micro_f1 


def train(dataloader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_macro_f1s = []
    epoch_micro_f1s = []
    
    for batch in tqdm.tqdm(dataloader):
        ids = batch["ids"].to(device)
        length = batch["length"]
        label = batch["label"].to(device)

        prediction = model(ids, length)
        loss = criterion(prediction, label)
        macro_f1, micro_f1 = get_f1_scores(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())
        epoch_macro_f1s.append(macro_f1)
        epoch_micro_f1s.append(micro_f1)
        
    return np.mean(epoch_losses), np.mean(epoch_macro_f1s), np.mean(epoch_micro_f1s)


def evaluate_full(dataloader, model, device):
    model.eval()
    predictions = []
    golden_labels = []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch["ids"].to(device)
            length = batch["length"]
            label = batch["label"].to(device) 

            prediction = model(ids, length)
            predicted_classes = prediction.argmax(dim=-1)
            
            predictions.extend(predicted_classes.tolist())
            golden_labels.extend(label.tolist())

    print(classification_report(golden_labels, predictions))


def evaluate_batched_f1_scores(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_macro_f1s = []
    epoch_micro_f1s = []

    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            ids = batch["ids"].to(device)
            length = batch["length"]
            label = batch["label"].to(device)

            prediction = model(ids, length)
            loss = criterion(prediction, label)
            
            macro_f1, micro_f1 = get_f1_scores(prediction.cpu(), label.cpu())

            epoch_losses.append(loss.item())
            epoch_macro_f1s.append(macro_f1)
            epoch_micro_f1s.append(micro_f1)
    return np.mean(epoch_losses), np.mean(epoch_macro_f1s), np.mean(epoch_micro_f1s)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [186]:
def seed_torch(seed=8):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_torch()

In [187]:
# Model/train hyperparameters
vocab_size = len(vocab)

embed_dim = 300
hidden_dim = 256
output_dim = len(train_data.unique("label"))

n_layers = 1  # rnn layers
bidir = True  # bidirectional
dropout = 0.3
lr = 5e-4

model = RNN(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, bidir, dropout, pad_index)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vectors = torchtext.vocab.GloVe(name="840B", dim=300) # GloVE embeddings
pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
model.embedding.weight.data = pretrained_embedding

model = model.to(device)
criterion = criterion.to(device)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 421,335 trainable parameters


In [188]:
n_epochs = 4
best_macro_f1 = 0.0
best_micro_f1 = 0.0

for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    train_loss, train_macro_f1, train_micro_f1 = train(
        train_data_loader, model, criterion, optimizer, device
    )

    test_loss, test_macro_f1, test_micro_f1 = evaluate_batched_f1_scores(test_data_loader, model, criterion, device)

    # The model with best test metrics will be saved in current directory as "rnn.pt"
    if test_macro_f1 > best_macro_f1 and test_micro_f1 > best_micro_f1:
        best_macro_f1 = test_macro_f1
        best_micro_f1 = test_micro_f1
        torch.save(model.state_dict(), "rnn.pt")
        print("Saving model...")

    print(f"Train loss: {train_loss:.3f}, Train macro f1: {train_macro_f1:.3f}, Train micro f1: {train_micro_f1:.3f}")
    print(f"Test_loss: {test_loss:.3f}, Test macro f1: {test_macro_f1:.3f}, Test micro f1: {test_micro_f1:.3f}", end="\n\n")

Epoch: 0


100%|██████████| 13/13 [00:00<00:00, 57.89it/s]
100%|██████████| 4/4 [00:00<00:00, 107.71it/s]


Saving model...
Train loss: 1.068, Train macro f1: 0.309, Train micro f1: 0.361
Test_loss: 1.016, Test macro f1: 0.501, Test micro f1: 0.532

Epoch: 1


100%|██████████| 13/13 [00:00<00:00, 53.13it/s]
100%|██████████| 4/4 [00:00<00:00, 132.02it/s]


Train loss: 0.885, Train macro f1: 0.556, Train micro f1: 0.614
Test_loss: 0.980, Test macro f1: 0.443, Test micro f1: 0.479

Epoch: 2


100%|██████████| 13/13 [00:00<00:00, 53.98it/s]
100%|██████████| 4/4 [00:00<00:00, 128.77it/s]


Train loss: 0.755, Train macro f1: 0.710, Train micro f1: 0.723
Test_loss: 0.951, Test macro f1: 0.414, Test micro f1: 0.443

Epoch: 3


100%|██████████| 13/13 [00:00<00:00, 58.52it/s]
100%|██████████| 4/4 [00:00<00:00, 127.98it/s]

Saving model...
Train loss: 0.580, Train macro f1: 0.811, Train micro f1: 0.826
Test_loss: 0.874, Test macro f1: 0.506, Test micro f1: 0.574






In [189]:
# Final evaluation on test set 
model = RNN(vocab_size, embed_dim, hidden_dim, output_dim, n_layers, bidir, dropout, pad_index).to(device)
model.load_state_dict(torch.load("rnn.pt"))

evaluate_full(test_data_loader, model, device)

              precision    recall  f1-score   support

           0       0.59      0.85      0.69        20
           1       0.75      0.63      0.69        19
           2       0.57      0.31      0.40        13

    accuracy                           0.63        52
   macro avg       0.64      0.60      0.59        52
weighted avg       0.64      0.63      0.62        52

