# Experiment with regression

In [10]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-tiny" checkpoint; local_files_only=True
# restricts the lookup to files already available locally.
tokeniser = BertTokenizer.from_pretrained(
    "bert-tiny",
    local_files_only=True,
)

In [11]:
# Your answer here
from typing import Any, Dict, Optional, Tuple

import pandas as pd
import torch
from torch import Tensor
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer

# Create a dataset class that inherits from torch.utils.data.Dataset


class EssayDataset(Dataset):
    """
    Map-style dataset that tokenises essays on demand.

    Every row of the wrapped dataframe must provide a "content" column with the
    essay text. When ``target_label`` is set, that column is read as an integer
    score and returned (shifted down by one) under the "score" key; when it is
    ``None`` (e.g. unlabeled test data) only the model inputs are returned.

    TODO: Generalise to handle validation split as well.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        tokeniser: "BertTokenizer",
        target_label=None,
        max_length: int = 512,
    ):
        """
        Args:
            df (pd.DataFrame): Dataframe of the train or test dataset.
            tokeniser (BertTokenizer): Pre-trained tokenizer; it is called with
                ``truncation``, ``padding``, ``max_length`` and
                ``return_tensors`` keyword arguments.
            target_label (str, optional): Name of the dataframe column holding
                the scores. ``None`` means the data is unlabeled.
            max_length (int, optional): Length every essay is truncated/padded
                to by the tokenizer. Defaults to 512.
        """
        self.data = df
        self.tokenizer = tokeniser
        self.max_length = max_length
        self.target_label = target_label

    def __len__(self) -> int:
        """
        Returns the number of samples (rows of the dataframe).
        """
        return len(self.data)

    def __getitem__(self, index: int) -> Dict[str, Tensor]:
        """
        Tokenises the essay at position ``index`` and returns the model inputs,
        plus the label when ``self.target_label`` is not None.

        Returns:
            data_dict (Dict[str, Tensor]): "input_ids" and "attention_mask" for
                the essay, and "score" (score minus one) for labeled data.
        """
        # Fetch the row once; both the text and the label are read from it.
        row = self.data.iloc[index]
        essay = str(row["content"])

        encoding = self.tokenizer(
            essay,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer is expected to return a batch of one; drop only that
        # leading dimension. A bare squeeze() would also collapse the sequence
        # dimension when max_length == 1.
        data_dict = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }

        # Labeled data: read the configured label column (not a hard-coded
        # one). The output key stays "score" because the training and
        # evaluation loops look it up under that name.
        if self.target_label is not None:
            # convert to 0-based index
            data_dict["score"] = torch.tensor(int(row[self.target_label]) - 1)

        return data_dict


def split_essay_data(
    full_dataset, seed: Optional[int] = 42
) -> Tuple[Dataset, Dataset]:
    """
    Split essay data 80/20 to check performance on a held-out validation set.

    Args:
        full_dataset (Dataset): The full dataset that we intend to split.
        seed (int, optional): Seed for the shuffling that decides which samples
            go where. With a fixed seed the same samples land in the validation
            set on every run, so a model trained in one session is never
            evaluated on essays it was trained on in another. Pass ``None`` to
            draw the split from torch's global random state instead.

    Returns:
        train_dataset (Dataset): 80% of the samples (rounded down).
        val_dataset (Dataset): The remaining samples.

    """
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    lengths = [train_size, val_size]

    if seed is None:
        train_dataset, val_dataset = random_split(full_dataset, lengths)
    else:
        # A dedicated generator keeps the split reproducible without
        # reseeding the global RNG.
        generator = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(
            full_dataset, lengths, generator=generator
        )
    return train_dataset, val_dataset

In [12]:
# Read the cleaned essays and wrap them in a dataset that yields tokenised
# inputs together with the "score" label.
train_df = pd.read_csv("data/cleaned_dataset.csv")
full_dataset = EssayDataset(df=train_df, tokeniser=tokeniser, target_label="score")

# Hold out part of the data for validation.
train_dataset, val_dataset = split_essay_data(full_dataset)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [24]:
from torch import Tensor
from torch.nn import CrossEntropyLoss, Module
from torch.utils.data import DataLoader
from torch.optim import Optimizer, AdamW
from numpy import mean
import torch.nn.functional as F


def get_accuracy(preds, labels):
    """
    Fraction of predictions that exactly equal their labels.

    Args:
        preds (Tensor): Predicted values, compared elementwise with ``labels``.
        labels (Tensor): Ground-truth values; its first dimension is used as
            the number of samples.

    Returns:
        float: Number of matching elements divided by ``labels.size(0)``.
    """
    n_matches = torch.eq(preds, labels).sum().item()
    n_samples = labels.shape[0]
    return n_matches / n_samples


def get_rmse(outputs, labels):
    """
    Root-mean-squared error between predictions and targets.

    Args:
        outputs (Tensor): Predicted values.
        labels (Tensor): Target values, passed with ``outputs`` to
            ``F.mse_loss``.

    Returns:
        float: Square root of the mean squared error, as a Python number.
    """
    return torch.sqrt(F.mse_loss(outputs, labels)).item()


def evaluate_model(
    model: Module, val_loader: DataLoader, loss_fn: Module
) -> Dict[str, float]:
    """
    Compute the average loss and RMSE of a regression model over a loader.

    Batches are handled the same way as in the training loop: labels are cast
    to float and the trailing dimension of the logits is squeezed away, so
    predictions and targets line up one-to-one.

    Args:
        model (Module): Called as ``model(input_ids, attention_mask=...)``;
            the result must expose a ``logits`` attribute.
        val_loader (DataLoader): Yields dict batches with "input_ids",
            "attention_mask" and "score" entries.
        loss_fn (Module): Loss applied to ``(predictions, labels)``,
            e.g. ``MSELoss``.

    Returns:
        Dict[str, float]: Mean of the per-batch values for "loss" and "rmse".
        Note that the model is left in evaluation mode.
    """
    # Evaluation mode: layers such as dropout behave deterministically and no
    # statistics are taken from the validation data.
    model.eval()

    metrics = {"loss": [], "rmse": []}

    # no need to store gradients
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            # The dataset yields integer scores; a regression loss needs floats.
            labels = batch["score"].float()

            outputs = model(input_ids, attention_mask=attention_mask)
            # Drop the trailing dimension of the logits so they match the
            # labels; otherwise the loss would broadcast predictions against
            # labels.
            preds = outputs.logits.squeeze(-1)

            loss = loss_fn(preds, labels)

            metrics["loss"].append(loss.item())
            metrics["rmse"].append(get_rmse(preds, labels))

    return {key: mean(val) for key, val in metrics.items()}


# NOTE: leftover idea from the classification variant (BCELoss with one-hot labels); the regression experiment below uses MSELoss.
def train_model(
    model: Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    optimiser: Optimizer,
    loss_fn: Module,
    epochs: int = 1,
) -> Module:
    """
    Train a regression model on essay text with scores as targets.

    Args:
        model (Module): Called as ``model(input_ids, attention_mask=...)``;
            the result must expose a ``logits`` attribute.
        train_loader (DataLoader): Yields dict batches with "input_ids",
            "attention_mask" and "score" entries.
        val_loader (DataLoader): (UNUSED) - this would be used here if we
            validated after every epoch.
        optimiser (Optimizer): Optimiser already bound to the model parameters.
        loss_fn (Module): Loss applied to ``(predictions, labels)`` where both
            are float tensors of matching shape, e.g. ``MSELoss``.
        epochs (int): Number of passes over ``train_loader``.

    Returns:
        The same model instance, fine-tuned in place.

    """
    # Training mode: enables training-only behaviour of layers (e.g. dropout).
    model.train()

    metrics = {"loss": [], "rmse": []}

    for epoch in range(epochs):
        # `step` rather than `iter`, so the builtin is not shadowed.
        for step, batch in enumerate(train_loader):
            # zero the gradients (otherwise gradients accumulate)
            optimiser.zero_grad()

            input_ids = batch["input_ids"]
            # the attention mask tells us which tokens are real words and which are padding.
            attention_mask = batch["attention_mask"]
            # The dataset yields integer scores; a regression loss needs floats.
            labels = batch["score"].float()

            outputs = model(input_ids, attention_mask=attention_mask)

            # Squeeze the trailing dimension of the logits so predictions line
            # up one-to-one with the labels.
            preds = outputs.logits.squeeze(-1)
            loss = loss_fn(preds, labels)

            # compute gradients
            loss.backward()

            # nudge parameters in direction of steepest descent
            optimiser.step()

            # loss is tensor - get value using item()
            metrics["loss"].append(loss.item())
            # The metric needs no gradient tracking; detach before computing it.
            metrics["rmse"].append(get_rmse(preds.detach(), labels))

            # Report every 10 steps (10 * batch_size samples per print).
            if step % 10 == 0:
                print("step: ", step, ", batch loss: ", loss.item())

    # just output train averages for now - validation metrics are computed separately.
    print("Train averages ", {key: mean(val) for key, val in metrics.items()})

    # return fine-tuned model
    return model

In [20]:
from transformers import BertModel, BertConfig, BertForSequenceClassification
import torch.nn as nn


# class BertForEssayRegression(nn.Module):
#     def __init__(self, pretrained_model_name="bert-tiny"):
#         super().__init__()
#         self.bert = BertModel.from_pretrained(
#             pretrained_model_name, local_files_only=True
#         )
#         self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         # Take the pooled output (CLS token)
#         cls_output = outputs.pooler_output  # shape: (batch_size, hidden_dim)
#         score = self.regressor(cls_output).squeeze(-1)  # shape: (batch_size,)
#         return score

# Build the sequence-classification model from the local "bert-tiny"
# checkpoint, configured with a single output and the regression problem type.
model = BertForSequenceClassification.from_pretrained(
    "bert-tiny",
    local_files_only=True,
    problem_type="regression",  # explicitly tells it it's regression
    num_labels=1,
)

Some weights of the model checkpoint at bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from

In [25]:
# Fine-tune the model as a regressor: a single output per essay, trained with
# mean-squared error against the scores produced by the dataset.
loss_fn = nn.MSELoss()
optimiser: Optimizer = AdamW(model.parameters(), lr=1e-4)

model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimiser=optimiser,
    loss_fn=loss_fn,
    epochs=1,
)

# Persist only the weights (state dict) of the fine-tuned model.
# Alternative: model.save_pretrained("bert_tiny_finetuned")
torch.save(model.state_dict(), "model-trained-regression.pth")

step:  0 , batch loss:  4.303873062133789
step:  10 , batch loss:  3.8187777996063232
step:  20 , batch loss:  2.064824342727661
step:  30 , batch loss:  1.2988307476043701
step:  40 , batch loss:  0.8945863842964172
step:  50 , batch loss:  1.1415860652923584
step:  60 , batch loss:  0.7110911011695862
step:  70 , batch loss:  0.6081348061561584
step:  80 , batch loss:  0.5337756872177124
Train averages  {'loss': 1.5377885425506637, 'rmse': 1.1720104085844616}


In [None]:
# Validation accuracy of the regressor after rounding each prediction to the
# nearest integer score.
device = "cpu"
with torch.no_grad():
    model.eval()
    all_preds = []
    all_labels = []

    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["score"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model built above returns an output object rather than a tensor:
        # take its logits. A model that returns the predictions directly as a
        # tensor is still accepted.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        # Flatten to one prediction per essay. Comparing (N, 1) predictions
        # with (N,) labels would broadcast to an (N, N) matrix and give a
        # meaningless accuracy.
        preds = torch.round(logits.reshape(-1))  # <--- ROUND TO NEAREST INTEGER

        all_preds.append(preds.cpu())
        all_labels.append(labels.reshape(-1).cpu())

    all_preds = torch.cat(all_preds)
    all_labels = torch.cat(all_labels)

    accuracy = (all_preds == all_labels).float().mean().item()
    print(f"Rounded accuracy on val set: {accuracy:.4f}")

Rounded accuracy on val set: 0.6366
