<a href="https://colab.research.google.com/github/nosportugal/faast-data-science/blob/main/courses/deep_learning/unit9/solution_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unit 9: Pre-trained Models

By now, you should have the files `labeledTrainData.tsv` and `testData.tsv` in a folder named `ldsa-dl-course-data` in your Google Drive. If you don't, please check the README file of Unit 2 for instructions.

Your challenge in this unit will be to classify the sentiment of IMDb movie reviews using a fine-tuned pre-trained model accessed from the Hugging Face platform.

We recommend using W&B to track your experiments. Sign up with your Google account so that the connection with the Google Colab environment is seamless.

## 1) Setup & Installs

In [None]:
! pip install lightning==2.0.1 wandb datasets transformers --quiet

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the data files are read from there below.
drive.mount("/content/drive")

In [None]:
import wandb

# wandb.login() opens a window so you can log in to W&B on Google Colab.
# If that doesn't work, uncomment the line below and set your W&B API key there.
# If you do, remove your key again before publishing to GitHub.

# %env WANDB_API_KEY=YOUR_WANDB_API_KEY
wandb.login()
# Start a run in the "imdb_sentiment" project. `run` is used again in the
# inference section to fetch the model checkpoint artifact.
run = wandb.init(project="imdb_sentiment")

## 2) Load the **dataset**

Load the train dataset from the tsv files stored in your Google Drive. Split it into train and validation datasets.

In [None]:
import pandas as pd

In [None]:
# Load the labelled reviews. The file is tab-separated with a header row;
# quoting=3 (csv.QUOTE_NONE) makes pandas treat quote characters as ordinary
# text rather than as field quoting.
df = pd.read_csv(
    "/content/drive/My Drive/ldsa-dl-course-data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Shuffle with a fixed seed so the split is reproducible. drop=True discards
# the pre-shuffle index; without it the old index is kept as an extra "index"
# column that would be carried into the datasets built from these frames.
df_shuffled = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Shuffled rows 0-19,999 are used for training, rows 20,000-24,999 for validation.
df_train = df_shuffled.iloc[:20000]
df_val = df_shuffled.iloc[20000:25000]

In [None]:
# Test-set reviews, parsed like the training file: tab-separated, header on the
# first row, and quoting=3 (csv.QUOTE_NONE) so quote characters stay literal text.
test_tsv_path = "/content/drive/My Drive/ldsa-dl-course-data/testData.tsv"
df_test = pd.read_csv(test_tsv_path, sep="\t", header=0, quoting=3)

In [None]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset and group them under
# the split names used throughout the rest of the notebook.
# NOTE: `Dataset` here is the class imported from `datasets`; the data-loader
# section later rebinds the name to torch.utils.data.Dataset, so this cell has
# to run before that import.
split_frames = {"train": df_train, "validation": df_val, "test": df_test}
full_dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in split_frames.items()
    }
)

In [None]:
# Display the DatasetDict to inspect the splits that were just created.
full_dataset

## 3) Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
# model_max_length=512 is the limit that truncation=True cuts reviews to in the
# tokenization step below.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased", model_max_length=512
)

# Sanity-check the tokenizer configuration.
print("Tokenizer input max. length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

In [None]:
def tokenize_reviews(batch):
    """Tokenize the "review" column of a batch, truncating and padding it."""
    return tokenizer(batch["review"], truncation=True, padding=True)


# batched=True with batch_size=None hands each split to the tokenizer as a
# single batch.
full_dataset_tokenized = full_dataset.map(
    tokenize_reviews, batched=True, batch_size=None
)

In [None]:
# Display the tokenized DatasetDict to inspect the result of the map call.
full_dataset_tokenized

In [None]:
# Print the token ids, attention mask and label of the first training example.
first_train_example = full_dataset_tokenized["train"][0]
for feature in ("input_ids", "attention_mask", "sentiment"):
    print(first_train_example[feature])

In [None]:
# Return examples as torch tensors restricted to the listed columns. The train
# and validation splits include the "sentiment" label column.
for split_name in ("train", "validation"):
    full_dataset_tokenized[split_name].set_format(
        "torch", columns=["input_ids", "attention_mask", "sentiment"]
    )

# The test split is formatted without the "sentiment" column.
full_dataset_tokenized["test"].set_format(
    "torch", columns=["input_ids", "attention_mask"]
)

In [None]:
# Print the same three features of the first training example again, now that
# the split has been switched to the "torch" format.
first_train_tensors = full_dataset_tokenized["train"][0]
for feature in ("input_ids", "attention_mask", "sentiment"):
    print(first_train_tensors[feature])

## 4) Data loader

Create a PyTorch `Dataset` and the corresponding `DataLoader` for the train and validation datasets.

In [None]:
import lightning as L
import torch

In [None]:
from torch.utils.data import Dataset, DataLoader


class TextDataset(Dataset):
    """Expose a tokenized split through the PyTorch ``Dataset`` interface.

    Args:
        dataset: Object that supports indexing with ``[]`` and reports its
            size through a ``num_rows`` attribute.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self) -> int:
        """Return the number of examples, taken from ``dataset.num_rows``."""
        row_count = self.dataset.num_rows
        return row_count

    def __getitem__(self, index):
        """Return the example stored at ``index`` in the wrapped dataset."""
        example = self.dataset[index]
        return example

In [None]:
# Wrap the tokenized training split and serve it in shuffled batches of 32.
train_ds = TextDataset(full_dataset_tokenized["train"])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [None]:
# Wrap the tokenized validation split and serve it in batches of 32.
val_ds = TextDataset(full_dataset_tokenized["validation"])

# Validation batches are served in a fixed order: shuffling brings no benefit
# when evaluating, and Lightning warns when a validation dataloader has
# shuffling enabled.
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Fetch only the first batch from the training loader so it can be inspected.
# batch_idx is still bound (to 0). Unlike a loop-and-break, next() raises
# StopIteration right here if the loader is empty, instead of leaving `batch`
# undefined for the following cell.
batch_idx, batch = next(enumerate(train_loader))

In [None]:
# Inspect the batch fetched above: the whole batch first, then the shape of its
# "input_ids" tensor (presumably (batch_size, sequence_length) - confirm from
# the printed output).
print(batch)
print(batch["input_ids"].shape)

## 5) Model definition

Define a PyTorch model and the corresponding PyTorch Lightning module.

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained "distilbert-base-uncased" checkpoint for sequence
# classification. num_labels=2 matches the num_classes=2 used by the accuracy
# metrics in the Lightning module.
pytorch_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

In [None]:
import torch.nn.functional as F
import torchmetrics

In [None]:
class LightningModel(L.LightningModule):
    """Lightning module that trains and evaluates a sequence classifier.

    Batches are indexed by name: "input_ids" and "attention_mask" feed the
    model, and "sentiment" holds the labels for the train/validation/test steps.

    Args:
        model: Network called as ``model(input_ids, attention_mask=...)`` whose
            output can be indexed with ``"logits"``.
        learning_rate: Learning rate passed to the AdamW optimizer.
    """

    def __init__(self, model, learning_rate):
        super().__init__()
        # Called before anything else is assigned so the constructor arguments
        # are recorded as hyperparameters.
        self.save_hyperparameters()

        self.model = model
        self.learning_rate = learning_rate

        # One accuracy tracker per stage, all configured identically.
        accuracy_config = {"task": "multiclass", "num_classes": 2}
        self.train_acc = torchmetrics.Accuracy(**accuracy_config)
        self.val_acc = torchmetrics.Accuracy(**accuracy_config)
        self.test_acc = torchmetrics.Accuracy(**accuracy_config)

    def forward(self, batch):
        """Run the wrapped model on one batch and return its raw output."""
        token_ids = batch["input_ids"]
        mask = batch["attention_mask"]
        return self.model(token_ids, attention_mask=mask)

    def _shared_step(self, batch):
        """Compute what the train, validation and test steps have in common.

        Returns:
            Tuple ``(loss, labels, predicted_labels)``: the cross-entropy loss
            of the logits against the "sentiment" labels, the labels
            themselves, and the argmax of the logits along dimension 1.
        """
        targets = batch["sentiment"]
        class_scores = self(batch)["logits"]

        step_loss = F.cross_entropy(class_scores, targets)
        predictions = class_scores.argmax(dim=1)
        return step_loss, targets, predictions

    def training_step(self, batch, batch_idx):
        """Log the training loss and epoch-level accuracy; return the loss."""
        loss, targets, predictions = self._shared_step(batch)

        self.log("train_loss", loss)
        self.train_acc(predictions, targets)
        self.log(
            "train_acc",
            self.train_acc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and accuracy, both shown in the progress bar."""
        loss, targets, predictions = self._shared_step(batch)

        self.log("val_loss", loss, prog_bar=True)
        self.val_acc(predictions, targets)
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the test accuracy; the loss from the shared step is not used."""
        _, targets, predictions = self._shared_step(batch)
        self.test_acc(predictions, targets)
        self.log("test_acc", self.test_acc)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of this module."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

## 6) Model training

Train your model using a Lightning trainer.

In [None]:
from lightning import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger

In [None]:
# Seed the random number generators before training. deterministic=True below
# only selects deterministic algorithms; without a fixed seed, random
# operations such as batch shuffling would still differ from run to run.
L.seed_everything(1, workers=True)

lightning_model = LightningModel(model=pytorch_model, learning_rate=5e-5)

# Keep the checkpoint with the highest validation accuracy, plus the last one.
callbacks = [
    ModelCheckpoint(save_top_k=1, mode="max", monitor="val_acc", save_last=True)
]

# log_model="all" tells the W&B logger to upload checkpoints as they are saved.
wandb_logger = WandbLogger(
    project="imdb_sentiment",
    log_model="all",
    group="unit9",
)

trainer = Trainer(
    callbacks=callbacks,
    max_epochs=3,
    accelerator="auto",
    logger=wandb_logger,
    deterministic=True,
)

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

## 7) Inference

Load the test dataset from the tsv file stored in your Google Drive and the model from the checkpoints you created on W&B. Finally, perform inference with the model on the test dataset.

In [None]:
# Wrap the tokenized test split. shuffle=False keeps the batches in dataset
# order, which the submission step relies on when it pairs predictions with
# the ids in df_test.
test_ds = TextDataset(full_dataset_tokenized["test"])
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

In [None]:
# Define checkpoint reference.
checkpoint_reference = "[USERNAME]/imdb_sentiment/model-[MODEL_ID]:best"

# Download checkpoint locally (if not already cached).
artifact = run.use_artifact(checkpoint_reference, type="model")
artifact_dir = artifact.download()

# Load checkpoint.
model = LightningModel.load_from_checkpoint(str(artifact_dir) + "/model.ckpt")

In [None]:
# Run the loaded model over the test loader; the result is iterated below as
# one output per batch.
batch_outputs = trainer.predict(model=model, dataloaders=test_loader)

# Concatenate the per-batch logits into one tensor, then take the index of the
# highest logit along dimension 1 as the predicted label.
batch_logits = [batch_output["logits"] for batch_output in batch_outputs]
logits = torch.cat(batch_logits)
predicted_labels = logits.argmax(dim=1)

In [None]:
# Close the W&B run that was started with wandb.init() in the setup section.
wandb.finish()

## 8) Post-process for Kaggle submission

Assuming the predicted class labels are stored in `predicted_labels` (as a Torch tensor), create a csv file ready for submission on Kaggle.

In [None]:
# Pair each test id with its predicted label. The tensor is moved to the CPU
# and converted to a NumPy array explicitly, so the column holds plain integers
# regardless of how the installed pandas version handles torch tensors.
output = pd.DataFrame(
    data={"id": df_test["id"], "sentiment": predicted_labels.cpu().numpy()}
)

In [None]:
# Write the submission file without the DataFrame index. quoting=3
# (csv.QUOTE_NONE) writes the fields without adding quote characters.
output.to_csv("output.csv", index=False, quoting=3)