<a href="https://colab.research.google.com/github/nosportugal/faast-data-science/blob/main/courses/deep_learning/unit4/solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unit 4: Optimizing Neural Nets

Your challenge in this unit is to improve on the solution from the previous unit using a similar model architecture (fully connected layers). You are encouraged to try a different number of features, a different optimizer, or different training hyperparameters.

By now, you should have the files `labeledTrainData.tsv` and `testData.tsv` in a folder named `ldsa-dl-course-data` in your Google Drive. If you don't, please check the README file of Unit 2 for instructions.


## 1) Setup

In [None]:
# Install Lightning (pinned to 2.0.1) and the W&B client; --quiet suppresses pip's output.
!pip install lightning==2.0.1 wandb --quiet

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset files are read from
# "/content/drive/My Drive/ldsa-dl-course-data" later in this notebook.
drive.mount("/content/drive")

In [None]:
import wandb

# This will open a window so you can login to W&B on Google Colab.
# If that doesn't work, set your W&B API key below
# If you do, remove your key before publishing to GitHub.

# %env WANDB_API_KEY=YOUR_WANDB_API_KEY
wandb.login()

# Start the run that the training section logs to. The group is set here
# because a WandbLogger created while a run is already active reuses that run,
# so run settings passed to the logger afterwards (such as its group) are not
# applied to it.
run = wandb.init(project="imdb_sentiment", group="unit4")

## 2) Load the train dataset

Load the train dataset from the tsv files stored in your Google Drive. Split it into train and validation datasets.

In [None]:
import pandas as pd

In [None]:
# Read the labeled reviews. quoting=3 is csv.QUOTE_NONE, so quote characters
# inside the review text are kept as ordinary characters.
df = pd.read_csv(
    "/content/drive/My Drive/ldsa-dl-course-data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Shuffle with a fixed seed so the split is reproducible. drop=True discards
# the pre-shuffle index instead of keeping it as an extra "index" column.
df_shuffled = df.sample(frac=1, random_state=1).reset_index(drop=True)

# First 20,000 shuffled rows for training, the next 5,000 for validation.
df_train = df_shuffled.iloc[:20000]
df_val = df_shuffled.iloc[20000:25000]

## 3) Vectorization

Use Bag-of-Words for vectorizing the dataset.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-Words vectorizer: lowercased text, English stop words removed and at
# most 2000 features. It is fitted on the training reviews only.
cv = CountVectorizer(stop_words="english", max_features=2000, lowercase=True).fit(
    df_train["review"]
)

# Encode both splits with the vocabulary learned from the training reviews.
X_train, X_val = (cv.transform(split["review"]) for split in (df_train, df_val))

## 4) Data loader

Create a PyTorch `Dataset` and the corresponding `DataLoader` for the train and validation datasets.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing vectorized reviews with their class labels.

    Args:
        X: Feature matrix with one row per sample. Anything ``torch.tensor``
            accepts works; objects exposing ``toarray()`` (such as SciPy
            sparse matrices) are converted to a dense array first.
        y: Class labels, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # Sparse matrices cannot be handed to torch.tensor directly.
        if hasattr(X, "toarray"):
            X = X.toarray()

        self.features = torch.tensor(X, dtype=torch.float32)
        self.labels = torch.tensor(y, dtype=torch.int64)

        # Fail at construction time instead of with an IndexError (or silently
        # ignored rows) in the middle of an epoch.
        num_rows = self.features.shape[0]
        num_labels = self.labels.shape[0]
        if num_rows != num_labels:
            raise ValueError(
                f"X has {num_rows} samples but y has {num_labels} labels"
            )

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of samples."""
        return self.labels.shape[0]

In [None]:
# Training split: densify the Bag-of-Words matrix and pair it with the labels.
train_ds = TextDataset(X=X_train.todense(), y=df_train["sentiment"].values)

# Batches of 32 samples, reshuffled on every pass over the data.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

In [None]:
# Validation split: densify the Bag-of-Words matrix and pair it with the labels.
val_ds = TextDataset(X_val.todense(), df_val["sentiment"].values)

# Validation batches are only evaluated, never trained on, so shuffling brings
# no benefit; a fixed order also avoids Lightning's warning about shuffled
# validation dataloaders.
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Pull a single batch from the loader to sanity-check the tensor shapes.
features, class_labels = next(iter(train_loader))

features.shape

## 5) Model definition

Define a PyTorch model and the corresponding PyTorch Lightning module.

In [None]:
class PyTorchMLP(torch.nn.Module):
    """Fully connected classifier with a single ReLU hidden layer.

    Args:
        num_features: Size of each flattened input sample.
        hidden_layer_size: Number of units in the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, num_features, hidden_layer_size=10, num_classes=2):
        super().__init__()

        # Hidden projection followed by the output projection; the ReLU sits
        # between them inside the Sequential container.
        hidden = torch.nn.Linear(num_features, hidden_layer_size)
        output = torch.nn.Linear(hidden_layer_size, num_classes)
        self.all_layers = torch.nn.Sequential(hidden, torch.nn.ReLU(), output)

    def forward(self, x):
        """Flatten every sample to one dimension and return its logits."""
        flat = x.flatten(start_dim=1)
        return self.all_layers(flat)


# Size the input layer from the vectorized training data so the model stays in
# sync with the vectorizer when the number of features is changed.
pytorch_model = PyTorchMLP(
    num_features=X_train.shape[1],
    hidden_layer_size=10,
    num_classes=2,
)

In [None]:
import torch.nn.functional as F
from lightning import LightningModule
import torchmetrics

In [None]:
class LightningModel(LightningModule):
    """Lightning module that trains a classifier with cross-entropy and SGD.

    Args:
        model: Module mapping a batch of features to per-class logits.
        learning_rate: Learning rate handed to the SGD optimizer.
    """

    def __init__(self, model, learning_rate):
        super().__init__()
        # Record the constructor arguments as hyperparameters.
        self.save_hyperparameters()

        self.model = model
        self.learning_rate = learning_rate

        # A separate accuracy metric per stage, so each keeps its own state.
        accuracy_kwargs = {"task": "multiclass", "num_classes": 2}
        self.train_acc = torchmetrics.Accuracy(**accuracy_kwargs)
        self.val_acc = torchmetrics.Accuracy(**accuracy_kwargs)
        self.test_acc = torchmetrics.Accuracy(**accuracy_kwargs)

    def forward(self, x):
        """Delegate to the wrapped model and return its output."""
        return self.model(x)

    def _shared_step(self, batch):
        """Compute loss and predictions for a ``(features, labels)`` batch.

        Returns:
            A ``(loss, true_labels, predicted_labels)`` tuple, where the
            predictions are the argmax of the logits along dimension 1.
        """
        inputs, targets = batch
        scores = self(inputs)
        return F.cross_entropy(scores, targets), targets, scores.argmax(dim=1)

    def training_step(self, batch, batch_idx):
        """Log the training loss and epoch-level accuracy; return the loss."""
        loss, targets, preds = self._shared_step(batch)

        self.log("train_loss", loss)
        self.train_acc(preds, targets)
        self.log(
            "train_acc", self.train_acc, on_step=False, on_epoch=True, prog_bar=True
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and accuracy for one batch."""
        loss, targets, preds = self._shared_step(batch)

        self.log("val_loss", loss, prog_bar=True)
        self.val_acc(preds, targets)
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update and log the test accuracy for one batch."""
        _, targets, preds = self._shared_step(batch)
        self.test_acc(preds, targets)
        self.log("test_acc", self.test_acc)

    def configure_optimizers(self):
        """Return a plain SGD optimizer over all parameters."""
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate)

## 6) Model training

Train your model using a Lightning trainer.

In [None]:
from lightning import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import WandbLogger

In [None]:
# Wrap the MLP in the Lightning module; 0.001 is the learning rate used by its
# SGD optimizer.
lightning_model = LightningModel(model=pytorch_model, learning_rate=0.001)

# Keep the checkpoint with the highest validation accuracy, plus the latest one.
callbacks = [
    ModelCheckpoint(save_top_k=1, mode="max", monitor="val_acc", save_last=True)
]

# Log to W&B; log_model="all" asks the logger to upload checkpoints as artifacts.
# NOTE(review): a W&B run is already active from the setup section — confirm
# whether `project`/`group` given here take effect or the existing run is reused.
wandb_logger = WandbLogger(
    project="imdb_sentiment",
    log_model="all",
    group="unit4",
)

# Record the optimizer class name and the layer stack with the run.
# configure_optimizers() is called here only to read the class name.
wandb_logger.log_hyperparams(
    {
        "optimizer": lightning_model.configure_optimizers().__class__.__name__,
        "architecture": str(lightning_model.model.all_layers),
    }
)

# Train for at most 30 epochs on whichever accelerator Lightning selects.
trainer = Trainer(
    callbacks=callbacks,
    max_epochs=30,
    accelerator="auto",
    logger=wandb_logger,
    deterministic=True,
)

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

# Close the active W&B run now that training is done; a run must be active
# again before any further run-level W&B calls.
wandb.finish()

## 7) Inference

Load the test dataset from the tsv file stored in your Google Drive and the model from the checkpoints you created on W&B. Finally, perform inference with the model on the test dataset.

In [None]:
# Read the unlabeled test reviews with the same parsing options as the training
# file (quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text).
df_test = pd.read_csv(
    "/content/drive/My Drive/ldsa-dl-course-data/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Reuse the vectorizer fitted on the training reviews; it is not refitted here.
X_test = cv.transform(df_test["review"])

In [None]:
class InferenceTextDataset(Dataset):
    """Label-free dataset that serves one feature vector per sample.

    Args:
        X: Feature matrix with one row per sample, in any form accepted by
            ``torch.tensor``; it is stored as a float32 tensor.
    """

    def __init__(self, X):
        self.features = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return self.features.size(0)

    def __getitem__(self, index):
        """Return the feature vector stored at ``index``."""
        sample = self.features[index]
        return sample

In [None]:
# Densify the test matrix and wrap it for batched, label-free inference.
test_ds = InferenceTextDataset(X=X_test.todense())

# No shuffling: predictions must stay in the same row order as df_test.
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

In [None]:
# Define checkpoint reference.
checkpoint_reference = "[USERNAME]/imdb_sentiment/model-[MODEL_ID]:best"

# Download checkpoint locally (if not already cached).
artifact = run.use_artifact(checkpoint_reference, type="model")
artifact_dir = artifact.download()

# Load checkpoint.
model = LightningModel.load_from_checkpoint(str(artifact_dir) + "/model.ckpt")

In [None]:
# Run the loaded model over every test batch; the per-batch outputs are
# treated as logits below.
batch_outputs = trainer.predict(model, dataloaders=test_loader)

# Join the per-batch outputs along the sample dimension, then pick the
# highest-scoring class for each review.
logits = torch.cat(batch_outputs, dim=0)
predicted_labels = logits.argmax(dim=1)

In [None]:
# Close the active W&B run now that the checkpoint has been fetched and
# predictions are computed.
wandb.finish()

## 8) Post-process for Kaggle submission

Assuming the predicted class labels are stored in `predicted_labels` (as a Torch tensor), create a csv file ready for submission on Kaggle.

In [None]:
# Convert the tensor to a NumPy array first, so the "sentiment" column holds
# plain integers regardless of how pandas treats tensor objects.
output = pd.DataFrame(
    data={"id": df_test["id"], "sentiment": predicted_labels.cpu().numpy()}
)

In [None]:
# quoting=3 (csv.QUOTE_NONE) writes values without adding quote characters;
# index=False leaves the DataFrame index out of the file.
output.to_csv("output.csv", index=False, quoting=3)