# LAB 9: Sentiment analysis using Deep Learning

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Turn on tqdm's pandas integration for progress reporting on pandas operations.
tqdm.pandas()

### Set-up

In [None]:
import time
from collections import Counter

import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import Vocab

Connect to the GPU (fine-tuning a transformer model without a GPU is very slow).

In [None]:
# All tensors and the model are moved to this CUDA device later in the notebook.
device = torch.device("cuda")
# Final expression of the cell: shows the name of CUDA device 0 as a sanity check.
torch.cuda.get_device_name(0)

Load data

In [None]:
# Read the sentiment dataset from S3; the "anon" storage option is presumably
# what allows access without AWS credentials (verify against the bucket policy).
df = pd.read_parquet("s3://ling583/sentiment.parquet", storage_options={"anon": True})

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the "sentiment" column
# keeps the label proportions the same in both splits, and the fixed
# random_state makes the split reproducible.
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

### Training loop

This training loop is very similar to the one we used in the previous notebook, but with small changes to work with Hugging Face models.

In [None]:
def collate_batch(batch):
    """Convert a batch of (label, text) pairs into tensors for the model.

    The texts are tokenized together with the global ``tokenizer`` (truncated
    and padded, returned as PyTorch tensors); the labels are mapped to integer
    ids through the global ``label_vocab``.

    Returns:
        A tuple ``(label_ids, input_ids, attention_mask)``.
    """
    sentiments, documents = zip(*batch)
    encoded = tokenizer(
        list(documents), truncation=True, padding=True, return_tensors="pt"
    )
    label_ids = [label_vocab[name] for name in sentiments]
    targets = torch.tensor(label_ids, dtype=torch.int64)
    return targets, encoded["input_ids"], encoded["attention_mask"]

In [None]:
def decision_function(dataloader):
    """Run the global ``model`` over ``dataloader`` and collect its logits.

    The model is switched to eval mode and evaluated without gradient
    tracking, under CUDA autocast. Labels yielded by the dataloader are
    ignored.

    Returns:
        A list with one logits tensor per example, in dataloader order.
    """
    model.eval()
    collected = []
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
        for _, token_ids, mask in dataloader:
            batch_logits = model(token_ids.to(device), mask.to(device)).logits
            # Extending with a 2-D tensor appends one row per example.
            collected.extend(batch_logits)
    return collected


def predict(dataloader):
    """Return the predicted label for every example in ``dataloader``.

    Each example's label is the ``label_vocab`` entry at the index of its
    highest-scoring logit.
    """
    scores = decision_function(dataloader)
    index_to_label = label_vocab.itos
    labels = []
    for row in scores:
        labels.append(index_to_label[row.argmax()])
    return labels

In [None]:
def fit(
    epochs=5,
    batch_size=64,
    wd=None,
):
    """Fine-tune the global ``model`` on the ``train`` split with mixed precision.

    Relies on the module-level ``model``, ``train``, ``device`` and
    ``collate_batch``. A new AdamW optimizer and gradient scaler are created
    on every call, so optimizer state does not carry over between calls.

    Args:
        epochs: Number of full passes over the training data. With 0 the
            function sets up the optimizer and data loader and returns
            without training.
        batch_size: Number of examples per optimizer step.
        wd: Weight decay passed to AdamW. ``None`` keeps the optimizer's own
            default.

    Prints the epoch number and its wall-clock duration after every epoch.
    """
    # Only override AdamW's weight decay when the caller explicitly asks for it.
    optimizer_kwargs = {} if wd is None else {"weight_decay": wd}
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, **optimizer_kwargs)
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    train_dataset = list(zip(train["sentiment"], train["text"]))
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_batch,
    )

    for epoch in range(1, epochs + 1):
        start = time.time()

        model.train()
        for labels, input_ids, attention_mask in tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            with torch.cuda.amp.autocast(enabled=True):
                outputs = model(
                    input_ids=input_ids, attention_mask=attention_mask, labels=labels
                )
                # Labels were supplied, so the first output element is the loss.
                loss = outputs[0]
            # Backpropagate the scaled loss, then let the scaler drive the
            # optimizer step and refresh its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        # Report inside the loop so every epoch gets its own timing line.
        elapsed = time.time() - start
        print(f"Epoch: {epoch:2d} Time: {elapsed:6.2f}s")

### Instantiate model

When using a pre-trained model, most of the training choices are already made for us, which makes things a lot easier! We'll use [this model](https://huggingface.co/distilbert-base-uncased).

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [None]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint the model is loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# Label vocabulary built from the training-set sentiment counts. specials=[]
# passes no special tokens, so presumably every index maps to a real label.
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

In [None]:
# Load pre-trained DistilBERT for sequence classification with one output per
# sentiment label, and move it to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_vocab)
).to(device)

# First fine-tuning epoch.
fit(epochs=1, batch_size=16)

In [None]:
# Evaluate on the held-out test split after the first fit() call.
test_dataset = list(zip(test["sentiment"], test["text"]))
# shuffle=False keeps predictions in the same order as test["sentiment"],
# which the metric calls below rely on.
test_dataloader = DataLoader(
    test_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(tqdm(test_dataloader))
# Both metrics are multiplied by 100 to report percentages; F1 is macro-averaged.
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

In [None]:
# Save the model and tokenizer as they stand after the first fit() call.
model.save_pretrained("distilbert.1")
tokenizer.save_pretrained("distilbert.1")

In [None]:
# Second fine-tuning epoch on the same model. fit() builds a new optimizer on
# each call, so optimizer state from the previous call is not reused.
fit(epochs=1, batch_size=16)

In [None]:
# Re-evaluate on the test split after the second fit() call, reusing the
# existing test_dataloader. Metrics are reported as percentages.
test_predicted = predict(tqdm(test_dataloader))
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

In [None]:
# Save the model and tokenizer as they stand after the second fit() call.
model.save_pretrained("distilbert.2")
tokenizer.save_pretrained("distilbert.2")

In [None]:
# Third fine-tuning epoch on the same model. fit() builds a new optimizer on
# each call, so optimizer state from the previous call is not reused.
fit(epochs=1, batch_size=16)

In [None]:
# Re-evaluate on the test split after the third fit() call, reusing the
# existing test_dataloader. Metrics are reported as percentages.
test_predicted = predict(tqdm(test_dataloader))
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")