# LAB 9: Sentiment analysis using Deep Learning

In [None]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

# Enable tqdm's pandas integration (progress-bar variants such as `progress_apply`).
tqdm.pandas()

### Set-up

In [None]:
import time
from collections import Counter

import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import Vocab

Connect to the GPU (training RNNs without a GPU is very slow)

In [None]:
# The model and every input batch below are moved to this device.
# NOTE: there is no CPU fallback; the next line requires a CUDA GPU.
device = torch.device("cuda")
# Last expression of the cell: displays the name of CUDA device 0.
torch.cuda.get_device_name(0)

Load data

In [None]:
# Load the sentiment dataset from S3; {"anon": True} requests anonymous
# (credential-free) access. The "sentiment" and "text" columns are used below.
df = pd.read_parquet("s3://ling583/sentiment.parquet", storage_options={"anon": True})

In [None]:
# Hold out 20% of the rows as the test set, stratified on the "sentiment"
# column; the fixed random_state keeps the split reproducible across runs.
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

### Training loop

This training loop is very similar to the one we used in the previous notebook, but with small changes to work with Hugging Face models.

In [None]:
def collate_batch(batch):
    """Convert a list of ``(label, text)`` pairs into model-ready tensors.

    Relies on the notebook-level ``tokenizer`` and ``label_vocab``.

    Args:
        batch: Sequence of ``(label, text)`` pairs, as yielded by the dataset
            handed to the ``DataLoader``.

    Returns:
        A ``(labels, input_ids, attention_mask)`` tuple. ``labels`` is an
        int64 tensor of label ids looked up in ``label_vocab``;
        ``input_ids`` and ``attention_mask`` are taken from the tokenizer
        output, which is requested as PyTorch tensors with ``padding=True``
        and ``truncation=True``.
    """
    labels, texts = zip(*batch)
    # The tokenizer wants a list of strings, not the tuple produced by zip.
    encoded = tokenizer(
        list(texts), truncation=True, padding=True, return_tensors="pt"
    )
    label_ids = torch.tensor(
        [label_vocab[label] for label in labels], dtype=torch.int64
    )
    return label_ids, encoded["input_ids"], encoded["attention_mask"]

In [None]:
def decision_function(dataloader):
    """Run the model over ``dataloader`` and collect one logit row per example.

    Uses the notebook-level ``model`` and ``device``. The model is switched
    to eval mode, and inference runs without gradient tracking and under
    CUDA autocast.

    Args:
        dataloader: Iterable of ``(labels, input_ids, attention_mask)``
            batches; the labels are ignored.

    Returns:
        A flat list with one logits tensor per input example, in dataloader
        order. The tensors are left on whatever device the model produced
        them on.
    """
    model.eval()
    collected = []
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
        for _, ids, mask in dataloader:
            logits = model(ids.to(device), mask.to(device)).logits
            # Iterating a 2-D tensor yields its rows, so this appends one
            # entry per example rather than one per batch.
            collected.extend(logits)
    return collected

def predict(dataloader):
    """Return the predicted label string for every example in ``dataloader``.

    Each example's label is the ``label_vocab.itos`` entry at the position of
    its highest logit, as computed by ``decision_function``.

    Args:
        dataloader: Iterable of batches accepted by ``decision_function``.

    Returns:
        A list of labels, one per example, in dataloader order.
    """
    index_to_label = label_vocab.itos
    return [index_to_label[row.argmax()] for row in decision_function(dataloader)]

### Instantiate model

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [None]:
# Load the tokenizer from the "distilbert.2" checkpoint
# (presumably a local model directory -- TODO confirm).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert.2')
# Label <-> integer id mapping built from the training labels, with no special
# tokens, so every id corresponds to a real sentiment label.
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

In [None]:
# Load the sequence classifier from the same "distilbert.2" checkpoint, sized
# to one output per label in label_vocab, and move it to `device`.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert.2", num_labels=len(label_vocab)
).to(device)

In [None]:
# Evaluate the model on the held-out test split.
# Each dataset item is a (label, text) pair, the layout collate_batch expects.
test_dataset = list(zip(test["sentiment"], test["text"]))
# shuffle=False keeps predictions aligned with the row order of `test`.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_batch
)
# tqdm wraps the dataloader to show a progress bar during inference.
test_predicted = predict(tqdm(test_dataloader))
# Both metrics are reported as percentages; F1 is macro-averaged over labels.
acc = 100 * accuracy_score(test['sentiment'], test_predicted)
f1 = 100 * f1_score(test['sentiment'], test_predicted, average='macro')
print(f'Accuracy = {acc:.3f} F1 = {f1:.3f}')

----

In [None]:
from lime.lime_text import LimeTextExplainer

In [None]:
# Pass the label strings as class names so the class indices in the
# explanation are shown with their sentiment labels.
explainer = LimeTextExplainer(class_names=label_vocab.itos)

In [None]:
def decision(texts):
    """Classifier function for LIME: map raw texts to class probabilities.

    Args:
        texts: Iterable of strings to score (LIME passes perturbed variants
            of the text being explained).

    Returns:
        A NumPy array with one row per text and one column per label in
        ``label_vocab``; each row is a softmax distribution over the labels.
    """
    # collate_batch needs a label for every row, but decision_function
    # discards them. Use a label guaranteed to be in the vocabulary instead
    # of a hard-coded string that may not be a valid label.
    placeholder = label_vocab.itos[0]
    dataset = [(placeholder, t) for t in texts]
    dataloader = DataLoader(
        dataset, batch_size=4, shuffle=False, collate_fn=collate_batch
    )
    logits = torch.vstack(decision_function(dataloader))
    # LIME expects prediction probabilities, not raw logits. Cast to float32
    # first because the logits may be half precision under autocast.
    return logits.float().softmax(dim=-1).to("cpu").numpy()

def explain(text):
    """Render a LIME explanation for ``text`` inline in the notebook.

    Uses the notebook-level ``explainer`` with ``decision`` as the classifier
    function, explaining labels 0 and 1 with up to 10 features.

    Args:
        text: The raw string whose prediction should be explained.
    """
    explanation = explainer.explain_instance(
        text,
        decision,
        labels=[0, 1],
        num_features=10,
    )
    explanation.show_in_notebook()

In [None]:
# Explain the model's prediction for the first example in the test split.
explain(test["text"].iloc[0])

In [None]:
# Preview the first few test rows whose predicted label matches the gold
# label, i.e. correctly classified examples.
test[test_predicted==test["sentiment"]].head()