# LAB 9: Sentiment analysis using Deep Learning

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

### Set-up

In [2]:
import time
from collections import Counter

import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import Vocab, build_vocab_from_iterator

Connect to the GPU (training RNNs without a GPU is veeery slow)

In [3]:
device = torch.device("cuda")
torch.cuda.get_device_name(0)

'NVIDIA Tesla T4'

Load data

In [4]:
df = pd.read_parquet("s3://ling583/sentiment.parquet", storage_options={"anon": True})

In [5]:
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

### Training loop

Because every problem and every model is a little bit different, pytorch (unlike scikit-learn) doesn't have built-in `fit` and `predict` methods. We need to define them ourselves here.

This function gathers up a batch of training examples, encodes them, and sends them to the GPU for processing.

In [6]:
# Takes a bunch of reviews, processes them in a way that torch can work with, then transfers
# to the GPU for high speed computation.

# Takes however many samples are to be in a mini-batch, packages them up as a tensor, 
# then sends them off to the GPU.
def collate_batch(batch):
    labels, texts = zip(*batch)
    texts = [
        [vocab[token] for token in ["<s>"] + tokenize(t) + ["</s>"]] for t in texts
    ]
    texts = [torch.tensor(t, dtype=torch.int64) for t in texts]
    texts = pad_sequence(texts, padding_value=vocab["<pad>"])
    labels = torch.tensor([label_vocab[l] for l in labels], dtype=torch.int64)
    return labels.to(device), texts.to(device)

This one applies the model to some test data, for evaluation. 

In [7]:
# copies of the decision and predict functions in scikit learn
def decision_function(dataloader):
    model.eval()
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=True):
            results = []
            for _, text in dataloader:
                results.extend(model(text))
    return results


def predict(dataloader):
    predicted = decision_function(dataloader)
    return [label_vocab.itos[p.argmax()] for p in predicted]

And this is the important part: the function that actually trains the model! 

In [8]:
# Takes the training data, splits it into training and validation data, 
# sets them up for torch. 1 epoch is 1 pass over the training data.
# During an epoch, apply the model to each batch, compute the error signal, and adjust the model to improve.
# Finally print out the results

def fit(epochs=5, batch_size=64, wd=None, clip=None):
    criterion = torch.nn.CrossEntropyLoss()
    if wd:
        optimizer = torch.optim.AdamW(model.parameters(), weight_decay=wd)
    else:
        optimizer = torch.optim.Adam(model.parameters())
    scaler = torch.cuda.amp.GradScaler(enabled=True)

    t, v = train_test_split(train, test_size=0.1, stratify=train["sentiment"])
    train_dataset = list(zip(t["sentiment"], t["text"]))
    valid_dataset = list(zip(v["sentiment"], v["text"]))

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_batch,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_batch,
    )

    for epoch in range(1, epochs + 1):

        start = time.time()

        model.train()
        correct = 0
        n = 0
        for label, text in tqdm(train_dataloader):
            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=True):
                predicted = model(text)
                loss = criterion(predicted, label)
                correct += (predicted.argmax(1) == label).sum().item()
                n += len(label)
            scaler.scale(loss).backward()
            if clip:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            scaler.step(optimizer)
            scaler.update()

        elapsed = time.time() - start
        train_acc = correct / n * 100.0
        valid_pred = predict(valid_dataloader)
        valid_acc = accuracy_score(v["sentiment"], valid_pred) * 100.0

        print(
            f"Epoch: {epoch:2d} Time: {elapsed:6.3f}s "
            f"Train acc: {train_acc:5.3f} Valid acc: {valid_acc:5.3f}"
        )

----

### Define the model class

Okay, most of what's above is mostly [boilerplate](https://en.wikipedia.org/wiki/Boilerplate_code). Now we'll define the specific model and hyperparameter settings that we're using for this task.

First, the model architecture. This is a basic [RNN](https://en.wikipedia.org/wiki/Recurrent_neural_network), using [GRU](https://en.wikipedia.org/wiki/Gated_recurrent_unit)s are the recurrent units.

The hyperparameters of interest are:

* `hidden_size` - Number of neurons in the hidden layer
* `embedding_size` - Number of dimensions of hte lexical representations
* `hidden_layers` - The number of layers in the model
* `bidirectional` - If True, each hidden layer is actually 2 layers, one left-to-right and one right-to-left
* `dropout` - On each update, disables some of the dimensions to make the model randomly break to force the training to work around it. Helps to prevent memorizations, but makes it hard to learn.

Setting these higher makes the model more powerful and easier to train, but if you set them too high, it will remember the training data and just recall what the correct classification was while not being able to classify new texts.

They control the ability of the model to learn details. Higher values for the first 3, plus setting `bidirectional` to `True`, increase the representational power of the model. That means it can learn more complex patterns and learn them more quickly. If these values are set too high, though, then the model can learn *too* well--it will simply memorize the training data and you'll get overfitting. The last value, dropout, helps control that. Higher values for `dropout` reduce the model's ability to learn and slow down training. The trick is finding a balance among all these settings that maximize learning while minimizing overfitting, which is unfortunately not easy to achieve.

In [9]:
# Sets up the model
class TextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab,
        num_class,
        hidden_size,
        embedding_dim=128,
        hidden_layers=1,
        dropout=0.0,
        bidirectional=True,
    ):
        super(TextClassificationModel, self).__init__()
        if not vocab.vectors is None:
            self.embedding = nn.Embedding.from_pretrained(
                vocab.vectors, freeze=True, padding_idx=vocab["<pad>"]
            )
        else:
            self.embedding = nn.Embedding(
                len(vocab), embedding_dim, padding_idx=vocab["<pad>"]
            )

        self.rnn = nn.GRU(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_size,
            num_layers=hidden_layers,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)
        out_size = hidden_size * hidden_layers
        if bidirectional:
            out_size = out_size * 2
        self.fc = nn.Linear(out_size, num_class)

    # applies the model
    def forward(self, text, lengths=None):
        embedded = self.embedding(text)
        _, hidden = self.rnn(embedded)
        out = torch.cat(torch.unbind(hidden), axis=1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

Next we set up the vocabulary (this is the step performed by `CountVectorizer` in scikit-learn) using a [basic tokenizer](https://pytorch.org/text/stable/data_utils.html#get-tokenizer) that comes with pytorch. 

The "specials" are vocabulary items that don't correspond to words but are used internally by the model:

* `<pad>` : For implementation reasons the documents in a batch all have to be the same length, so we add copies of the pseudo-word `<pad>` to the end of shorter reviews to make them as long as the longest one. 
* `<s>`, `</s>` : These mark the beginning and end of the reviews.
* `<unk>` : Unknown words (i.e., words which are used in the test data that didn't get seen in the training data) get replaced with `<unk>`

There's one adjustable parameter here: raising the value of `min_freq` removes low frequency lexical items (similar to `min_df` in scikit-learn). Increasing it usually doesn't improve the model.

In [10]:
tokenize = get_tokenizer("basic_english")
counter = Counter(concat(map(tokenize, tqdm(train["text"]))))
vocab = Vocab(
    counter,
    min_freq=1, # Change to limit the words brought into the vocab. Set as 1 brings in everything
    specials=("<pad>", "<s>", "</s>", "<unk>"),
)
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

  0%|          | 0/40000 [00:00<?, ?it/s]

Now we instantiate the model:

In [11]:
# instantiate the model and send it off to the GPU
# generally start with a model that overfits
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256, # this is a high value for this parameter
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.0,
    bidirectional=True,
).to(device) 

And finally, we train! There are three important settings here:

* `epochs` : This is the number of passes over the training data that we make when fitting the model. A crude way to avoid overfitting is to reduce this, which stops before training before the model has converged.
* `batch_size` : This is the number of reviews that get processed at once during training. In general, increasing `batch_size` makes the program run faster (since it lets us take better advantage of the GPU) but may require more epochs to converge. Setting `batch_size` too high can overload the GPUs memory and lead to a crash. The effects of changing `batch_size` on the final results are hard to predict, but it can make a big difference.
* `wd` : This is the "[weight decay](https://www.fast.ai/2018/07/02/adam-weight-decay/)" parameter. Setting this to a value other than `None` regularizes the model and can reduce overfitting (similar to setting `alpha` for `SGDClassifier`).

In [13]:
fit(epochs=5, batch_size=64, wd=None)
# We can see over time that this model overfits. This is represented by the fact that the training accuracy
# continues to go up while the validation accuracy eventually plateaus and then decreases. 
# This shows that the model is slowly memorizing the training data.

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 69.758s Train acc: 97.608 Valid acc: 97.700


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 68.997s Train acc: 99.097 Valid acc: 97.425


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 70.270s Train acc: 98.367 Valid acc: 96.475


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 69.193s Train acc: 99.608 Valid acc: 96.650


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  5 Time: 70.146s Train acc: 99.611 Valid acc: 96.525


In [14]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 88.670 F1 = 85.883


## Try changing some parameters 

In [15]:
# add a value for dropout
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.5, # after each pass, half of the model receives adjustments
    bidirectional=True,
).to(device) 

In [16]:
fit(epochs=5, batch_size=64, wd=None)
# There is still some overfitting, but its not as bad
# Note that the highest validation accuracy is at 4 epochs

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 85.304s Train acc: 78.564 Valid acc: 86.875


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 84.621s Train acc: 89.447 Valid acc: 88.950


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 86.635s Train acc: 92.192 Valid acc: 89.500


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 86.045s Train acc: 94.144 Valid acc: 90.250


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  5 Time: 85.360s Train acc: 96.058 Valid acc: 90.175


In [17]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 90.010 F1 = 87.360


In [18]:
# Reduce the epochs to 4
# Note the very high training accuracy. This is because we did not reset the model, so instead of running 
# just 4 epochs on a new model, we ran 4 additional epochs on top of the first % 5, overfitting even more.
fit(epochs=4, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 84.074s Train acc: 96.542 Valid acc: 95.825


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 86.187s Train acc: 97.514 Valid acc: 95.725


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 86.086s Train acc: 97.978 Valid acc: 95.875


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 86.479s Train acc: 98.569 Valid acc: 95.600


In [19]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

Accuracy = 89.950 F1 = 86.707


In [20]:
# Reset the model again
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128,
    hidden_layers=2,
    dropout=0.5, # after each pass, half of the model receives adjustments
    bidirectional=True,
).to(device) 

In [21]:
# Run for 4 epochs from scratch 
fit(epochs=4, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 84.085s Train acc: 78.200 Valid acc: 88.750


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 85.602s Train acc: 89.669 Valid acc: 90.750


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 85.711s Train acc: 92.153 Valid acc: 91.000


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 85.805s Train acc: 93.992 Valid acc: 90.600


In [22]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}") 

Accuracy = 90.490 F1 = 87.328


In [None]:
# reminder that this is our vocab, just copied from above
# Go ahead and reinitialize it
tokenize = get_tokenizer("basic_english")
counter = Counter(concat(map(tokenize, tqdm(train["text"]))))
vocab = Vocab(
    counter,
    min_freq=1, # Change to limit the words brought into the vocab. Set as 1 brings in everything
    specials=("<pad>", "<s>", "</s>", "<unk>"),
)
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

In [23]:
# Modify the vocab with preestablished word vectors
vocab.load_vectors("glove.6B.200d", unk_init = lambda t: torch.nn.init.uniform_(t, -1.0, 1.0))

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                               
100%|█████████▉| 399999/400000 [00:21<00:00, 18979.12it/s]


In [27]:
# Reset the model again
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128, # this will be ignored and will instead use dimensions from the modified vocab
    hidden_layers=2,
    dropout=0.5, # after each pass, half of the model receives adjustments
    bidirectional=True,
).to(device) 

In [28]:
# Run for 4 epochs from scratch 
fit(epochs=5, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  1 Time: 84.296s Train acc: 81.661 Valid acc: 88.425


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  2 Time: 84.562s Train acc: 88.453 Valid acc: 89.750


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  3 Time: 85.100s Train acc: 90.000 Valid acc: 90.150


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  4 Time: 84.386s Train acc: 91.289 Valid acc: 89.825


  0%|          | 0/563 [00:00<?, ?it/s]

Epoch:  5 Time: 86.261s Train acc: 92.267 Valid acc: 89.400


In [29]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}") 

Accuracy = 89.950 F1 = 87.278


## Try to get accuracy above 91

## Changelog (reordered from best to worst by accuracy):
Parameters listed in order of hidden_size, embedding_dim, hidden_layers, dropout, bidirectional, epochs

hidden_size=256, embedding_dim=128, hidden_layers=2, **dropout=0.25**, bidirectional=True, epochs=5  
Accuracy = 90.750 F1 = 87.642

hidden_size=256, embedding_dim=128, **hidden_layers=3**, dropout=0.25, bidirectional=True, epochs=5  
Accuracy = 90.470 F1 = 87.716

hidden_size=256, embedding_dim=128, hidden_layers=2, **dropout=0.3**, bidirectional=True, epochs=6  
Accuracy = 90.310 F1 = 87.626

hidden_size=256, embedding_dim=128, hidden_layers=3, **dropout=0.6**, bidirectional=True, epochs=5  
Accuracy = 90.290 F1 = 87.656

hidden_size=256, embedding_dim=128, **hidden_layers=2**, **dropout=0.6**, bidirectional=True, epochs=5  
Accuracy = 90.280 F1 = 87.671

hidden_size=256, embedding_dim=128, hidden_layers=2, **dropout=0.2**, bidirectional=True, epochs=5  
Accuracy = 90.200 F1 = 87.019

hidden_size=256, embedding_dim=128, hidden_layers=2, **dropout=0.75**, bidirectional=True, epochs=5  
Accuracy = 89.920 F1 = 85.974

hidden_size=256, embedding_dim=128, hidden_layers=2, dropout=0.25, bidirectional=True, **epochs=6**  
Accuracy = 89.500 F1 = 85.966

#### Reset vocab and introduce the new dimensions

hidden_size=256, embedding_dim=128, **hidden_layers=3**, **dropout=0.5**, bidirectional=True, epochs=5  
Accuracy = 90.190 F1 = 87.518

hidden_size=256, embedding_dim=128, hidden_layers=2, dropout=0.5, bidirectional=True, epochs=5  
Accuracy = 89.950 F1 = 87.278
    
hidden_size=256, embedding_dim=128, hidden_layers=2, **dropout=0.25**, bidirectional=True, epochs=5  
Accuracy = 89.560 F1 = 86.031

hidden_size=256, embedding_dim=128, hidden_layers=3, **dropout=0.6**, bidirectional=True, epochs=5  
Accuracy = 87.220 F1 = 84.941



In [80]:
# Reset the model again
model = TextClassificationModel(
    vocab,
    len(label_vocab),
    hidden_size=256,
    embedding_dim=128, # this will be ignored and will instead use dimensions from the modified vocab
    hidden_layers=2,
    dropout=0.5, # after each pass, half of the model receives adjustments
    bidirectional=True,
).to(device) 

RuntimeError: CUDA error: an illegal memory access was encountered

In [74]:
fit(epochs=5, batch_size=64, wd=None)

  0%|          | 0/563 [00:00<?, ?it/s]

RuntimeError: CUDA error: an illegal memory access was encountered

In [None]:
test_dataset = list(zip(test["sentiment"], test["text"]))
test_dataloader = DataLoader(
    test_dataset, batch_size=128, shuffle=False, collate_fn=collate_batch
)
test_predicted = predict(test_dataloader)
acc = 100 * accuracy_score(test["sentiment"], test_predicted)
f1 = 100 * f1_score(test["sentiment"], test_predicted, average="macro")
print(f"Accuracy = {acc:.3f} F1 = {f1:.3f}")

In [78]:
# reminder that this is our vocab, just copied from above
# Go ahead and reinitialize it
tokenize = get_tokenizer("basic_english")
counter = Counter(concat(map(tokenize, tqdm(train["text"]))))
vocab = Vocab(
    counter,
    min_freq=1, # Change to limit the words brought into the vocab. Set as 1 brings in everything
    specials=("<pad>", "<s>", "</s>", "<unk>"),
)
label_vocab = Vocab(Counter(train["sentiment"]), specials=[])

  0%|          | 0/40000 [00:00<?, ?it/s]

In [50]:
# Modify the vocab with preestablished word vectors
vocab.load_vectors("glove.6B.200d", unk_init = lambda t: torch.nn.init.uniform_(t, -1.0, 1.0))

# make sure to shut down this kernel before running distilbert, otherwise will run out of memory