# Building a sentiment classifier

Using the sentence polarity dataset: http://www.cs.cornell.edu/people/pabo/movie-review-data/

TODO: explain in more detail

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
print(torch.__version__)

1.10.0


In [2]:
#!pip install torchtext==0.11.0

In [3]:
### load our data
file_location_neg = "./rt-polaritydata/rt-polaritydata/rt-polarity.neg"
file_location_pos = "./rt-polaritydata/rt-polaritydata/rt-polarity.pos"
with open(file_location_neg, 'rb') as f:
    lines_neg = f.readlines()
with open(file_location_pos, 'rb') as f:
    lines_pos = f.readlines()
    
lines_neg = [x.decode("utf-8", "ignore") for x in lines_neg]
lines_pos = [x.decode("utf-8", "ignore") for x in lines_pos]

labels_neg = [0] * len(lines_neg)
labels_pos = [1] * len(lines_pos)

lines = lines_neg + lines_pos
labels = labels_neg + labels_pos
print(len(lines))
print(len(labels))

10662
10662


In [4]:
## create train-test split with shuffled data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(lines, labels, test_size=0.2, random_state=42, shuffle=True)
print(len(X_train))
print(len(y_test))

8529
2133


In [5]:
### take pipeline from https://torchtutorialstaging.z5.web.core.windows.net/beginner/text_sentiment_ngrams_tutorial.html
from torchtext.data.utils import get_tokenizer
from collections import Counter
#from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
len_counter = 0
for line in lines:
    toks = tokenizer(line)
    if len(toks) > len_counter:
        len_counter = len(toks)
    counter.update(toks)
# folllowing line from https://stackoverflow.com/a/15862037/10526100
# removes all words with frequency of one from counter
counter_new = Counter({k: c for k, c in counter.items() if c > 1})
counter_new.update(["[PAD]", "[UNK]"])
ids = list(range(len(counter_new)))
vocab_dict = dict(zip(counter_new.keys(), ids))
#vocab = Vocab(counter)
#print(vocab.vocab)
print([vocab_dict[token] for token in ['here', 'is', 'an', 'example']])
print(vocab_dict["[PAD]"])
print(len_counter)

[188, 55, 81, 4949]
10160
62


In [6]:
# copied from tutorial, added padding
def text_pipeline(x, max_len):
    vocab_list = [vocab_dict[token] if token in vocab_dict else vocab_dict["[UNK]"] for token in tokenizer(x) ]
    missing_len = max_len - len(vocab_list)
    missing_list = missing_len * [vocab_dict["[PAD]"]]
    return(vocab_list + missing_list)
    
label_pipeline = lambda x: int(x)

print(text_pipeline(lines_neg[0], len_counter))

[0, 1, 2, 3, 4, 5, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160]


In [7]:
# copied from tutorial, removed offsets
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def collate_batch(batch):
    label_list, text_list, = [], []
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text, len_counter), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list).view(len(label_list), -1)
    return text_list.to(device), label_list.to(device)

train_iter = list(zip(X_train, y_train))
BATCH_SIZE=128
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

cpu


In [8]:
### build classifier transformer
import torch.nn.functional as F

class MyClassificationTransformer(nn.Module):
    
    def __init__(self, embedding_dim, heads, seq_length, vocab_size, depth=5, num_classes=2):
        super().__init__()

        self.vocab_size = vocab_size
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(seq_length, embedding_dim)
        self.num_heads = heads

        # sequence of transformers
        tblocks = []
        for i in range(depth):
            tblocks.append(nn.TransformerEncoderLayer(d_model=embedding_dim,
                                                            nhead=self.num_heads, 
                                                            batch_first=True, dropout=0.1))
        self.tblocks = nn.Sequential(*tblocks)
        
        # final linear layer
        self.last_linear = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """
        :param x: A (b, t) tensor of integer values representing
                  words (in some predetermined vocabulary).
        :return: A (b, c) tensor of log-probabilities over the
                 classes (where c is the nr. of classes).
        """
        # generate token embeddings
        tokens = self.token_emb(x)
        batch_size, token_size, embed_size = tokens.size()

        # generate position embeddings
        positions = torch.arange(token_size)
#        positions = self.pos_emb(positions)[None, :, :].expand(b, t, k)
        positions = self.pos_emb(positions).expand(batch_size, token_size, embed_size)

        x = tokens + positions
        x = self.tblocks(x)

        # Average-pool over the t dimension and project to class
        # probabilities
        x = self.last_linear(x.mean(dim=1))
        return F.log_softmax(x, dim=1)

In [38]:
## init network and optimizer
# embedding_dim=50, heads=10, depth=2, 20 epochs -> 71.17 accuracy
# embedding_dim=50, heads=10, depth=2, 15 epochs -> 67.70 accuracy
# embedding_dim=50, heads=5, depth=2, 50 epochs -> 69.34
# embedding_dim=30, heads=3, depth=2, 50 epochs -> 70.18
# embedding_dim=30, heads=5, depth=1, 80 epochs -> 69.90

my_classification_transformer = MyClassificationTransformer(embedding_dim=30, heads=5, 
                                            seq_length=len_counter, vocab_size=len(counter_new),
                                            depth=1, num_classes=2)

optimizer = torch.optim.Adam(my_classification_transformer.parameters(), lr=0.001)
criterion = nn.NLLLoss()

In [43]:
# training
from tqdm.notebook import tqdm

num_epochs = 50

for epoch in range(num_epochs):
    print("epoch: ", epoch)
    training_loss = 0
    for x,y in tqdm(dataloader, total=len(X_train)//BATCH_SIZE):

        optimizer.zero_grad()
        out = my_classification_transformer(x)
        loss = criterion(out, y)
        training_loss += loss
        loss.backward()
        optimizer.step()
    
    print("training_loss: ", training_loss)

epoch:  0


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(1.1976, grad_fn=<AddBackward0>)
epoch:  1


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.9332, grad_fn=<AddBackward0>)
epoch:  2


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.7085, grad_fn=<AddBackward0>)
epoch:  3


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.6462, grad_fn=<AddBackward0>)
epoch:  4


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.4160, grad_fn=<AddBackward0>)
epoch:  5


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.3943, grad_fn=<AddBackward0>)
epoch:  6


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.2264, grad_fn=<AddBackward0>)
epoch:  7


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.2416, grad_fn=<AddBackward0>)
epoch:  8


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.2794, grad_fn=<AddBackward0>)
epoch:  9


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.1235, grad_fn=<AddBackward0>)
epoch:  10


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.1111, grad_fn=<AddBackward0>)
epoch:  11


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0757, grad_fn=<AddBackward0>)
epoch:  12


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0711, grad_fn=<AddBackward0>)
epoch:  13


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0926, grad_fn=<AddBackward0>)
epoch:  14


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0598, grad_fn=<AddBackward0>)
epoch:  15


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0362, grad_fn=<AddBackward0>)
epoch:  16


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0312, grad_fn=<AddBackward0>)
epoch:  17


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0280, grad_fn=<AddBackward0>)
epoch:  18


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0243, grad_fn=<AddBackward0>)
epoch:  19


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0187, grad_fn=<AddBackward0>)
epoch:  20


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0203, grad_fn=<AddBackward0>)
epoch:  21


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0202, grad_fn=<AddBackward0>)
epoch:  22


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0211, grad_fn=<AddBackward0>)
epoch:  23


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0163, grad_fn=<AddBackward0>)
epoch:  24


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0130, grad_fn=<AddBackward0>)
epoch:  25


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0111, grad_fn=<AddBackward0>)
epoch:  26


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0084, grad_fn=<AddBackward0>)
epoch:  27


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0091, grad_fn=<AddBackward0>)
epoch:  28


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0105, grad_fn=<AddBackward0>)
epoch:  29


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0080, grad_fn=<AddBackward0>)
epoch:  30


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0096, grad_fn=<AddBackward0>)
epoch:  31


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0074, grad_fn=<AddBackward0>)
epoch:  32


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0096, grad_fn=<AddBackward0>)
epoch:  33


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0076, grad_fn=<AddBackward0>)
epoch:  34


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0073, grad_fn=<AddBackward0>)
epoch:  35


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0053, grad_fn=<AddBackward0>)
epoch:  36


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0052, grad_fn=<AddBackward0>)
epoch:  37


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0049, grad_fn=<AddBackward0>)
epoch:  38


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0055, grad_fn=<AddBackward0>)
epoch:  39


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0045, grad_fn=<AddBackward0>)
epoch:  40


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0052, grad_fn=<AddBackward0>)
epoch:  41


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0052, grad_fn=<AddBackward0>)
epoch:  42


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0041, grad_fn=<AddBackward0>)
epoch:  43


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0041, grad_fn=<AddBackward0>)
epoch:  44


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0048, grad_fn=<AddBackward0>)
epoch:  45


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0036, grad_fn=<AddBackward0>)
epoch:  46


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0031, grad_fn=<AddBackward0>)
epoch:  47


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0032, grad_fn=<AddBackward0>)
epoch:  48


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0058, grad_fn=<AddBackward0>)
epoch:  49


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))


training_loss:  tensor(0.0053, grad_fn=<AddBackward0>)


In [44]:
# test on test data
test_iter = list(zip(X_test, y_test))
test_loader = DataLoader(test_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

correct = []
for x,y in tqdm(test_loader, total=len(X_test)//BATCH_SIZE):
    
    out = my_classification_transformer(x)
    # transform with exp due to logsoftmax
    out_exp = torch.exp(out)
    guessed_class = out_exp.argmax(dim=1)
    correct.append(guessed_class == y)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [45]:
c = torch.cat(correct)
print("test accuracy: ", torch.sum(c) / len(c))

test accuracy:  tensor(0.6990)


In [42]:
### base rate is 50:50, so just guessing

np.sum(y_test)/len(y_test)

0.5021097046413502