# Building a sentiment classifier

Using the sentence polarity dataset: http://www.cs.cornell.edu/people/pabo/movie-review-data/

We train a small transformer to do sentiment classification of sentences from the polarity dataset. We achieve a 70% accuracy in most settings. The guessing rate would be 50%, so our transformer learned something. We expect that one could improve this accuracy by using bigger models or applying some DL magic but the point of this endeavor was to get the pipeline running in the first place.

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
print(torch.__version__)

1.10.0+cu102


In [2]:
print(torch.cuda.is_available())

True


In [3]:
#!pip install torchtext==0.11.0

In [4]:
### load our data
file_location_neg = "./data/rt-polaritydata/rt-polaritydata/rt-polarity.neg"
file_location_pos = "./data/rt-polaritydata/rt-polaritydata/rt-polarity.pos"
with open(file_location_neg, 'rb') as f:
    lines_neg = f.readlines()
with open(file_location_pos, 'rb') as f:
    lines_pos = f.readlines()
    
lines_neg = [x.decode("utf-8", "ignore") for x in lines_neg]
lines_pos = [x.decode("utf-8", "ignore") for x in lines_pos]

labels_neg = [0] * len(lines_neg)
labels_pos = [1] * len(lines_pos)

lines = lines_neg + lines_pos
labels = labels_neg + labels_pos
print(len(lines))
print(len(labels))

10662
10662


In [5]:
## create train-test split with shuffled data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(lines, labels, test_size=0.2, random_state=42, shuffle=True)
print(len(X_train))
print(len(y_test))

8529
2133


In [6]:
### take pipeline from https://torchtutorialstaging.z5.web.core.windows.net/beginner/text_sentiment_ngrams_tutorial.html
from torchtext.data.utils import get_tokenizer
from collections import Counter
#from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
len_counter = 0
for line in lines:
    toks = tokenizer(line)
    if len(toks) > len_counter:
        len_counter = len(toks)
    counter.update(toks)
# folllowing line from https://stackoverflow.com/a/15862037/10526100
# removes all words with frequency of one from counter
counter_new = Counter({k: c for k, c in counter.items() if c > 1})
counter_new.update(["[PAD]", "[UNK]"])
ids = list(range(len(counter_new)))
vocab_dict = dict(zip(counter_new.keys(), ids))
print([vocab_dict[token] for token in ['here', 'is', 'an', 'example']])
print(vocab_dict["[PAD]"])
print(len_counter)

[188, 55, 81, 4949]
10160
62


In [7]:
# copied from tutorial, added padding
def text_pipeline(x, max_len):
    vocab_list = [vocab_dict[token] if token in vocab_dict else vocab_dict["[UNK]"] for token in tokenizer(x) ]
    missing_len = max_len - len(vocab_list)
    missing_list = missing_len * [vocab_dict["[PAD]"]]
    return(vocab_list + missing_list)
    
label_pipeline = lambda x: int(x)

print(text_pipeline(lines_neg[0], len_counter))

[0, 1, 2, 3, 4, 5, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160, 10160]


In [8]:
# copied from tutorial, removed offsets
from torch.utils.data import DataLoader
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda"
print(device)

def collate_batch(batch):
    label_list, text_list, = [], []
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text, len_counter), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list).view(len(label_list), -1)
    return text_list.to(device), label_list.to(device)

train_iter = list(zip(X_train, y_train))
BATCH_SIZE=128
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

cuda


In [17]:
### build classifier transformer
import torch.nn.functional as F

class MyClassificationTransformer(nn.Module):
    
    def __init__(self, embedding_dim, heads, seq_length, vocab_size, device, depth=5, num_classes=2):
        super().__init__()

        self.vocab_size = vocab_size
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(seq_length, embedding_dim)
        self.num_heads = heads
        self.device = device

        # sequence of transformers
        tblocks = []
        for i in range(depth):
            tblocks.append(nn.TransformerEncoderLayer(d_model=embedding_dim,
                                                            nhead=self.num_heads, 
                                                            batch_first=True, dropout=0.1))
        self.tblocks = nn.Sequential(*tblocks)
        
        # final linear layer
        self.last_linear = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """
        :param x: A (b, t) tensor of integer values representing
                  words (in some predetermined vocabulary).
        :return: A (b, c) tensor of log-probabilities over the
                 classes (where c is the nr. of classes).
        """
        # generate token embeddings
        tokens = self.token_emb(x)
        batch_size, token_size, embed_size = tokens.size()

        # generate position embeddings
        positions = torch.arange(token_size).to(self.device)
        positions = self.pos_emb(positions).expand(batch_size, token_size, embed_size).to(self.device)

        x = tokens + positions
        x = self.tblocks(x)

        # Average-pool over the t dimension and project to class
        # probabilities
        x = self.last_linear(x.mean(dim=1))
        return F.log_softmax(x, dim=1)

In [18]:
## init network and optimizer
# embedding_dim=50, heads=10, depth=2, 20 epochs -> 71.17 accuracy
# embedding_dim=50, heads=10, depth=2, 15 epochs -> 67.70 accuracy
# embedding_dim=50, heads=5, depth=2, 50 epochs -> 69.34
# embedding_dim=30, heads=3, depth=2, 50 epochs -> 70.18
# embedding_dim=30, heads=5, depth=1, 80 epochs -> 69.90
# embedding_dim=50, heads=10, depth=2, 200 epochs -> 70.18 accuracy

my_classification_transformer = MyClassificationTransformer(embedding_dim=50, heads=10, 
                                            seq_length=len_counter, vocab_size=len(counter_new),
                                            device=device, depth=2, num_classes=2).to(device)

optimizer = torch.optim.Adam(my_classification_transformer.parameters(), lr=0.001)
criterion = nn.NLLLoss()

In [27]:
# training
from tqdm.notebook import tqdm

num_epochs = 20

for epoch in range(num_epochs):
    print("epoch: ", epoch)
    training_loss = 0
    for x,y in tqdm(dataloader, total=len(X_train)//BATCH_SIZE):

        optimizer.zero_grad()
        out = my_classification_transformer(x)
        loss = criterion(out, y)
        training_loss += loss
        loss.backward()
        optimizer.step()
    
    print("training_loss: ", training_loss)

epoch:  0


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1495, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  1


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1481, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  2


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1498, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  3


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1486, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  4


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1485, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  5


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1481, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  6


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1508, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  7


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1471, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  8


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1486, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  9


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1494, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  10


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1515, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  11


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1486, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  12


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1496, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  13


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1509, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  14


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1499, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  15


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1484, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  16


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1493, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  17


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1483, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  18


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1483, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  19


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1479, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  20


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1496, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  21


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1478, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  22


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1481, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  23


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1492, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  24


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1488, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  25


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1494, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  26


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1875, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  27


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1488, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  28


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1486, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  29


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1472, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  30


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1491, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  31


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1498, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  32


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1488, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  33


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1505, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  34


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1481, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  35


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1495, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  36


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1507, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  37


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1492, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  38


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1488, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  39


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1484, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  40


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1496, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  41


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1487, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  42


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1490, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  43


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1474, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  44


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1483, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  45


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1518, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  46


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1477, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  47


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1505, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  48


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1473, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  49


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.4051, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  50


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(3.6222, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  51


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.4591, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  52


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.4647, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  53


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.4199, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  54


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1837, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  55


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1335, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  56


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1449, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  57


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1329, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  58


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1297, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  59


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1295, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  60


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1228, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  61


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1246, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  62


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1187, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  63


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1193, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  64


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1137, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  65


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1052, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  66


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1199, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  67


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0961, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  68


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0850, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  69


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0828, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  70


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0800, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  71


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0781, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  72


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0772, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  73


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0768, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  74


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0778, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  75


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0769, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  76


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0762, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  77


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0757, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  78


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0753, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  79


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0758, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  80


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0816, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  81


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0754, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  82


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0744, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  83


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0761, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  84


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0789, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  85


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0760, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  86


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0740, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  87


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0729, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  88


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0693, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  89


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0707, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  90


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0632, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  91


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0795, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  92


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0592, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  93


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0519, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  94


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(1.7308, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  95


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(5.1599, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  96


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(3.7987, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  97


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(4.3883, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  98


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(4.3746, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  99


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(3.2687, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  100


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(1.7991, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  101


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.7831, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  102


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.4230, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  103


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.3146, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  104


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1658, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  105


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1687, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  106


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0843, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  107


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0911, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  108


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0684, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  109


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0704, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  110


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0691, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  111


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0956, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  112


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0635, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  113


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0864, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  114


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0678, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  115


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0575, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  116


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0560, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  117


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0527, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  118


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0483, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  119


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0651, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  120


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0517, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  121


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0525, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  122


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0927, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  123


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0537, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  124


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0500, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  125


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0475, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  126


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0716, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  127


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0440, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  128


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0539, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  129


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0518, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  130


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0480, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  131


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0458, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  132


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0494, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  133


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0446, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  134


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0728, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  135


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0559, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  136


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0483, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  137


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0450, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  138


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0456, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  139


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0417, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  140


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0429, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  141


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0354, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  142


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0581, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  143


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0418, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  144


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0488, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  145


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0424, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  146


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0424, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  147


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0390, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  148


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0321, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  149


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0478, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  150


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0438, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  151


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0365, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  152


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0368, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  153


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0308, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  154


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0297, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  155


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0257, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  156


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0226, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  157


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0255, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  158


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0126, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  159


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0688, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  160


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.2472, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  161


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.7410, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  162


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1648, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  163


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0508, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  164


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0435, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  165


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0396, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  166


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0338, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  167


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0244, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  168


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0228, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  169


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0308, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  170


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0227, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  171


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0165, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  172


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0237, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  173


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0253, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  174


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0090, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  175


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0055, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  176


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0052, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  177


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0035, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  178


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0024, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  179


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0023, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  180


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0018, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  181


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0020, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  182


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0015, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  183


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0013, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  184


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0896, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  185


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0326, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  186


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0055, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  187


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0738, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  188


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0106, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  189


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0303, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  190


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0034, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  191


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0095, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  192


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1448, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  193


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.1219, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  194


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0912, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  195


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0704, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  196


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0418, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  197


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0030, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  198


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0025, device='cuda:0', grad_fn=<AddBackward0>)
epoch:  199


  0%|          | 0/66 [00:00<?, ?it/s]

training_loss:  tensor(0.0548, device='cuda:0', grad_fn=<AddBackward0>)


In [28]:
# test on test data
test_iter = list(zip(X_test, y_test))
test_loader = DataLoader(test_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

correct = []
for x,y in tqdm(test_loader, total=len(X_test)//BATCH_SIZE):
    
    out = my_classification_transformer(x)
    # transform with exp due to logsoftmax
    out_exp = torch.exp(out)
    guessed_class = out_exp.argmax(dim=1)
    correct.append(guessed_class == y)

  0%|          | 0/16 [00:00<?, ?it/s]

In [29]:
c = torch.cat(correct)
print("test accuracy: ", torch.sum(c) / len(c))

test accuracy:  tensor(0.7018, device='cuda:0')


In [30]:
### base rate is 50:50, so just guessing

np.sum(y_test)/len(y_test)

0.5021097046413502