### Sentiment analysis with an attention transformer
* sentiment bit adapted from: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
* transformer from: bloem
* data loader copy-paste from https://github.com/ben0it8/containerized-transformer-finetuning/blob/develop/research/finetune-transformer-on-imdb5k.ipynb

In [87]:
import torch
import torch.nn as nn
from torchtext import data, datasets, vocab

import numpy as np
import warnings
import re
import time
import os
import logging
import pandas as pd
import random
import sys
import math
import gzip

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from tqdm import tqdm_notebook as tqdm



eps = np.finfo(float).eps

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
# Used for converting between nats and bits
LOG2E = math.log2(math.e)
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)
NUM_CLS = 2
vocab_size = 50000

### Data loaders

In [91]:
tdata, _ = datasets.IMDB.splits(TEXT, LABEL)
train, test = tdata.split(split_ratio=0.8)

TEXT.build_vocab(train, max_size=vocab_size - 2) # - 2 to make space for <unk> and <pad>
LABEL.build_vocab(train)

train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=4, device=device)


In [92]:
print(f'- nr. of training examples {len(train_iter)}')
print(f'- nr. of validation examples {len(test_iter)}')

- nr. of training examples 5000
- nr. of validation examples 1250


In [93]:
mx = max([input.text[0].size(1) for input in train_iter])
mx = mx * 2
print(f'- maximum sequence length: {mx}')

- maximum sequence length: 4940


### Model definitions

In [112]:
class AttentionLayer(nn.Module):
    def __init__(self, k, num_heads=8):
        super(AttentionLayer, self).__init__()
        self.k = k
        self.num_heads = num_heads
        
        # determine queries, keys, values
        self.key_layer = nn.Linear(self.k, self.k * self.num_heads, bias=False)
        self.query_layer = nn.Linear(self.k, self.k * self.num_heads, bias=False)
        self.value_layer = nn.Linear(self.k, self.k * self.num_heads, bias=False)
        
        # project down all cat-ed heads
        self.unify_layer = nn.Linear(self.num_heads * k, k)
    
    def forward(self, x):
        # get batch size, t sentences of k items
        b_sz, t_sz, k_sz = x.size()
        h_sz = self.num_heads
        
        keys = self.key_layer(x).view(b_sz, t_sz, h_sz, self.k)
        queries = self.query_layer(x).view(b_sz, t_sz, h_sz, self.k)
        values = self.value_layer(x).view(b_sz, t_sz, h_sz, self.k)
    
        # compute dot products (k x k). Same op for every head, so fold in to the
        # batch dim
        # q, k, v, (b, t, h, k) -> (b, h, t, k) -> (bh, t, k)
        # and for the key (bh, t, k) -> (bh, k, t) to be able to use bmm
        #
        keys = keys.transpose(1, 2).contiguous().view(b_sz * h_sz, t_sz, k_sz)
        queries = queries.transpose(1, 2).contiguous().view(b_sz * h_sz, t_sz, k_sz)
        values = values.transpose(1, 2).contiguous().view(b_sz * h_sz, t_sz, k_sz)
        
        # intermediate scaling
        queries = queries / ( self.k  ** (1./4.))
        keys = keys / ( self.k  ** (1./4.))
        
        # final transpose for the bmm, out -> (b*h, t, t)
        raw_weights = torch.bmm(queries, keys.transpose(1, 2))
        
        # row wise softmax normalize
        weights = F.softmax(raw_weights, dim=2)
        
        # apply self attention to the values
        out = torch.bmm(weights, values).view(b_sz, h_sz, t_sz, k_sz)
        
        # Unify attention heads
        # reshuffle (b, h, t, k) -> (b, t, h, k) -> (b, t, h*k) with all the heads catted
        # ontop of each other to be able to down project
        out = out.transpose(1, 2).contiguous().view(b_sz, t_sz, h_sz * k_sz)
        
        # project down
        out = self.unify_layer(out)
        
        return out
    
    
class TransformerBlock(nn.Module):
    def __init__(self, k, num_heads):
        super(TransformerBlock, self).__init__()
        
        self.attention = AttentionLayer(k, num_heads)
        
        self.layer_norm1 = nn.LayerNorm(k)
        self.layer_norm2 = nn.LayerNorm(k)
        
        self.mlp = nn.Sequential(
            nn.Linear(k, 4 * k),
            nn.ReLU(),
            nn.Linear(4 * k, k)
        )
    
    def forward(self, x):
        # Attention block
        x_att = self.attention(x)
        # Residual + norm
        x = self.layer_norm1(x + x_att)
        # MLP
        x_mlp = self.mlp(x)
        out = self.layer_norm2(x + x_mlp)
        return out
    
    
class Transformer(nn.Module):
    def __init__(self, k, num_heads, depth, num_tokens, num_classes):
        super(Transformer, self).__init__()
        
        self.num_tokens = num_tokens
        
        # Embedding tokens and position layers
        self.token_embed_layer = nn.Embedding(num_tokens, k)
        
        # Transformer blocks
        self.tf_network = []
        for _ in range(depth):
            self.tf_network.append(TransformerBlock(k, num_heads))

        self.tf_network = nn.Sequential(*self.tf_network)
        
        # Sequence to class output
        self.output_layer = nn.Linear(k, num_classes)
        
    def forward(self, x):
        # in (b, t) tensor with int values representing words
        # out (b, c) tensor logprobs over c classes
        
        # generate token embeddings
        tokens = self.token_embed_layer(x)
        
        b_sz, t_sz, k_sz = tokens.size()
        
        # Transformer forward
        x = self.tf_network(tokens)
        
        # Average pool over t dimension and project to class probabilities
        x = self.output_layer(x.mean(dim=1))
        
        # Optional (auto-regressive) transformer
        # no looking ahead, enforce via mask, prior to softmax
#         indices = torch.triu_indices(t, t, offset=1)
#         x[:, indices[0], indices[1]] = float('-inf')
        
        out = F.log_softmax(x, dim=1)
        
        return out  

### Settings

In [113]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 128
NUM_CLS = 2
vocab_size = 50000
num_heads = 8
depth = 6
# k, num_heads, depth, seq_length, num_tokens, num_
model = Transformer(EMBED_DIM, num_heads, depth, vocab_size, NUM_CLS).to(device)

In [114]:
lr_warmup = 10000
batch_size = 4
lr = 1e-3
opt = torch.optim.Adam(lr=lr, params=model.parameters())
sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (lr_warmup / batch_size), 1.0))


### Main epoch transformer loop

In [115]:
for e in range(1):

    print(f'\n epoch {e}')
    model.train(True)

    for batch in tqdm(train_iter):

        opt.zero_grad()

        input = batch.text[0]
        label = batch.label - 1

        if input.size(1) > mx:
            input = input[:, :mx]
        out = model(input)
        loss = nn.functional.nll_loss(out, label)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
#         if arg.gradient_clipping > 0.0:
#             nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        sch.step()

    with torch.no_grad():

        model.train(False)
        tot, cor= 0.0, 0.0

        for batch in test_iter:

            input = batch.text[0]
            label = batch.label - 1

            if input.size(1) > mx:
                input = input[:, :mx]
            out = model(input).argmax(dim=1)

            tot += float(input.size(0))
            cor += float((label == out).sum().item())

        acc = cor / tot
        print(f'-- {"validation"} accuracy {acc:.3}')



 epoch 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# add position back
# run on gallager
# graph data