In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
print(torch.__version__)

1.10.0


# Data

In [3]:
import torchtext

In [4]:
dataset_train, dataset_test = torchtext.datasets.AG_NEWS()
print(len(dataset_train))
print(len(dataset_test))

120000
7600


In [21]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = torchtext.datasets.AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        text = "[SOS] " + text + " [EOS]"
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), min_freq=2,
                                  specials=["[PAD]", "[UNK]", "[SOS]", "[EOS]"])
vocab.set_default_index(vocab["[UNK]"])

In [27]:
#vocab(['here', 'is', 'an', 'example'])
vocab(["the"])
full_vocab = vocab.vocab.get_stoi().keys()
print(len(full_vocab))

53132


In [28]:
import random
import math
import copy

# copied from tutorial, added padding
def text_pipeline(x, max_len):
    x = "[SOS] " + x + " [EOS]"
    vocab_list = np.array(vocab(tokenizer(x)))[:max_len]
    k = len(vocab_list)
    missing_len = max_len - k
    missing_list = missing_len * vocab(["[PAD]"])
    # true labels
    labels = np.concatenate([copy.deepcopy(vocab_list), missing_list])
    labels = np.concatenate([labels[1:], vocab(["[PAD]"])])
    # save vector indicating paddings
    paddings = torch.cat([torch.zeros((k,)), torch.ones((len(missing_list),))])
    
    return(np.concatenate([vocab_list, missing_list]), labels, paddings)

print(text_pipeline('He married Mabel Scott in 1890, but they soon separated. Unable to get an English divorce, in 1900, he became the first celebrity to get one in Nevada, and remarried there, but the divorce was invalid in England. In June 1901, he was arrested for bigamy, and was convicted before the House of Lords, the last time a peer was convicted by the Lords.', 100))

(array([    8,    54,  6619, 47693,  2645,    12,     1,     6,    50,
          72,   749, 10620,     4,  4310,     9,   227,    35,  1889,
       13154,     6,    12,     1,     6,    54,  1363,     5,    52,
        7959,     9,   227,    66,    12,  6424,     6,    13,     1,
         234,     6,    50,     5, 13154,    40, 15493,    12,   320,
           4,    12,  1921, 30443,     6,    54,    40,   799,    16,
           1,     6,    13,    40,  2492,   172,     5,   441,    11,
        9521,     6,     5,    74,   106,    10,  8111,    40,  2492,
          29,     5,  9521,     4,     7,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0]), array([   54,  6619, 47693,  2645,    12,     1,     6,    50,    72,
         749, 10620,     4,  4310,     9,   227,    35,  1889, 13154,
           6,    12,     1,     6,    54,  1363,     5,    52,  7959,
   

In [33]:
# copied from tutorial, removed offsets
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

def collate_batch(batch):
    label_list, text_list, padding_list = [], [], []
    for _, text in batch:
        input_, label_, padding_ = text_pipeline(text, max_len=100)
        text_list.append(torch.tensor(input_, dtype=torch.int64))
        label_list.append(label_)
        padding_list.append(torch.tensor(padding_))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list).view(len(label_list), -1)
    padding_list = torch.cat(padding_list).view(len(label_list), -1)
    return text_list.to(device), label_list.to(device), padding_list

train_iter = dataset_train
BATCH_SIZE = 32
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_batch)

cpu


# GPT model

In [67]:
### build classifier transformer
import torch.nn.functional as F

class MyGPT(nn.Module):
    
    def __init__(self, embedding_dim, heads, seq_length, vocab_size, depth=5, num_classes=2):
        super().__init__()

        self.vocab_size = vocab_size
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(seq_length, embedding_dim)
        self.num_heads = heads
        indices = torch.triu_indices(seq_length, seq_length, offset=1)
        self.attn_mask = torch.zeros((seq_length, seq_length))
        self.attn_mask[indices[0], indices[1]] = float("-inf")

        # sequence of transformers
        self.tblocks = []
        for i in range(depth):
            self.tblocks.append(nn.TransformerDecoderLayer(d_model=embedding_dim,
                                                            nhead=self.num_heads, 
                                                            batch_first=True, dropout=0.1))
        #self.tblocks = nn.Sequential(*tblocks)
        
        # final linear layer
        self.last_linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x, paddings):
        # generate token embeddings
        tokens = self.token_emb(x)
        batch_size, token_size, embed_size = tokens.size()

        # generate position embeddings
        #positions = torch.arange(token_size)
        #positions = self.pos_emb(positions).expand(batch_size, token_size, embed_size)

        # x = tokens + positions
        x = tokens
        for block in self.tblocks:
            x = block(x, memory=x, tgt_mask=self.attn_mask, tgt_key_padding_mask=paddings)

        # predict next word with last_linear
        out = self.last_linear(x)
        out = out.transpose(2, 1)
        return out

# Training

In [68]:
my_gpt = MyGPT(embedding_dim=30, heads=5, 
                        seq_length=100, vocab_size=len(full_vocab),
                        depth=1)

optimizer = torch.optim.Adam(my_gpt.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [69]:
# training
from tqdm.notebook import tqdm

num_epochs = 20

for epoch in range(num_epochs):
    print("epoch: ", epoch)
    training_loss = 0
    for i, (x, y, paddings) in tqdm(enumerate(dataloader), total=len(dataset_train)//BATCH_SIZE):

        if i > 50: break
        optimizer.zero_grad()

        out = my_gpt(x, paddings)
        
        loss = criterion(out, y)
        training_loss += loss
        loss.backward()
        optimizer.step()
    
    print("training_loss: ", training_loss)

epoch:  0


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3750.0), HTML(value='')))

  padding_list.append(torch.tensor(padding_))
  label_list = torch.tensor(label_list, dtype=torch.int64)





KeyboardInterrupt: 