In [1]:
"""
In this notebook, I take the transformer example from the official documentation.
I then add utilities that record the output of every step.
Every tensor is printed together with the source line that produced it and a short explanation.
"""
%matplotlib inline
from tabulate import tabulate
from IPython.display import HTML as html_print

In [18]:
class StepLogger():
    """Collects labelled snapshots of intermediate values for later inspection.

    Every label keeps at most ``capacity`` snapshots, in insertion order;
    values added to a label that is already full are silently dropped.
    Labels (and the comment given the first time a label is seen) are
    remembered in first-seen order, which is the order the summaries use.
    """

    def __init__(self,capacity):
        """Create an empty logger that keeps up to ``capacity`` snapshots per label."""
        self.tensor_datas = {}      # label -> list of recorded values
        self.capacity = capacity
        self.added_labels = []      # labels in first-seen order
        self.comments = []          # comment for each label, parallel to added_labels

    def add_info(self,tensor_data,tensor_label,comment=""):
        """Record ``tensor_data`` under ``tensor_label``.

        ``comment`` is only stored the first time a label is seen. The value
        is stored by reference (no copy is made) and is ignored once the
        label already holds ``capacity`` snapshots.
        """
        if tensor_label not in self.added_labels:
            self.added_labels.append(tensor_label)
            self.comments.append(comment)

        # One uniform capacity check, so the first snapshot obeys it as well.
        snapshots = self.tensor_datas.setdefault(tensor_label, [])
        if len(snapshots) < self.capacity:
            snapshots.append(tensor_data)

    def get_default_summary(self,show_data=False,summary_count=1):
        """Print a summary of every label seen so far (see ``get_summary``)."""
        self.get_summary(self.added_labels,show_data,summary_count)

    def _snapshot(self, label, index):
        """Return ``(True, value)`` for the index-th snapshot of ``label``, else ``(False, None)``."""
        snapshots = self.tensor_datas.get(label, ())
        if index < len(snapshots):
            return True, snapshots[index]
        return False, None

    def to_html_summary(self,show_data=False,summary_count=1):
        """Build an HTML table of the first ``summary_count`` snapshots of every label.

        Tensors are shown as their size; the value itself is added when
        ``show_data`` is true. Non-tensor values are always shown. Labels that
        have no snapshot for a given step are skipped for that step.

        Returns a tuple ``(html_object, html_string)``.
        """
        print("summary_count",summary_count,"   self.capacity ",self.capacity)
        rows = []
        for i in range(summary_count):
            print(i," ------------------------------------------------")
            for label, comment in zip(self.added_labels, self.comments):
                found, value = self._snapshot(label, i)
                if not found:
                    # Fewer than i+1 values were recorded for this label.
                    continue
                rows.append(["-----","-----"])
                rows.append(["",""])
                rows.append(["",""])
                rows.append([label,""])
                rows.append(["    "+comment,""])

                is_tensor = torch.is_tensor(value)
                if is_tensor:
                    rows.append(["", list(value.size())])
                if show_data or not is_tensor:
                    rows.append(["", value])
        tabulated = tabulate(rows,headers=['Variable', 'Data'], tablefmt='html')
        to_html = html_print(tabulated)
        return to_html ,tabulated

    def get_summary(self,labels,show_data=False,summary_count=1):
        """Print the first ``summary_count`` snapshots of each label in ``labels``.

        Tensors are printed as their size; the value itself is printed when
        ``show_data`` is true. Non-tensor values are always printed. Labels
        that have no snapshot for a given step (or were never recorded) are
        skipped for that step.
        """
        print("summary_count",summary_count,"   self.capacity ",self.capacity)
        for i in range(summary_count):
            print(i," ------------------------------------------------")
            for label in labels:
                found, value = self._snapshot(label, i)
                if not found:
                    continue
                print(label)
                is_tensor = torch.is_tensor(value)
                if is_tensor:
                    print( list(value.size() ) )
                if show_data or not is_tensor:
                    print(value)

In [19]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> transformer encoder -> linear decoder.

    Every intermediate value is reported to the notebook-level ``step_logger``
    so that its shape and content can be inspected afterwards. The shapes
    quoted in the comments below are the ones observed in this notebook's run.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        """Build the sub-modules.

        ntoken  -- size of the embedding table and of the decoder output
        ninp    -- embedding / model dimension
        nhead   -- number of attention heads
        nhid    -- feed-forward dimension of each encoder layer
        nlayers -- number of stacked encoder layers
        dropout -- dropout probability passed to the sub-modules
        """
        super().__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        # Cached attention mask; rebuilt in forward() whenever the length changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(ninp, nhead, nhid, dropout),
            nlayers,
        )
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        allowed = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        step_logger.add_info(allowed,"TransformerModel _generate_square_subsequent_mask","mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)")
        # [35, 35] boolean, True on and below the diagonal

        additive = allowed.float()
        additive = additive.masked_fill(allowed == 0, float('-inf'))
        additive = additive.masked_fill(allowed == 1, float(0.0))
        step_logger.add_info(additive,"TransformerModel _generate_square_subsequent_mask 2","mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))")
        # [35, 35] float, rows look like [0., 0., ..., -inf, -inf]

        return additive

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, src):
        """Map token ids ``src`` to per-position scores over the ``ntoken`` classes."""
        seq_len = len(src)
        mask_is_stale = self.src_mask is None or self.src_mask.size(0) != seq_len
        if mask_is_stale:
            fresh_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)
            step_logger.add_info(fresh_mask,"TransformerModel forward mask init ","self._generate_square_subsequent_mask(len(src)).to(device)")
            # [35, 35]
            self.src_mask = fresh_mask

        step_logger.add_info(src,"TransformerModel forward src","def forward(self, src)")
        # [35, 20] integer token ids
        step_logger.add_info(self.ninp,"TransformerModel forward ninp")
        # 200

        # Embedding lookup, scaled by sqrt(ninp).
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        step_logger.add_info(embedded,"TransformerModel forward encoder","self.encoder = nn.Embedding(ntoken, ninp)")
        # [35, 20, 200]

        positioned = self.pos_encoder(embedded)
        step_logger.add_info(positioned,"TransformerModel forward pos_encoder","src = self.pos_encoder(src)")
        # [35, 20, 200]

        encoded = self.transformer_encoder(positioned, self.src_mask)
        step_logger.add_info(encoded,"TransformerModel transformer_encoder output","output = self.transformer_encoder(src, self.src_mask)")
        # [35, 20, 200]

        # Linear projection from the model dimension to ntoken scores.
        scores = self.decoder(encoded)
        step_logger.add_info(scores,"TransformerModel decoder output","output = self.decoder(output)")
        # [35, 20, 28871]

        return scores

In [20]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position values to its input and then applies dropout.

    A table ``pe`` of shape (max_len, 1, d_model) is built once in
    ``__init__`` and registered as a buffer. Even feature columns hold sines,
    odd feature columns hold cosines. Intermediate values are reported to the
    notebook-level ``step_logger``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """Precompute the position table.

        d_model -- size of the last (feature) dimension; may be odd or even
        dropout -- dropout probability applied in ``forward``
        max_len -- number of positions the table covers
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        step_logger.add_info(div_term,"PositionalEncoding  div_term","div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))")

        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the cosine input is trimmed to fit. For an even d_model the slice
        # is the whole tensor and nothing changes.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        step_logger.add_info(pe,"PositionalEncoding  pe")

        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        step_logger.add_info(pe,"PositionalEncoding  pe unsqueeze","pe = pe.unsqueeze(0).transpose(0, 1)")

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        The first dimension of ``x`` selects the positions; in this notebook
        ``x`` was observed with shape [35, 20, 200].
        """
        step_logger.add_info(x,"PositionalEncoding  forward","def forward(self, x):")
        # [35, 20, 200]

        x = x + self.pe[:x.size(0), :]
        step_logger.add_info(x,"PositionalEncoding  forward self.pe","x = x + self.pe[:x.size(0), :]")
        # [35, 20, 200]

        return self.dropout(x)

In [21]:
import torchtext
from torchtext.data.utils import get_tokenizer
# Field describing how raw text is processed: spaCy tokenizer, lower-casing,
# and '<sos>' / '<eos>' as the init and end-of-sequence tokens.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("spacy"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
# WikiText2 train / validation / test splits, all read through the same field.
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
# The vocabulary is built from the training split only.
TEXT.build_vocab(train_txt)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    """Turn the token stream of ``data`` into ``bsz`` parallel columns on ``device``.

    The text of the first example is numericalized with ``TEXT``, the tail that
    does not fill a whole row of ``bsz`` items is dropped, and the remainder is
    laid out so that each column is one contiguous chunk of the stream.
    Returns a contiguous tensor with ``bsz`` columns.
    """
    token_ids = TEXT.numericalize([data.examples[0].text])
    # Largest multiple of bsz that fits; the leftover tokens are discarded.
    usable = (token_ids.size(0) // bsz) * bsz
    chunks = token_ids.narrow(0, 0, usable).view(bsz, -1)
    # Transpose so that each of the bsz chunks becomes a column.
    return chunks.t().contiguous().to(device)

# Number of parallel columns produced by batchify: 20 for training, 10 for
# validation and test.
batch_size = 20
eval_batch_size = 10
# Each split becomes a 2-D tensor of token ids with one column per batch element.
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

In [22]:

# Peek at tokens 30..49 of the training text and report its total token count.
print(vars(train_txt.examples[0])["text"][30:50])
print(len(vars(train_txt.examples[0])["text"]))

# Same peek for the test text.
print(vars(test_txt.examples[0])["text"][30:50])
print(len(vars(test_txt.examples[0])["text"]))

['the', 'battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', ',', 'is', 'a', 'tactical', 'role', '@-@']
2236652
['guest', '@-@', 'starring', 'role', 'on', 'the', 'television', 'series', 'the', 'bill', 'in', '2000', '.', 'this', 'was', 'followed', 'by', 'a', 'starring', 'role']
280576


In [23]:
bptt = 35
def get_batch(source, i):
    """Cut one input window and its targets out of ``source``, starting at row ``i``.

    The window holds ``bptt`` rows, or fewer near the end of ``source`` so that
    a following row always exists for the targets. The targets are the rows one
    step after the inputs, flattened to 1-D.

    In this notebook: source [111832, 20] -> data [35, 20], target [700].
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt <= rows_left else rows_left
    stop = i + seq_len

    inputs = source[i:stop]
    # Same window shifted by one row.
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.view(-1)

In [24]:
# Fetch the first training window and show the size of its flattened targets.
data, targets = get_batch(train_data, 0)
targets.size()
#data

torch.Size([700])

In [25]:
# Hyper-parameters of the model built below.
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
# The logger must exist before the model is constructed: PositionalEncoding.__init__
# already writes to step_logger. At most 2 snapshots are kept per label.
step_logger = StepLogger(2)
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)


In [26]:
# Show the overall shape of the batchified training data and its first row
# (one token id per batch column).
print(train_data.size() )
print(train_data[0] )
# Disabled: lists the (batch index, start row) pairs that the training loop iterates over.
#for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
#  print(batch,i)

torch.Size([111832, 20])
tensor([   14,   148,    36,     6,     8,  1851,   683,    11,     6,   296,
        25714, 17163,   256,    19,   326,     5,  2670,    22,     4,    22],
       device='cuda:0')


In [27]:
# Loss, optimizer and learning-rate schedule shared by train() and evaluate().
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# StepLR with step size 1.0 and gamma 0.95: the rate is multiplied by 0.95 at
# each scheduler.step(), which the epoch loop calls once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    """Train ``model`` for one pass over ``train_data`` and print running statistics.

    Everything is read from notebook globals: ``model``, ``train_data``,
    ``bptt``, ``get_batch``, ``criterion``, ``optimizer``, ``step_logger``,
    ``TEXT`` and ``epoch``. ``epoch`` is only used in the progress line, so it
    must be bound (by the epoch loop) before this function is called.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi) #28871
    log_interval = 200  # print statistics every this many batches
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)

        optimizer.zero_grad()

        step_logger.add_info(data,"train data")
        # [35, 20] token ids

        # Targets are the inputs shifted by one row, flattened.
        step_logger.add_info(targets,"train targets")
        # [700]

        output = model(data)
        step_logger.add_info(output,"train output")
        # [35, 20, 28871]

        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # Read the rate the optimizer is actually using. scheduler.get_lr()
            # is not meant to be called outside scheduler.step() and reports a
            # wrong value once the rate has been decayed.
            current_lr = optimizer.param_groups[0]['lr']
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, current_lr,
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            # Start a fresh averaging window.
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    """Return the average loss of ``eval_model`` over ``data_source``.

    The model is switched to evaluation mode and run without gradient
    tracking on consecutive windows from ``get_batch``. Each window's loss is
    weighted by its number of rows, and the sum is divided by
    ``len(data_source) - 1``.
    """
    eval_model.eval() # Turn on the evaluation mode
    vocab_size = len(TEXT.vocab.stoi)
    weighted_loss = 0.
    with torch.no_grad():
        for start in range(0, data_source.size(0) - 1, bptt):
            inputs, expected = get_batch(data_source, start)
            scores = eval_model(inputs).view(-1, vocab_size)
            window_loss = criterion(scores, expected).item()
            weighted_loss += len(inputs) * window_loss
    return weighted_loss / (len(data_source) - 1)

In [28]:
import copy

best_val_loss = float("inf")
epochs = 1 # The number of epochs
best_model = None

# One iteration per epoch: train, validate, report, keep the best model so far,
# then decay the learning rate.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Keep an independent snapshot. A plain `best_model = model` would only
        # alias the live model, which later epochs keep updating.
        best_model = copy.deepcopy(model)

    scheduler.step()



| epoch   1 |   200/ 3195 batches | lr 5.00 | ms/batch 17.22 | loss  7.32 | ppl  1511.88
| epoch   1 |   400/ 3195 batches | lr 5.00 | ms/batch 16.91 | loss  6.27 | ppl   528.88
| epoch   1 |   600/ 3195 batches | lr 5.00 | ms/batch 16.96 | loss  5.97 | ppl   391.32
| epoch   1 |   800/ 3195 batches | lr 5.00 | ms/batch 17.00 | loss  5.80 | ppl   330.58
| epoch   1 |  1000/ 3195 batches | lr 5.00 | ms/batch 17.07 | loss  5.81 | ppl   332.22
| epoch   1 |  1200/ 3195 batches | lr 5.00 | ms/batch 17.14 | loss  5.75 | ppl   313.47
| epoch   1 |  1400/ 3195 batches | lr 5.00 | ms/batch 17.17 | loss  5.72 | ppl   303.55
| epoch   1 |  1600/ 3195 batches | lr 5.00 | ms/batch 17.25 | loss  5.61 | ppl   272.41
| epoch   1 |  1800/ 3195 batches | lr 5.00 | ms/batch 17.37 | loss  5.63 | ppl   278.93
| epoch   1 |  2000/ 3195 batches | lr 5.00 | ms/batch 17.40 | loss  5.63 | ppl   279.25
| epoch   1 |  2200/ 3195 batches | lr 5.00 | ms/batch 17.45 | loss  5.57 | ppl   261.33
| epoch   1 |  2400/ 

In [29]:
# Print, for the first recorded step, every logged label followed by its tensor
# size (or the value itself for non-tensor entries).
step_logger.get_default_summary()

summary_count 1    self.capacity  2
0  ------------------------------------------------
PositionalEncoding  div_term
[100]
PositionalEncoding  pe
[5000, 200]
PositionalEncoding  pe unsqueeze
[5000, 1, 200]
train data
[35, 20]
train targets
[700]
TransformerModel _generate_square_subsequent_mask
[35, 35]
TransformerModel _generate_square_subsequent_mask 2
[35, 35]
TransformerModel forward mask init 
[35, 35]
TransformerModel forward src
[35, 20]
TransformerModel forward ninp
200
TransformerModel forward encoder
[35, 20, 200]
PositionalEncoding  forward
[35, 20, 200]
PositionalEncoding  forward self.pe
[35, 20, 200]
TransformerModel forward pos_encoder
[35, 20, 200]
TransformerModel transformer_encoder output
[35, 20, 200]
TransformerModel decoder output
[35, 20, 28871]
train output
[35, 20, 28871]


In [30]:
# Build the table for the first recorded step including the values themselves
# (show_data=True). to_html_summary returns (HTML object, raw html string), so
# h[0] is the renderable object.
h = step_logger.to_html_summary(True)
display(h[0])

summary_count 1    self.capacity  2
0  ------------------------------------------------


Variable,Data
-----,-----
,
,
PositionalEncoding div_term,
"div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))",
,[100]
,"tensor([1.0000e+00, 9.1201e-01, 8.3176e-01, 7.5858e-01, 6.9183e-01, 6.3096e-01,  5.7544e-01, 5.2481e-01, 4.7863e-01, 4.3652e-01, 3.9811e-01, 3.6308e-01,  3.3113e-01, 3.0200e-01, 2.7542e-01, 2.5119e-01, 2.2909e-01, 2.0893e-01,  1.9055e-01, 1.7378e-01, 1.5849e-01, 1.4454e-01, 1.3183e-01, 1.2023e-01,  1.0965e-01, 1.0000e-01, 9.1201e-02, 8.3176e-02, 7.5858e-02, 6.9183e-02,  6.3096e-02, 5.7544e-02, 5.2481e-02, 4.7863e-02, 4.3652e-02, 3.9811e-02,  3.6308e-02, 3.3113e-02, 3.0200e-02, 2.7542e-02, 2.5119e-02, 2.2909e-02,  2.0893e-02, 1.9055e-02, 1.7378e-02, 1.5849e-02, 1.4454e-02, 1.3183e-02,  1.2023e-02, 1.0965e-02, 1.0000e-02, 9.1201e-03, 8.3176e-03, 7.5858e-03,  6.9183e-03, 6.3096e-03, 5.7544e-03, 5.2481e-03, 4.7863e-03, 4.3652e-03,  3.9811e-03, 3.6308e-03, 3.3113e-03, 3.0200e-03, 2.7542e-03, 2.5119e-03,  2.2909e-03, 2.0893e-03, 1.9055e-03, 1.7378e-03, 1.5849e-03, 1.4454e-03,  1.3183e-03, 1.2023e-03, 1.0965e-03, 1.0000e-03, 9.1201e-04, 8.3176e-04,  7.5858e-04, 6.9183e-04, 6.3096e-04, 5.7544e-04, 5.2481e-04, 4.7863e-04,  4.3652e-04, 3.9811e-04, 3.6308e-04, 3.3113e-04, 3.0200e-04, 2.7542e-04,  2.5119e-04, 2.2909e-04, 2.0893e-04, 1.9055e-04, 1.7378e-04, 1.5849e-04,  1.4454e-04, 1.3183e-04, 1.2023e-04, 1.0965e-04])"
-----,-----
,
,


In [31]:
# Score the model selected by validation loss on the held-out test split and
# report the loss together with its exponential (perplexity).
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  5.00 | test ppl   148.34
