In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pprint
import math
import numpy as np
import time

In [2]:
from functions import *

# Torch Module

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A ``(maxlen, 1, emb_size)`` table is precomputed once and registered as a
    non-trainable buffer, so it follows the module across ``.to(...)`` calls
    and is stored in the ``state_dict``. Even feature columns hold sines and
    odd feature columns hold cosines of ``position * frequency``.

    Args:
        emb_size: Width of the token embeddings. Both even and odd sizes are
            supported.
        dropout: Dropout probability applied after the positions are added.
        maxlen: Number of positions precomputed in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 512):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even column,
        # so the cosine half is trimmed to fit. For even sizes the slice is a
        # no-op and the table is identical to the untrimmed computation.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis in the middle so the table broadcasts over
        # the second dimension of the input in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The first dimension of ``token_embedding`` is treated as the position
        axis: the table is sliced to ``token_embedding.size(0)`` rows.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(emb_size)``.

    The module no longer moves itself to a notebook-level ``device`` global
    inside ``__init__``; placement is the owner's job (calling ``.to(...)`` on
    this module or on the model that contains it moves the embedding table).

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size: int):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        """Embed ``tokens`` (cast to int64 first) and apply the sqrt scaling."""
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class Autoregressive(nn.Module):
    """Transformer-encoder stack that predicts one token from a sequence.

    Tokens are embedded, given sinusoidal positions, passed through
    ``num_layers`` ``nn.TransformerEncoderLayer`` blocks and the representation
    at the last position is projected to vocabulary logits.

    Args:
        vocab_size: Size of the vocabulary (embedding rows and output logits).
        embed_size: Width of the token embeddings.
        hidden_size: ``d_model`` of the transformer blocks and input width of
            the output projection. Must equal ``embed_size`` because the
            embeddings are fed to the blocks without any projection.
        num_layers: Number of stacked encoder layers.
        nhead: Attention heads per encoder layer (default 4, the value that
            was previously hard-coded).
        dropout: Dropout used by the positional encoding and encoder layers
            (default 0.1, the value that was previously hard-coded).

    Raises:
        ValueError: If ``embed_size`` and ``hidden_size`` differ.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers,
                 nhead: int = 4, dropout: float = 0.1):
        super(Autoregressive, self).__init__()

        # Fail at construction time instead of with an opaque shape error in
        # the first forward pass.
        if embed_size != hidden_size:
            raise ValueError(
                f'embed_size ({embed_size}) must equal hidden_size ({hidden_size}): '
                'embeddings are passed to the transformer blocks unchanged'
            )

        # Sub-modules are created without touching any global device; the
        # owner moves the assembled model with a single ``model.to(...)``.
        # Embedding layer
        self.embedding = TokenEmbedding(vocab_size, embed_size)
        # positional Encoding
        self.positional_encoding = PositionalEncoding(embed_size, dropout=dropout)
        # Transformer blocks
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, dropout=dropout)
            for _ in range(num_layers)
        ])

        # Fully connected layer for prediction
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_seq):
        """Return vocabulary logits computed from the last sequence position.

        ``input_seq`` is indexed sequence-first: after the encoder blocks the
        slice ``[-1, :, :]`` (last step along dimension 0) feeds the output
        projection. No attention mask is applied.
        """
        # Embedding input sequence
        hidden = self.embedding(input_seq)
        hidden = self.positional_encoding(hidden)
        # Transformer blocks
        for transformer_block in self.transformer_blocks:
            hidden = transformer_block(hidden)

        # Prediction from the final position only
        return self.fc(hidden[-1, :, :])

# Configuration

In [6]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
device

device(type='cuda')

In [8]:
######################################################################
# Hyper-parameters, model construction, loss function (cross-entropy)
# and optimizer (Adam) used for training.
#
torch.manual_seed(0)

VOCAB_SIZE = 151#vocab.num_words+1
EMB_SIZE = 512
# NOTE(review): NHEAD is kept for reference but is not passed to the model.
NHEAD = 8
# Passed as Autoregressive's `hidden_size` (the transformer d_model).
FFN_HID_DIM = 512
BATCH_SIZE = 128
# The fourth positional parameter of Autoregressive is `num_layers`. NHEAD used
# to be passed in that slot; NUM_LAYERS carries the same value (8) so the
# architecture and parameter count are unchanged and the argument is now named
# for what it controls.
NUM_LAYERS = 8

model = Autoregressive(VOCAB_SIZE, EMB_SIZE, FFN_HID_DIM, NUM_LAYERS)

# Xavier-initialise every weight matrix; 1-D parameters (biases, norm scales)
# keep their default initialisation.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

#model = nn.DataParallel(model)
model = model.to(device)
# Targets equal to 0 do not contribute to the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [9]:
def count_parameters(model):
    """Return the total number of elements in parameters that require grad."""
    trainable_elements = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_elements += param.numel()
    return trainable_elements

# Report the trainable-parameter count of the instantiated model, formatted
# with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 25,373,847 trainable parameters


# Training

In [12]:
import pickle
def save_to_file(objeto, nome_arquivo):
    """Pickle ``objeto`` into the file ``nome_arquivo``, replacing its contents.

    The highest pickle protocol available to the running interpreter is used.
    """
    with open(nome_arquivo, mode='wb') as sink:
        writer = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        writer.dump(objeto)


def load_file(nome_arquivo):
    """Read the file ``nome_arquivo`` and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file; only use this on files from a trusted source.
    with open(nome_arquivo, mode='rb') as source:
        return pickle.Unpickler(source).load()

def chunks(lst, n):
    """Generate consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[begin:begin + n] for begin in range(0, len(lst), n))

# Directory (relative to the working directory) that holds the pickled inputs.
sub_path = './'

# Vocabulary object; later cells read its `index2word` mapping.
# NOTE(review): every file below is unpickled -- only load trusted files.
vocab = load_file(sub_path+'vocab.pkl')

# Training inputs and their targets.
src = load_file(sub_path+'src.pkl')
trg = load_file(sub_path+'trg.pkl')

# Validation inputs and their targets.
srcVal = load_file(sub_path+'srcVal.pkl')
trgVal = load_file(sub_path+'trgVal.pkl')

In [14]:
# Sanity check of the loaded tensors. The recorded output shows 2-D inputs and
# 1-D targets whose length matches the inputs' second dimension.
src.shape, trg.shape, srcVal.shape, trgVal.shape

(torch.Size([60, 1341439]),
 torch.Size([1341439]),
 torch.Size([60, 335448]),
 torch.Size([335448]))

In [16]:
import gc

# Mini-batch size read by train() and evaluate() in this cell.
# NOTE(review): the configuration cell defines a separate BATCH_SIZE constant
# with the same value; consider keeping only one.
BATCHSIZE = 128

def chunks(lst, n):
    """Yield ``lst`` piece by piece, ``n`` items at a time (last piece may be shorter)."""
    # NOTE(review): a `chunks` helper with the same behaviour is also defined
    # in the data-loading cell; this later definition shadows it.
    for offset in range(0, len(lst), n):
        upper = offset + n
        yield lst[offset:upper]

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated.
    """
    minutes = int((end_time - start_time) / 60)
    seconds = int(end_time - start_time - (minutes * 60))
    return minutes, seconds

def train(model, optimizer, criterion, src, trg, batch_size=None):
    """Run one optimisation pass over ``src``/``trg`` and return the mean batch loss.

    Args:
        model: Module called as ``model(batch)``; switched to training mode.
        optimizer: Optimizer stepped once per mini-batch.
        criterion: Loss called as ``criterion(logits, targets)``.
        src: Input tensor; mini-batches are taken along dimension 1.
        trg: Target tensor; mini-batches are taken along dimension 0.
        batch_size: Examples per mini-batch. ``None`` (default) uses the
            notebook-level ``BATCHSIZE`` constant.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when
        ``src`` holds no examples. (The sum used to be divided by the batch
        size rather than the number of batches, which inflated the value
        reported as the epoch loss.)
    """
    if batch_size is None:
        batch_size = BATCHSIZE
    # Follow the model's own placement instead of relying on a global device.
    target_device = next(model.parameters()).device

    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for start in range(0, src.shape[1], batch_size):
        stop = start + batch_size
        optimizer.zero_grad()
        output = model(src[:, start:stop].contiguous().to(target_device))
        loss = criterion(output.view(-1, output.shape[-1]),
                         trg[start:stop].to(target_device).view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1

    # Housekeeping once per epoch: doing this after every batch only slows
    # training down, since the caching allocator reuses freed blocks anyway.
    gc.collect()
    torch.cuda.empty_cache()
    return epoch_loss / num_batches if num_batches else 0.0

def evaluate(model, criterion, src, trg, batch_size=None):
    """Compute the mean batch loss over ``src``/``trg`` without updating the model.

    Args:
        model: Module called as ``model(batch)``; switched to eval mode.
        criterion: Loss called as ``criterion(logits, targets)``.
        src: Input tensor; mini-batches are taken along dimension 1.
        trg: Target tensor; mini-batches are taken along dimension 0.
        batch_size: Examples per mini-batch. ``None`` (default) uses the
            notebook-level ``BATCHSIZE`` constant.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when
        ``src`` holds no examples. (The sum used to be divided by the batch
        size rather than the number of batches.)
    """
    if batch_size is None:
        batch_size = BATCHSIZE
    # Follow the model's own placement instead of relying on a global device.
    target_device = next(model.parameters()).device

    model.eval()
    epoch_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for start in range(0, src.shape[1], batch_size):
            stop = start + batch_size
            output = model(src[:, start:stop].contiguous().to(target_device))
            loss = criterion(output.view(-1, output.shape[-1]),
                             trg[start:stop].to(target_device).view(-1))
            epoch_loss += loss.item()
            num_batches += 1

    # Housekeeping once per pass rather than once per batch.
    gc.collect()
    torch.cuda.empty_cache()
    return epoch_loss / num_batches if num_batches else 0.0

In [22]:
def decode(model, src, maxlen=30, eos_token=2):
    """Greedily generate token ids by repeatedly extending ``src``.

    At every step the model is run on the current context, the arg-max token
    is appended to the response and to the context (along dimension 0), and
    generation stops on ``eos_token`` or after ``maxlen`` tokens.

    The context used to be rebuilt from the notebook globals ``srcVal`` and
    ``rnd`` plus only the most recent token, which discarded all earlier
    generated tokens and ignored the ``src`` argument after the first step.

    Args:
        model: Module called as ``model(context)``. It is put in eval mode for
            the duration of the call and restored to its previous mode after.
        src: Starting context holding a single sequence; the arg-max is taken
            over the squeezed model output, so one sequence is expected.
        maxlen: Upper bound on the number of generated tokens (at least one
            token is always produced).
        eos_token: Token id that terminates generation (default 2, the value
            that was previously hard-coded).

    Returns:
        List of generated token ids as Python ints, including the terminating
        ``eos_token`` when it was produced.
    """
    # Follow the model's own placement instead of relying on a global device.
    target_device = next(model.parameters()).device
    context = src.to(target_device)
    response = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            while True:
                output = model(context)
                token = int(output.squeeze().argmax().item())
                response.append(token)
                if token == eos_token or len(response) >= maxlen:
                    break
                next_step = torch.tensor([[token]], dtype=context.dtype,
                                         device=target_device)
                context = torch.cat((context, next_step), dim=0)
    finally:
        model.train(was_training)
    return response

In [32]:
# Pick one of the first 99 validation columns at random and decode from it.
# NOTE(review): NumPy's RNG is not seeded anywhere in this notebook, so the
# chosen example changes between runs.
rnd = np.random.randint(1,100)
o = decode(model, srcVal[:,rnd-1:rnd].to(device),60)
# Map the generated ids back to words through the vocabulary.
# NOTE(review): the recorded run of this cell ended in a CUDA device-side
# assert; the cause is not visible from this cell -- investigate before relying
# on this output.
print(' '.join( vocab.index2word[i] for i in o))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
N_EPOCHS = 1000
# NOTE(review): CLIP is not consumed below; train() clips gradients with its
# own constant.
CLIP = 1
path = './'
best_valid_loss = float('inf')

checkpoint_path = path+'math-bert-model.pt'
log_path = path+"modelTrainingOutput.txt"

# Best-effort resume. A missing checkpoint is the normal "first run" case; any
# other failure (e.g. an incompatible state dict) still falls back to training
# from scratch but reports why. KeyboardInterrupt/SystemExit are no longer
# swallowed, and map_location lets a GPU-saved checkpoint load on the current
# device.
try:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print('Model Loaded Successfully!')
except FileNotFoundError:
    print('No model loaded, starting training from scratch.')
except Exception as exc:
    print('No model loaded, starting training from scratch.')
    print(f'\tReason: {exc!r}')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    print('Start training',epoch)
    train_loss = train(model, optimizer, criterion, src, trg)
    print('Validating...',epoch)
    valid_loss = evaluate(model, criterion, srcVal, trgVal)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), checkpoint_path)

    # Build the epoch summary once, then emit it to stdout and to the log.
    epoch_report = [
        f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}',
        f'\t Val. Loss: {valid_loss:.3f} |    Val. PPL: {math.exp(valid_loss):7.3f}',
    ]
    for line in epoch_report:
        print(line)
    with open(log_path, "a") as textFile:
        textFile.writelines(line + '\n' for line in epoch_report)

    # Every tenth epoch, decode one random validation example as a spot check.
    if epoch % 10 == 0:
        with torch.no_grad():
            rnd = np.random.randint(1,100)
            sample = srcVal[:,rnd-1:rnd]
            o = decode(model, sample.to(device),60)
            query = ' '.join( vocab.index2word[i] for i in sample.squeeze().cpu().numpy())
            answer = ' '.join( vocab.index2word[i] for i in o)
        # These pieces already end in a newline: print() adds one more on
        # stdout, while the log receives them verbatim.
        sample_report = [
            'Testing:\n',
            '\t Query:\n\t'+query+'\n',
            '\t Answer:\n\t'+answer+'\n',
            '\n',
        ]
        for piece in sample_report:
            print(piece)
        with open(log_path, "a") as textFile:
            textFile.writelines(sample_report)