In [None]:
# ! pip install torchtext==0.6.0 pyvi spacy https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz rouge_score
# ! python -m spacy download en_core_web_sm

In [7]:
# ! git clone https://github.com/namnh194/transformer_pointer_generator_network.git

Cloning into 'transformer_pointer_generator_network'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 37 (delta 9), reused 32 (delta 7), pack-reused 0[K
Receiving objects: 100% (37/37), 29.77 KiB | 677.00 KiB/s, done.
Resolving deltas: 100% (9/9), done.


In [12]:
# refer: github.com/pbcquoc
from dataset import read_data, create_fields, create_dataset
from model import Transformer
from train_utils import ScheduledOptim, LabelSmoothingLoss, step, validiate
from inference import rouge_score, translate_sentence
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the "nam194/vietnews" corpus through the `datasets` library.
import datasets

# `dataset` is handed to read_data() in a later cell, which selects a split
# ('train' / 'validation') and the 'article' / 'abstract' columns from it.
dataset = datasets.load_dataset("nam194/vietnews")
dataset  # bare last expression so the notebook renders the split/column summary

DatasetDict({
    test: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 22498
    })
    train: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 99134
    })
    validation: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 22184
    })
})

In [13]:
# Run configuration shared by the data, model and training cells; the same
# mapping is also passed to wandb.init() as the run's config.
opt = dict(
    # Language/tokenizer identifiers forwarded to create_fields()
    src_lang='vi_core_news_lg',
    trg_lang='vi_core_news_lg',  # alternative tried earlier: 'vi_spacy_model'
    # Maximum sequence lengths: encoder (source) side and decoder (target) side
    en_max_strlen=256,
    de_max_strlen=64,
    batchsize=32,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Transformer size
    d_model=512,
    n_layers=6,
    heads=8,
    dropout=0.1,
    # NOTE(review): 'lr' is not read by any cell visible in this notebook
    lr=0.0001,
    epochs=20,
    # Report the running training loss every `printevery` iterations
    printevery=200,
    # Forwarded to rouge_score(); presumably the beam width - confirm in inference.py
    k=5,
)

In [9]:
# Extract (article, abstract) pairs from the train split first, then validation.
(train_src_data, train_trg_data), (valid_src_data, valid_trg_data) = (
    read_data(dataset, split, 'article', 'abstract')
    for split in ('train', 'validation')
)

# Source/target fields are created once and shared by both iterators.
SRC, TRG = create_fields(opt['src_lang'], opt['trg_lang'])

# Positional settings common to both create_dataset() calls; only the data
# and the `istrain` flag differ between the two splits.
iter_args = (
    opt['en_max_strlen'],
    opt['de_max_strlen'],
    opt['batchsize'],
    opt['device'],
    SRC,
    TRG,
)
train_iter = create_dataset(train_src_data, train_trg_data, *iter_args, istrain=True)
valid_iter = create_dataset(valid_src_data, valid_trg_data, *iter_args, istrain=False)

loading spacy tokenizers...
creating dataset and iterator... 
creating dataset and iterator... 


In [10]:
# Vocabulary index of the '<pad>' token on the source and target side,
# passed to step()/validiate() and to the loss as the padding index.
src_pad, trg_pad = (field.vocab.stoi['<pad>'] for field in (SRC, TRG))

In [14]:
# Hyper-parameters that are identical for the encoder and the decoder.
shared_hparams = {
    "d_model": opt['d_model'],
    "n_heads": opt['heads'],
    "d_ff": 2048,
    "num_layer": opt['n_layers'],
    "dropout": opt['dropout'],
}

# Encoder / decoder configs differ only in vocabulary size and maximum length.
en_config = {
    "vocab_size": len(SRC.vocab),
    "max_seq_len": opt['en_max_strlen'],
    **shared_hparams,
}
de_config = {
    "vocab_size": len(TRG.vocab),
    "max_seq_len": opt['de_max_strlen'],
    **shared_hparams,
}

# Convenience aliases for the most frequently inspected sizes.
batch_size = opt['batchsize']
en_seq_len, de_seq_len = en_config["max_seq_len"], de_config["max_seq_len"]
en_vocab_size, de_vocab_size = en_config["vocab_size"], de_config["vocab_size"]

model = Transformer(en_config, de_config)

# Report how many parameters will receive gradients.
n_trainable = sum(w.numel() for w in model.parameters() if w.requires_grad)
print(n_trainable)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; lower-rank parameters keep their default initialisation.
for weight in model.parameters():
    if weight.dim() <= 1:
        continue
    nn.init.xavier_uniform_(weight)

model = model.to(opt['device'])

271265810


In [67]:
import gc
from accelerate import Accelerator

# Adam wrapped by the project's ScheduledOptim. The trailing positional
# arguments are 0.2, the model width and 4000 (presumably an LR scale and the
# warm-up step count - confirm against train_utils.ScheduledOptim).
adam = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
optimizer = ScheduledOptim(adam, 0.2, opt['d_model'], 4000)

# Label-smoothed loss over the target vocabulary, with the pad index passed
# as padding_idx.
criterion = LabelSmoothingLoss(len(TRG.vocab), padding_idx=trg_pad, smoothing=0.1)

# Hand model, optimizer and both iterators to accelerate in one call and
# rebind the names to whatever it returns.
prepared = Accelerator().prepare(model, optimizer, train_iter, valid_iter)
model, optimizer, train_iter, valid_iter = prepared

# Release cached GPU blocks and unreachable Python objects before training.
torch.cuda.empty_cache()
gc.collect()

In [69]:
# Authenticate with Weights & Biases WITHOUT embedding the API key in the notebook.
# wandb.login() picks the key up from the WANDB_API_KEY environment variable or an
# existing ~/.netrc entry, and otherwise prompts for it interactively.
# SECURITY: a key was previously committed in this cell - revoke it in the W&B
# settings page and issue a new one.
import wandb

wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [71]:
import wandb

# Start a run in the "transformer_VNDS" project and record the `opt`
# hyper-parameters as the run's config.
wandb.init(project="transformer_VNDS", config=opt)

[34m[1mwandb[0m: Currently logged in as: [33mnhnam194[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [79]:
import time

# Running count of training iterations across all epochs, logged to wandb as
# "step". Counting directly avoids hard-coding the number of batches per epoch,
# which silently goes stale when the batch size or dataset changes.
global_step = 0

for epoch in range(opt['epochs']):
    # Sum of the losses returned by step() since the last report.
    total_loss = 0

    for i, batch in enumerate(train_iter):
        # Restarted every iteration, so the "time" printed below covers only
        # the most recent step, not the whole reporting window.
        s = time.time()
        loss = step(model, optimizer, batch, criterion, src_pad, trg_pad)

        total_loss += loss
        global_step += 1

        # Every `printevery` iterations: log and print the mean loss, then reset.
        if (i + 1) % opt['printevery'] == 0:
            avg_loss = total_loss / opt['printevery']
            wandb.log({"train_loss": avg_loss, "step": global_step})
            print('epoch: {:03d} - iter: {:05d} - train loss: {:.4f} - time: {:.4f}'.format(epoch, i, avg_loss, time.time() - s))
            total_loss = 0

    # End-of-epoch evaluation: loss over the validation iterator plus ROUGE on
    # the first 500 validation pairs.
    s = time.time()
    valid_loss = validiate(model, valid_iter, criterion, src_pad, trg_pad)
    # `opt` has no 'max_strlen' key (the old lookup raised KeyError after the
    # first epoch). The decoder-side limit is used here on the assumption that
    # this argument caps the generated length - confirm against inference.rouge_score.
    score = rouge_score(valid_src_data[:500], valid_trg_data[:500], model, SRC, TRG, opt['device'], opt['k'], opt['de_max_strlen'])
    # The metric comes from rouge_score(), so it is labelled as ROUGE rather than BLEU.
    print('epoch: {:03d} - iter: {:05d} - valid loss: {:.4f} - rouge score: {:.4f} - time: {:.4f}'.format(epoch, i, valid_loss, score, time.time() - s))