In [None]:
# ! pip install torchtext==0.6.0 pyvi spacy https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz rouge_score
# ! python -m spacy download en_core_web_sm

In [None]:
# ! git clone https://github.com/namnh194/transformer_pointer_generator_network.git

In [1]:
# refer: github.com/pbcquoc
from dataset import read_data, create_fields, create_dataset
from model import Transformer
from train_utils import ScheduledOptim, LabelSmoothingLoss, step, validiate
from inference import rouge_score, translate_sentence
import torch, tqdm
import torch.nn as nn
import torch.nn.functional as F

  rouge = datasets.load_metric("rouge")


In [2]:
import datasets

# Load the "nam194/vietnews" dataset. The recorded cell output lists train,
# validation and test splits, each with guid/title/abstract/article columns.
dataset = datasets.load_dataset(path="nam194/vietnews")
dataset

DatasetDict({
    test: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 22498
    })
    train: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 99134
    })
    validation: Dataset({
        features: ['guid', 'title', 'abstract', 'article'],
        num_rows: 22184
    })
})

In [6]:
# Hyper-parameters shared by the data, model and training cells below.
opt = dict(
    lang='vi_core_news_lg',  # 'vi_spacy_model'
    # Maximum sequence lengths: encoder side ("en") and decoder side ("de").
    en_max_strlen=256,
    de_max_strlen=64,
    batchsize=64,
    # Use the GPU whenever torch reports one, otherwise fall back to CPU.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    # Transformer dimensions.
    d_model=512,
    n_layers=6,
    heads=8,
    dropout=0.1,
    lr=0.0001,
    epochs=20,
    # Training loss is reported every `printevery` batches.
    printevery=200,
    # Forwarded to rouge_score / translate_sentence
    # (presumably the beam width -- TODO confirm against inference.py).
    k=5,
)

In [None]:
# Read the 'article' column as the source and the 'abstract' column as the
# target for the train and validation splits.
train_src_data, train_trg_data = read_data(dataset, 'train', 'article', 'abstract')
valid_src_data, valid_trg_data = read_data(dataset, 'validation', 'article', 'abstract')

# `opt` only defines a single 'lang' entry (there is no 'src_lang' or
# 'trg_lang' key, so looking those up raised KeyError). Articles and abstracts
# are in the same language, so that one entry is used for both fields.
SRC, TRG = create_fields(opt['lang'], opt['lang'], opt['en_max_strlen'], opt['de_max_strlen'])

# Batched iterators over both splits, created on the configured device.
train_iter = create_dataset(train_src_data, train_trg_data, opt['batchsize'], opt['device'], SRC, TRG, istrain=True)
valid_iter = create_dataset(valid_src_data, valid_trg_data, opt['batchsize'], opt['device'], SRC, TRG, istrain=False)

In [None]:
# Index of the '<pad>' token in the source and target vocabularies; both are
# passed to the mask, loss and training helpers further down.
src_pad, trg_pad = (field.vocab.stoi['<pad>'] for field in (SRC, TRG))
src_pad, trg_pad

In [None]:
# Try the source field's preprocessing on a Vietnamese sentence that contains
# irregular runs of spaces. The literal had been stored as mojibake (UTF-8
# bytes decoded with a legacy single-byte codec), which made the demo
# meaningless; it is restored to the intended text with the spacing preserved.
SRC.preprocess('tôi không còn là sinh viên     đại   học  BKHN')
# SRC.vocab.stoi['sinh_viên']

In [None]:
from train_utils import create_masks

# Fetch one validation batch explicitly. This cell previously read a `batch`
# variable that no earlier cell defines, so it failed with NameError on a
# fresh kernel.
batch = next(iter(valid_iter))

# Swap dimensions 0 and 1 of the index tensors before building the masks.
src = batch.src.transpose(0, 1)
trg = batch.trg.transpose(0, 1)
# Decoder input: every target position except the last one.
trg_input = trg[:, :-1]
# NOTE(review): masks are requested on 'cpu' while the iterators were created
# with opt['device'] -- confirm create_masks copes with that when CUDA is used.
src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, device='cpu')
# preds = model(src, trg_input, src_mask, trg_mask)
print(src.shape, trg.shape, trg_input.shape)
print(src_mask.shape, trg_mask.shape, src_mask, trg_mask)

In [7]:
# Settings common to the encoder and decoder stacks; the two configurations
# differ only in their maximum sequence length.
_shared_cfg = {
    "d_model": opt['d_model'],
    "n_heads": opt['heads'],
    "d_ff": 2048,
    "num_layer": opt['n_layers'],
    "dropout": opt['dropout'],
}
en_config = {"max_seq_len": opt['en_max_strlen'], **_shared_cfg}
de_config = {"max_seq_len": opt['de_max_strlen'], **_shared_cfg}

# NOTE(review): 100 looks like a placeholder (see the trailing comment), while
# the loss further down is built with len(TRG.vocab) -- confirm which
# vocabulary size Transformer expects before training on the real data.
vocab_size = 100 # len(tokenizer.vocab)
batch_size = opt['batchsize']
en_seq_len = en_config["max_seq_len"]
de_seq_len = de_config["max_seq_len"]

model = Transformer(vocab_size, en_config, de_config)

# Report the number of trainable parameters.
trainable_sizes = [weight.numel() for weight in model.parameters() if weight.requires_grad]
print(sum(trainable_sizes))

# Xavier-uniform initialisation for every parameter with more than one
# dimension; one-dimensional parameters keep their default initialisation.
for weight in model.parameters():
    if weight.dim() <= 1:
        continue
    nn.init.xavier_uniform_(weight)

model = model.to(opt['device'])

44243044


In [None]:
# Smoke test: push one batch of random token ids and random masks through the
# model and print the shape of what comes back.
# The inputs are created on opt['device'] because the model was moved there
# when it was built; CPU inputs fail with a device mismatch once CUDA is used.
_smoke_device = opt['device']

# No gradients are needed for a shape check, so skip building the autograd graph.
with torch.no_grad():
    prob_map = model(
        # Token ids drawn uniformly from [0, vocab_size), dtype int64.
        src_sent=torch.randint(0, vocab_size, (batch_size, en_config['max_seq_len']), device=_smoke_device),
        tgt_sent=torch.randint(0, vocab_size, (batch_size, de_config['max_seq_len']), device=_smoke_device),
        # Random float masks, kept from the original smoke test.
        src_mask=torch.randn(batch_size, 1, en_config['max_seq_len'], device=_smoke_device),
        tgt_mask=torch.randn(batch_size, 1, de_config['max_seq_len'], device=_smoke_device))
print(prob_map.shape)

In [None]:
import gc
from accelerate import Accelerator

# Adam wrapped by the project's ScheduledOptim.
# NOTE(review): 0.2 and 4000 are presumably the learning-rate scale and the
# warm-up step count -- confirm against train_utils.ScheduledOptim. opt['lr']
# is not referenced here.
adam = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
optimizer = ScheduledOptim(adam, 0.2, opt['d_model'], 4000)

# Label-smoothed loss sized to the target vocabulary, given the pad index.
criterion = LabelSmoothingLoss(
    len(TRG.vocab),
    padding_idx=trg_pad,
    smoothing=0.1,
)

# Hand the model, optimizer and both iterators to Accelerate.
model, optimizer, train_iter, valid_iter = Accelerator().prepare(
    model,
    optimizer,
    train_iter,
    valid_iter,
)

# Release cached GPU blocks and run a garbage-collection pass before training.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the
# notebook: wandb.login() picks the key up from the WANDB_API_KEY environment
# variable or stored credentials, and prompts for it otherwise.
# The key that used to be hard-coded in this cell has been exposed and should
# be revoked in the W&B account settings.
import wandb

wandb.login()

In [None]:
import wandb

# Start a run in the "transformer_VNDS" project; the `opt` dictionary is
# attached as the run configuration so the hyper-parameters are tracked.
wandb.init(project="transformer_VNDS", config=opt)

In [None]:
import time

# Training steps taken so far across all epochs. Logged as the wandb "step"
# value in place of a hard-coded batches-per-epoch constant, which goes stale
# whenever the batch size or the dataset changes.
global_step = 0

for epoch in range(opt['epochs']):
    # Loss accumulated since the last report.
    total_loss = 0

    for i, batch in enumerate(train_iter):
        s = time.time()
        loss = step(model, optimizer, batch, criterion, src_pad, trg_pad)
        global_step += 1

        total_loss += loss

        # Report the mean loss of the last `printevery` batches.
        if (i + 1) % opt['printevery'] == 0:
            avg_loss = total_loss/opt['printevery']
            wandb.log({"train_loss": avg_loss, "step": global_step})
            # The reported time covers only the most recent training step.
            print('epoch: {:03d} - iter: {:05d} - train loss: {:.4f} - time: {:.4f}'.format(epoch, i, avg_loss, time.time()- s))
            total_loss = 0

    # End-of-epoch evaluation: validation loss plus ROUGE on the first 500
    # validation pairs.
    s = time.time()
    valid_loss = validiate(model, valid_iter, criterion, src_pad, trg_pad)
    # `opt` has no 'max_strlen' key (KeyError); the decoder-side limit is
    # used, matching the translate_sentence call elsewhere in this notebook.
    score = rouge_score(valid_src_data[:500], valid_trg_data[:500], model, SRC, TRG, opt['device'], opt['k'], opt['de_max_strlen'])
    # The metric is ROUGE, so it is labelled as such (it was printed as "bleu score").
    print('epoch: {:03d} - iter: {:05d} - valid loss: {:.4f} - rouge score: {:.4f} - time: {:.4f}'.format(epoch, i, valid_loss, score, time.time() - s))

In [139]:
# Column 0 of the `src` tensor from the first validation batch.
batch = next(iter(valid_iter)).src[:,0]
print(batch.shape)

# Map every index other than 0 and 1 back to its vocabulary string, join with
# spaces, then turn underscores into spaces.
# NOTE(review): 0 and 1 are presumably the <unk>/<pad> indices -- confirm
# against SRC.vocab.
kept_tokens = [SRC.vocab.itos[tok] for tok in batch if tok not in [0,1]]
sentence = ' '.join(kept_tokens).replace('_',' ')
print(sentence)

# Run the reconstructed sentence through the model's inference helper.
trans_sent = translate_sentence(
    sentence, model, SRC, TRG,
    opt['device'], opt['k'], opt['de_max_strlen'],
)
trans_sent

torch.Size([256])
Khu√¥n vi√™n ƒê·∫°i h·ªçc Stanford , bang California , M·ªπ . C√°c ngu·ªìn tin gi·∫•u t√™n h√¥m qua ti·∫øt l·ªô m·ªôt gia ƒë√¨nh Trung Qu·ªëc ƒë√£ tr·∫£ 6,5 tri·ªáu USD cho William Rick Singer , ch·ªß s·ªü h·ªØu c√¥ng ty t∆∞ v·∫•n ƒë√†o t·∫°o d·ª± b·ªã ƒë·∫°i h·ªçc Edge College & Career Network , ƒë·ªÉ ƒë·∫£m b·∫£o cho con g√°i c·ªßa h·ªç c√≥ m·ªôt su·∫•t t·∫°i ƒê·∫°i h·ªçc Stanford , M·ªπ . ƒê√¢y ƒë∆∞·ª£c cho l√† kho·∫£n ti·ªÅn l·ªõn nh·∫•t m√† c√°c ph·ª• huynh tr·∫£ cho Singer , k·∫ª c·∫ßm ƒë·∫ßu ƒë∆∞·ªùng d√¢y ch·∫°y v√†o c√°c tr∆∞·ªùng ƒë·∫°i h·ªçc danh gi√° c·ªßa M·ªπ . Sau Chi·∫øn d·ªãch Versity Blues k√©o d√†i m·ªôt nƒÉm , c√°c c√¥ng t·ªë vi√™n li√™n bang M·ªπ h·ªìi th√°ng 3 c√¥ng b·ªë c√°ch th·ª©c ho·∫°t ƒë·ªông c·ªßa ƒë∆∞·ªùng d√¢y n√†y , ƒë·ªìng th·ªùi n·ªôp h·ªì s∆° truy t·ªë l√™n B·ªô T∆∞ ph√°p . Truy·ªÅn th√¥ng M·ªπ x√°c ƒë·ªãnh n·ªØ sinh Trung Qu·ªëc ƒë∆∞·ª£c b·ªë m·∫π chi ti·ªÅn ch·∫°y v√†o ƒê·∫°i h·ªçc Stanford c√≥ t√™n l√† Yusi " Molly " Zhao . Gia ƒë

'ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi sinh_th·ªùi ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ Ph√∫c_C∆∞·ªùng sinh_th·ªùi Ph√∫c_C∆∞·ªùng sinh_th·ªùi ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ sinh_th·ªùi Ph√∫c_C∆∞·ªùng Ph√∫c_C∆∞·ªùng Ph√∫c_C∆∞·ªùng V∆∞∆°ng_Ch√≠_D√¢n ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞ ch·ª©ng_th∆∞'

0 torch.Size([256, 11])


In [148]:
# Find the first training batch whose last `src` dimension is not 1 and
# report its position and shape.
for i, batch in enumerate(train_iter):
    if batch.src.shape[-1] == 1:
        continue
    print(i, batch.src.shape)
    break

# Print each column of that batch as text: indices 0 and 1 are skipped, the
# rest are looked up in the source vocabulary with underscores shown as spaces.
for idx in range(batch.src.shape[-1]):
    print('idx: ',idx)
    column = batch.src[:,idx]
    words = [SRC.vocab.itos[tok].replace('_',' ') for tok in column if tok not in [0,1]]
    print(' '.join(words))
    print('---------------------------------------------------')

0 torch.Size([256, 2])
idx:  0
Ng√†y 29 - 11 , trao ƒë·ªïi v·ªõi Tu·ªïi Tr·∫ª v·ªÅ vi·ªác S·ªü T√†i nguy√™n - m√¥i tr∆∞·ªùng H√† N·ªôi l·∫•y √Ω ki·∫øn g√≥p √Ω c·ªßa c√°c s·ªü ng√†nh , qu·∫≠n huy·ªán v·ªÅ ph∆∞∆°ng √°n b·∫£o v·ªá kho√°ng s·∫£n ch∆∞a khai th√°c , √¥ng Nguy·ªÖn Minh M∆∞·ªùi , ph√≥ gi√°m ƒë·ªëc S·ªü T√†i nguy√™n - m√¥i tr∆∞·ªùng H√† N·ªôi kh·∫≥ng ƒë·ªãnh ch·ªß tr∆∞∆°ng chung l√† kh√¥ng khuy·∫øn kh√≠ch khai th√°c kho√°ng s·∫£n . Theo ph∆∞∆°ng √°n b·∫£o v·ªá kho√°ng s·∫£n ch∆∞a khai th√°c , S·ªü T√†i nguy√™n - m√¥i tr∆∞·ªùng H√† N·ªôi cho bi·∫øt tr√™n ƒë·ªãa b√†n th√†nh ph·ªë c√≥ m·ªôt s·ªë lo·∫°i kho√°ng s·∫£n ch√≠nh v√† c√≥ tri·ªÉn v·ªçng khai th√°c l√† ƒë√° x√¢y d·ª±ng , c√°t x√¢y d·ª±ng v√† c√°t san l·∫•p , s√©t g·∫°ch ng√≥i , than b√πn v√† puzolan . ƒê√° x√¢y d·ª±ng ƒë∆∞·ª£c ph√¢n b·ªë d·ªçc theo ranh gi·ªõi ph√≠a t√¢y , ti·∫øp gi√°p v·ªõi Ho√† B√¨nh t·ª´ N√∫i Ch·∫π , ƒê√° Ch√¥ng ( huy·ªán Ba V√¨ ) qua Ph√∫ M√£n , N√∫i Voi ( huy·ªán Qu·ªëc Oai ) ƒë·∫øn M·ªπ ƒê·ª©c v√† m·