In [1]:
skip_training = True  # Set this flag to True to skip training the model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

import pandas as pd

import numpy as np

import time
import random
import functools
import os

Material:  
https://www.vinai.io/phobert-the-first-public-large-scale-language-models-for-vietnamese

# 1. Read the data

https://github.com/datquocnguyen/VnDT#data-split

https://github.com/datquocnguyen/VnDT/blob/master/VnDT-paper-CameraReadyVersion.pdf

## 1.1 Read the tags

In [3]:
import re
def read_1(file_name):
    with open(file_name, encoding='utf8') as f:
        # line = ['Np Proper noun','Nc Classifier noun',...]
        lines = re.split('\n',f.read())
    
    tags = []
    tag_dict = {}
    for line in lines:
        tmp = line.split(' ',1)
        tags.append(tmp[0])
        tag_dict[tmp[0]] = tmp[1]
        
    return tags, tag_dict

## 1.2 Read train - dev - test corpuses

In [4]:
import re
def read_2(file_name):
    """
    Parameters
    ----------
    file_name - string
        a path to a file with an annotated corpus
    
    Returns 
    -------
    words - a list of lists of words
    tags - a list of lists of tags
        For example, the first sentence in a file is word1-tag1, word2-tag2 
        and the next sentence is word3_/_tag3. Then you should get:
        words = [['word1','word2'],['word3']]
        tags = [['tag1','tag2'],['tag3']]
    """
    
    df = pd.read_csv(file_name, sep='\t|\n', names = ['idx','word','c3','c4','tag','c6','c7','c8','c9','c10'])
    
    words = []
    tags = []
    sentence_words = []
    sentence_tags = []
    prev_idx = 0
    for _,row in df.iterrows():
        if row['idx'] != prev_idx+1:
            words.append(sentence_words)
            tags.append(sentence_tags)
            sentence_words = []
            sentence_tags = []
            
        sentence_words.append(row['word'])
        sentence_tags.append(row['tag'])
        prev_idx = row['idx']
            
    return words, tags

## 1.3 Load the data

In [5]:
cwd = os.getcwd()
cwd

'/m/home/home1/12/dangp1/unix/POS_NER'

In [6]:
tags_vocab_path = cwd+'/data/tags_vocab.txt'
train_path = cwd+'/data/train.txt'
valid_path = cwd+'/data/dev.txt'
test_path = cwd+'/data/test.txt'

tags_vocab, tags_vocab_dict = read_1(tags_vocab_path)
train_words, train_tags = read_2(train_path)
valid_words, valid_tags = read_2(valid_path)
test_words, test_tags = read_2(test_path)

print(len(train_words))
print(len(valid_words))
print(len(test_words))

  return func(*args, **kwargs)


8976
199
1019


## 1.4 Enumerate labels

In [7]:
def tag2num(tags_vocab):
    return dict(zip( tags_vocab, range(1, len(tags_vocab)+1) ))

tag2num = tag2num(tags_vocab)

In [8]:
def Enumerate_tags(sentence_tags, tag2num):
    converted_tags = [ list(map(lambda x: tag2num[x], sentence_tag)) for sentence_tag in sentence_tags ]
    return converted_tags

enumerated_train_tags = Enumerate_tags(train_tags, tag2num)
enumerated_valid_tags = Enumerate_tags(valid_tags, tag2num)
enumerated_test_tags = Enumerate_tags(test_tags, tag2num)

In [9]:
PAD_IDX = 0

# 2. Study the data

# 3. Encode

In [10]:
class Encode(Dataset):
    def __init__(self, words, tags, tags_vocab, tokenizer, max_len, pad_idx):
        self.data = words
        self.labels = tags
        self.tags_vocab = tags_vocab
        self.tokenizer = tokenizer
        self.len = len(self.data)
        self.max_len = max_len
        self.pad_idx = pad_idx
        
    def __getitem__(self, index):
        inputs = self.tokenizer.encode_plus(
            self.data[index],
            None,
            add_special_tokens=False,
            max_length=self.max_len,
            padding = 'max_length',
            truncation=True
        )
        
        ids = inputs['input_ids']
        
        labels = self.labels[index] 
        labels += [self.pad_idx]*(self.max_len-len(labels))      # pad to the right
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[index], dtype=torch.long)
        }
    
    def __len__(self):
        return self.len

In [11]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

In [12]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
training_set = Encode(train_words, enumerated_train_tags, tags_vocab, tokenizer, MAX_LEN, PAD_IDX)
validating_set = Encode(valid_words, enumerated_valid_tags, tags_vocab, tokenizer, MAX_LEN, PAD_IDX)
testing_set = Encode(test_words, enumerated_test_tags, tags_vocab, tokenizer, MAX_LEN, PAD_IDX)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)

# 4. Building the model

In [14]:
#https://github.com/VinAIResearch/PhoBERT
phobert = AutoModel.from_pretrained("vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [16]:
#https://github.com/bentrevett/pytorch-pos-tagging/blob/master/2_transformer.ipynb
class PhoBERTPoSTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dim, 
                 dropout):
        
        super(PhoBERTPoSTagger, self).__init__()
        self.bert = bert
        for params in self.bert.parameters():
            params.requires_grad =  True
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.fc1 = nn.Linear(embedding_dim, 512)
        self.fc2 = nn.Linear(512,256 )
        self.fc = nn.Linear(256, output_dim)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, ids):
  
        # ids = (batch_size, max_len)
    
        outputs = self.bert(
            input_ids=ids,
        )
        
        # outputs = BERT return
    
        outputs = outputs[0]
        
        # outputs = (batch_size, max_len, hidden_size)
        
        outputs = F.relu(self.fc1(self.dropout1(outputs)))
        outputs =  F.relu(self.fc2(self.dropout2(outputs)))
        outputs =  self.fc(self.dropout(outputs))
        
        # outputs = (batch_size, max_len, output_dim)
        
        outputs = outputs.permute(1,0,2)
        
        # outputs = (max_len, batch_size, output_dim)
        
        return outputs

In [17]:
output_dim = len(tags_vocab)+1
dropout = 0.25

model = PhoBERTPoSTagger(phobert,
                      output_dim, 
                      dropout)

# 5. Train the model

In [18]:
from torch.nn import CrossEntropyLoss

In [19]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [20]:
def compute_accuracy(preds, labels, pad_idx):
    max_pred_idx = preds.argmax(dim=1, keepdim = True)
    tags_idx = (labels != pad_idx).nonzero()
    correct = max_pred_idx[tags_idx].squeeze(1).eq(labels[tags_idx])
    return correct.sum() / torch.FloatTensor([labels[tags_idx].shape[0]]).to(device)

In [21]:
def train(model, training_loader, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
        
    model.train()
    
    for _,batch in enumerate(training_loader):
        optimizer.zero_grad()
        #model.zero_grad()
        
                
        input_ids = batch['ids'].to(device)
        labels = batch['labels'].to(device)
                
        outputs = model(input_ids)
                
        # outputs = (max_len, batch_size, output_dim)
        # labels = (batch_size, max_len)
                
        labels = labels.permute(1,0)
        # labels = (max_len, batch_size)
                
        outputs = outputs.contiguous().view(-1, outputs.shape[-1])
                
        labels = labels.contiguous().view(-1)
                
        # outputs = (max_len*batch_size, output_dim)
        # labels = (max_len*batch_size)
                
        # Note: Seperating words into sentences is not necessary anymore, 
        #       we only care if an output word matches its label
                                
        loss = criterion(outputs, labels)
        acc = compute_accuracy(outputs, labels, tag_pad_idx)
                
        loss.backward()
        optimizer.step()
                
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(training_loader), epoch_acc / len(training_loader)


In [22]:
def evaluate(model, validating_loader, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
        
    model.eval()
    
    with torch.no_grad():
        for _,batch in enumerate(validating_loader):
            input_ids = batch['ids'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids)

            # outputs = (max_len, batch_size, output_dim)
            # labels = (batch_size, max_len)

            labels = labels.permute(1,0)
            # labels = (max_len, batch_size)

            outputs = outputs.contiguous().view(-1, outputs.shape[-1])

            labels = labels.contiguous().view(-1)

            # outputs = (max_len*batch_size, output_dim)
            # labels = (max_len*batch_size)

            # Note: Seperating words into sentences is not necessary anymore, 
            #       we only care if an output word matches its label

            loss = criterion(outputs, labels)
            acc = compute_accuracy(outputs, labels, 0)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(validating_loader), epoch_acc / len(validating_loader)


In [23]:
criterion = CrossEntropyLoss(ignore_index = PAD_IDX)
N_EPOCHS = 40
LEARNING_RATE = 1e-05
optimizer = optim.AdamW(model.parameters(), lr = LEARNING_RATE)

model = model.to(device)
criterion = criterion.to(device)

In [24]:
if not skip_training:
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch+1, N_EPOCHS))

        start_time = time.time()

        train_loss, train_acc = train(model, training_loader, optimizer, criterion, PAD_IDX)
        valid_loss, valid_acc = evaluate(model, validating_loader, criterion, PAD_IDX)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'postag-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [25]:
model.load_state_dict(torch.load('postag-model.pt'))

test_loss, test_acc = evaluate(model, testing_loader, criterion, PAD_IDX)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.206 | Test Acc: 94.86%


# NER it

In [26]:
import re
def read_3(file_name):
    """
    Parameters
    ----------
    file_name - string
        a path to a file with an annotated corpus
    
    Returns 
    -------
    words - a list of lists of words
    tags - a list of lists of tags
        For example, the first sentence in a file is word1-tag1, word2-tag2 
        and the next sentence is word3_/_tag3. Then you should get:
        words = [['word1','word2'],['word3']]
        tags = [['tag1','tag2'],['tag3']]
    """
    
    with open(file_name) as f:
        txt = f.read().split('\n')
    words = []
    tags = []
    sentence_words = []
    sentence_tags = []
    for row in txt:
        # row: word, pos, chunking, ner
        if not row:
            words.append(sentence_words)
            tags.append(sentence_tags)
            sentence_words = []
            sentence_tags = []
            
        if row:
            word, _, _, ner, _ = row.split('\t')
            if word:
                sentence_words.append(word)
                sentence_tags.append(ner)
    
    return words, tags

In [27]:
ners_vocab_path = cwd+'/data/ner/ners_vocab.txt'
train_path = cwd+'/data/ner/train.txt'
valid_path = cwd+'/data/ner/dev.txt'
test_path = cwd+'/data/ner/test.txt'

ners_vocab, ners_vocab_dict = read_1(ners_vocab_path)
train_words, train_tags = read_3(train_path)
valid_words, valid_tags = read_3(valid_path)
test_words, test_tags = read_3(test_path)

print(len(train_words))
print(len(valid_words))
print(len(test_words))

14027
2831
2831


In [28]:
def tag2num(tags_vocab):
    return dict(zip( tags_vocab, range(1, len(tags_vocab)+1) ))

tag2num = tag2num(ners_vocab)
num2tag = {v: k for k, v in tag2num.items()}

In [29]:
def Enumerate_tags(sentence_tags, tag2num):
    converted_tags = [ list(map(lambda x: tag2num[x], sentence_tag)) for sentence_tag in sentence_tags ]
    return converted_tags

enumerated_train_tags = Enumerate_tags(train_tags, tag2num)
enumerated_valid_tags = Enumerate_tags(valid_tags, tag2num)
enumerated_test_tags = Enumerate_tags(test_tags, tag2num)

In [30]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

In [31]:
training_set = Encode(train_words, enumerated_train_tags, ners_vocab, tokenizer, MAX_LEN, PAD_IDX)
validating_set = Encode(valid_words, enumerated_valid_tags, ners_vocab, tokenizer, MAX_LEN, PAD_IDX)
testing_set = Encode(test_words, enumerated_test_tags, ners_vocab, tokenizer, MAX_LEN, PAD_IDX)

In [32]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)

Precision is the percentage of named entities found by the learning system that are correct. Recall is the percentage of named entities present in the corpus that are found by the system. A named entity is correct only if it is an exact match of the corresponding entity in the data file.

In [33]:
from seqeval.metrics import f1_score

In [34]:
def compute_f1(preds, labels, pad_idx):
    # Get predicted label by taking the one with largest prob
    max_pred_idx = preds.argmax(dim=2)
    # Get indices of labels that is not pad_idx
    tags_idx = (labels != pad_idx).nonzero()
    
    y_true_tags = []
    y_pred_tags = []
    for i_sentence in range(len(labels)):
        label = labels[i_sentence]
        pred = max_pred_idx[i_sentence]
        tags_idx = (label != pad_idx).nonzero()
        print(pred)
        label = label[tags_idx].flatten()
        pred = pred[tags_idx].flatten()
        print(pred)
        
        tags_idx = (pred != pad_idx).nonzero()
        y_true_num = label[tags_idx].flatten().detach().cpu().tolist()
        y_pred_num = pred[tags_idx].flatten().detach().cpu().tolist()
        print('label:', y_true_num)
        print('pred:', y_pred_num)
        y_true_tag = [num2tag[num] for num in y_true_num]
        y_pred_tag = [num2tag[num] for num in y_pred_num]
        y_true_tags.append(y_true_tag)
        y_pred_tags.append(y_pred_tag)
    
    #print(y_true_tags)
    #print(y_pred_tags)
    
    return f1_score(y_true_tags, y_pred_tags)

In [35]:
preds = torch.tensor([[[0.4,0.5,0.8], [0.9,0.2,0.3]],[[0.5,0.3,0.2],[0.1,0.8,0.1]]])
labels = torch.tensor([[1,2],[1,2]])
compute_f1(preds, labels, 0)

tensor([2, 0])
tensor([2, 0])
label: [1]
pred: [2]
tensor([0, 1])
tensor([0, 1])
label: [2]
pred: [1]


1.0

In [36]:
output_dim = len(ners_vocab)+1
dropout = 0.25

model = PhoBERTPoSTagger(phobert,
                      output_dim, 
                      dropout)

In [37]:
def train(model, training_loader, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
        
    model.train()
    
    for _,batch in enumerate(training_loader):
        optimizer.zero_grad()
        #model.zero_grad()
        
                
        input_ids = batch['ids'].to(device)
        labels = batch['labels'].to(device)
                
        outputs = model(input_ids)
        f1 = compute_f1(outputs, labels, 0)
                
        # outputs = (max_len, batch_size, output_dim)
        # labels = (batch_size, max_len)
                
        labels = labels.permute(1,0)
        # labels = (max_len, batch_size)
                
        outputs = outputs.contiguous().view(-1, outputs.shape[-1])
                
        labels = labels.contiguous().view(-1)
                
        # outputs = (max_len*batch_size, output_dim)
        # labels = (max_len*batch_size)
                
        # Note: Seperating words into sentences is not necessary anymore, 
        #       we only care if an output word matches its label
                                
        loss = criterion(outputs, labels)
        acc = compute_accuracy(outputs, labels, tag_pad_idx)
        
                
        loss.backward()
        optimizer.step()
                
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1
        
    return epoch_loss / len(training_loader), epoch_acc / len(training_loader), epoch_f1 / len(training_loader)


In [38]:
def evaluate(model, validating_loader, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
        
    model.eval()
    
    with torch.no_grad():
        for _,batch in enumerate(validating_loader):
            input_ids = batch['ids'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids)
            f1 = compute_f1(outputs.copy(), labels.copy(), 0)

            # outputs = (max_len, batch_size, output_dim)
            # labels = (batch_size, max_len)

            labels = labels.permute(1,0)
            # labels = (max_len, batch_size)

            outputs = outputs.contiguous().view(-1, outputs.shape[-1])

            labels = labels.contiguous().view(-1)

            # outputs = (max_len*batch_size, output_dim)
            # labels = (max_len*batch_size)

            # Note: Seperating words into sentences is not necessary anymore, 
            #       we only care if an output word matches its label

            loss = criterion(outputs, labels)
            acc = compute_accuracy(outputs, labels, 0)
            

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1
        
    return epoch_loss / len(training_loader), epoch_acc / len(training_loader), epoch_f1 / len(training_loader)


In [39]:
skip_training = False

In [40]:
criterion = CrossEntropyLoss(ignore_index = PAD_IDX)
N_EPOCHS = 40
LEARNING_RATE = 1e-05
optimizer = optim.AdamW(model.parameters(), lr = LEARNING_RATE)

model = model.to(device)
criterion = criterion.to(device)

In [41]:
if not skip_training:
    best_valid_f1 = -float('inf')
    for epoch in range(N_EPOCHS):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch+1, N_EPOCHS))

        start_time = time.time()

        train_loss, train_acc, train_f1 = train(model, training_loader, optimizer, criterion, PAD_IDX)
        valid_loss, valid_acc, valid_f1 = evaluate(model, validating_loader, criterion, PAD_IDX)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_f1 > best_valid_f1:
            best_valid_f1 = valid_f1
            torch.save(model.state_dict(), 'ner-model.pt')

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train F1: {train_f1:.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. F1: {eval_f1:.3f}')


tensor([3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 4, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 0, 3, 3, 1, 3], device='cuda:0')
tensor([3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 4, 3, 3, 3, 1, 3, 3, 3, 3, 3],
       device='cuda:0')
label: [9, 9, 9, 9, 9, 9, 9, 9, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
pred: [3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 4, 3, 3, 3, 1, 3, 3, 3, 3, 3]
tensor([1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 4, 3, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3,
        5, 3, 1, 1, 3, 3, 1, 3], device='cuda:0')
tensor([1, 3, 3, 3], device='cuda:0')
label: [9, 9, 9, 9]
pred: [1, 3, 3, 3]
tensor([3, 3, 1, 3, 3, 4, 3, 1, 3, 0, 3, 3, 1, 3, 5, 3, 3, 3, 3, 8, 8, 1, 3, 3,
        7, 3, 4, 3, 3, 7, 3, 3], device='cuda:0')
tensor([3, 3, 1, 3, 3, 4, 3, 1, 3, 0, 3, 3, 1, 3, 5, 3, 3, 3, 3],
       device='cuda:0')
label: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 9, 9, 9, 9, 9, 9, 9]
pred: [3, 3, 1, 3, 3, 4, 3, 1, 3, 3, 3, 1, 3, 5, 3, 3, 3, 3]
tensor([3, 3, 5, 3, 3, 4, 3, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thread: [32,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thread: [33,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thread: [34,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thread: [35,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thread: [36,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:93: operator(): block: [0,0,0], thr

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
model.load_state_dict(torch.load('ner-model.pt'))

test_loss, test_acc, test_f1 = evaluate(model, testing_loader, criterion, PAD_IDX)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test F1: {test_f1:.3f}'')