In [1]:
!pip install transformers
!pip install vncorenlp
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645932 sha256=b232934b403076828cd0bd0d31796f8a525dec0759ca250bcc6276191610c1bd
  Stored in directory: /root/.cache/pip/wheels/5d/d9/b3/41f6c6b1ab758561fd4aab55dc0480b9d7a131c6aaa573a3fa
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3
--2024-05-13 02:30:21--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com 

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from tqdm import tqdm

In [4]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device, use_cuda


(device(type='cuda'), True)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [6]:
def read_dataset(file_path):
    tokens, labels, index_list  = [], [], []
    cnt = 1
    with open(file_path) as f:
        lines=f.readlines()
        token, label = [], []
        for line in lines:
            line = line.split()
            #print(line)
            if len(line) == 0:
                index_list.append(cnt)
                tokens.append(token)
                labels.append(label)
                token, label = [], []
                cnt += 1
            else:
                token.append(line[0])
                label.append(line[-1])
    data = pd.DataFrame({'id':index_list, 'labels':labels, 'tokens':tokens})
    return data

def get_labels(data):
    NERs = list(data['labels'].values)
    labels_list = []
    for value in NERs:
        labels_list = labels_list + value
    types = list(set(labels_list))
    return types

In [7]:
train_df = read_dataset('train_word.conll')
val_df = read_dataset('dev_word.conll')
print(train_df['tokens'].values.tolist()[0])
print(tokenizer(train_df['tokens'].values.tolist()[0], padding='max_length', max_length=258, truncation=True, is_split_into_words=True))
print(train_df['labels'].values.tolist()[0])

['Đồng_thời', ',', 'bệnh_viện', 'tiếp_tục', 'thực_hiện', 'các', 'biện_pháp', 'phòng_chống', 'dịch_bệnh', 'COVID', '-', '19', 'theo', 'hướng_dẫn', 'của', 'Bộ', 'Y_tế', '.']
{'input_ids': [0, 1248, 4, 757, 194, 112, 9, 717, 2137, 3795, 9089, 6232, 1927, 31, 1195, 63, 1010, 7, 125, 1059, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],

In [8]:
unique_labels = get_labels(train_df)
print(len(unique_labels))
print(unique_labels)

20
['I-ORGANIZATION', 'I-LOCATION', 'I-SYMPTOM_AND_DISEASE', 'O', 'B-PATIENT_ID', 'B-AGE', 'I-DATE', 'B-TRANSPORTATION', 'B-GENDER', 'B-LOCATION', 'B-ORGANIZATION', 'B-SYMPTOM_AND_DISEASE', 'B-DATE', 'I-PATIENT_ID', 'I-TRANSPORTATION', 'B-JOB', 'I-AGE', 'I-JOB', 'I-NAME', 'B-NAME']


In [9]:

labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}
print(labels_to_ids)

{'I-ORGANIZATION': 0, 'I-LOCATION': 1, 'I-SYMPTOM_AND_DISEASE': 2, 'O': 3, 'B-PATIENT_ID': 4, 'B-AGE': 5, 'I-DATE': 6, 'B-TRANSPORTATION': 7, 'B-GENDER': 8, 'B-LOCATION': 9, 'B-ORGANIZATION': 10, 'B-SYMPTOM_AND_DISEASE': 11, 'B-DATE': 12, 'I-PATIENT_ID': 13, 'I-TRANSPORTATION': 14, 'B-JOB': 15, 'I-AGE': 16, 'I-JOB': 17, 'I-NAME': 18, 'B-NAME': 19}


In [10]:
def align_word_ids(text, labels):
    tokenized_inputs = tokenizer(text, padding='max_length', max_length=256, truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.input_ids

    start_part = True
    label_ids = []
    count = 0

    for i in range(len(word_ids)):
        if word_ids[i] == 0 or word_ids[i] == 1 or word_ids[i] == 2:
            label_ids.append(-100)

        elif count < len(text) and ''.join(tokenizer.decode(tokenized_inputs['input_ids'][i]).split()) == text[count]:
            label_ids.append(labels_to_ids[labels[count]])
            count += 1
            start_part = True
        else:
            if start_part:
                label_ids.append(labels_to_ids[labels[count]])
                count += 1
                start_part = False
            else:
                label_ids.append(-100)
    return label_ids

class CustomDataSet(Dataset):
    def __init__(self, df):

        lb = df['labels'].values.tolist()
        txt = df['tokens'].values.tolist()
        self.texts = [tokenizer(i, padding='max_length', max_length = 256,
                                truncation=True, return_tensors="pt", is_split_into_words=True) for i in txt]
        self.labels = [align_word_ids(i,j) for i,j in zip(txt, lb)]


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data = self.texts[idx]
        labels = torch.LongTensor (self.labels[idx])
        return data, labels

In [11]:
train_dataset = CustomDataSet(train_df)
val_dataset = CustomDataSet(val_df)

In [12]:
print(train_dataset.__getitem__(1))

({'input_ids': tensor([[    0,    22,  1907,   757,    62,  1665,   798,    45,  2122,    84,
             6,   359,  2194,    52,   197,   976,    22,     4,  7886,    10,
           687,     4,  1223,    21,   798,    23,  1016,    33,    37,    11,
           430,    75,     9,  5990,  2498,     4,   122,    21,   798,  1021,
           326, 12299,    85,    17,    10,   521,    11,   782,     5,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1, 

In [13]:
train_dataloader = DataLoader(train_dataset,  batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset,  batch_size=16)

In [14]:
class NERModel(nn.Module):
    def __init__(self):
        super(NERModel, self).__init__()
        self.bert = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base-v2", num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [15]:
def train(model, optimizer, train_loader, epoch):
    total_acc_train = 0
    total_loss_train = 0
    for train_data, train_label in tqdm(train_dataloader):
        train_label = train_label.to(device)
        mask = train_data['attention_mask'].squeeze(1).to(device)
        input_id = train_data['input_ids'].squeeze(1).to(device)

        optimizer.zero_grad()
        loss, logits = model(input_id, mask, train_label)

        for i in range(logits.shape[0]):
            logits_clean = logits[i][train_label[i] != -100]
            label_clean = train_label[i][train_label[i] != -100]

            predictions = logits_clean.argmax(dim=1)
            acc = (predictions == label_clean).float().mean()
            total_acc_train += acc
            total_loss_train += loss.item()

        loss.backward()
        optimizer.step()
    print(f'Train Accuracy: {total_acc_train / len(train_df): .3f} Train Loss: {total_loss_train / len(train_df) : .3f}')

def test(model, val_loader, epoch):
    model.eval()
    total_acc_val = 0
    total_loss_val = 0
    with torch.no_grad():
        for val_data, val_label in val_dataloader:
            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):
                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc
                total_loss_val += loss.item()
    print(f'Val Accuracy: {total_acc_val / len(val_df): .3f} Val Loss: {total_loss_val / len(val_df) : .3f}')



In [16]:
model = NERModel()
model.to(device)
LEARNING_RATE = 5e-5
EPOCHS = 2
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

if use_cuda:
    model = model.cuda()
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}:')
    train(model, optimizer, train_dataloader, epoch)
    test(model, val_dataloader, epoch)

torch.save(model.state_dict(), 'model.pth')

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:


100%|██████████| 315/315 [03:24<00:00,  1.54it/s]


Train Accuracy:  0.944 Train Loss:  0.366
Val Accuracy:  0.983 Val Loss:  0.119
Epoch 2:


100%|██████████| 315/315 [03:25<00:00,  1.53it/s]


Train Accuracy:  0.988 Train Loss:  0.078
Val Accuracy:  0.981 Val Loss:  0.094


In [18]:
def align_word_ids(text, flag):
    label_all_tokens = flag

    tokenized_inputs = tokenizer(text, padding='max_length', max_length=258, truncation=True)

    word_ids = tokenized_inputs.input_ids

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def predict(model, sentence):
    text = tokenizer(sentence, padding='max_length', max_length = 258, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence, False)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(len(text))
    print(prediction_label)
    print(len(prediction_label))

predict(model,'Bệnh_nhân nhập_viện tối_qua ở Bệnh_Viện 115 là bệnh_nhân thứ 82, di_chuyển qua nhiều thành_phố bằng xe biển_hiệu E-402')


Bệnh_nhân nhập_viện tối_qua ở Bệnh_Viện 115 là bệnh_nhân thứ 82, di_chuyển qua nhiều thành_phố bằng xe biển_hiệu E-402
3
['O', 'O', 'O', 'O', 'O', 'B-LOCATION', 'I-LOCATION', 'O', 'O', 'O', 'B-PATIENT_ID', 'B-PATIENT_ID', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TRANSPORTATION', 'I-TRANSPORTATION', 'O', 'O']
24
