## Data processing

In [1]:
# read conllu file

from conllu import parse , parse_incr
def load_conllu(path):
    """Read a CoNLL-U file into a list of sentences.

    Args:
        path: Path to a UTF-8 encoded ``.conllu`` file.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(form, upos)`` tuples in token order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Only tokens whose id is a plain int are kept.
        # NOTE(review): presumably this drops multiword-token ranges and empty
        # nodes, whose ids are not ints — confirm against the conllu parser.
        return [
            [
                (tok['form'], tok['upos'])
                for tok in sent
                if isinstance(tok['id'], int)
            ]
            for sent in parse_incr(handle)
        ]


In [2]:
# Parse the training split into sentences of (word, UPOS tag) pairs.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
data_file = r'/home/manh/code/nlp/src/data/UD_English-EWT/en_ewt-ud-train.conllu'
data = load_conllu(data_file)

tạo từ điển cho words và tag

In [3]:
def dic(data):
    """Build word and tag vocabularies from tagged sentences.

    Args:
        data: Iterable of sentences, each an iterable of items whose first
            element is the word and whose second element is the tag.

    Returns:
        ``(word_to_idx, tag_to_idx)``. Index 0 is reserved for ``'<PAD>'`` in
        both mappings and index 1 for ``'<UNK>'`` in the word mapping; every
        other entry gets the next free index in order of first appearance.
    """
    vocab = {'<PAD>': 0, '<UNK>': 1}
    labels = {'<PAD>': 0}

    for sentence in data:
        for pair in sentence:
            # setdefault only inserts unseen keys, and the current size of the
            # mapping is exactly the next unused index.
            vocab.setdefault(pair[0], len(vocab))
            labels.setdefault(pair[1], len(labels))

    return vocab, labels

In [4]:
# Build both vocabularies from the parsed training sentences and report their sizes.
word_idx , tag_idx = dic(data)
print(f"len word dict : {len(word_idx)} , len tag dict : {len(tag_idx)}")

len word dict : 19675 , len tag dict : 18


In [28]:
# Display the tag -> index mapping.
tag_idx

{'<PAD>': 0,
 'PROPN': 1,
 'PUNCT': 2,
 'ADJ': 3,
 'NOUN': 4,
 'VERB': 5,
 'DET': 6,
 'ADP': 7,
 'AUX': 8,
 'PRON': 9,
 'PART': 10,
 'SCONJ': 11,
 'NUM': 12,
 'ADV': 13,
 'CCONJ': 14,
 'X': 15,
 'INTJ': 16,
 'SYM': 17}

train loader , dev loader

In [5]:
def transform_data(data, word_idx, tag_idx):
    """Convert tagged sentences into parallel lists of word and tag indices.

    Args:
        data: Iterable of sentences, each an iterable of items whose first
            element is the word and whose second element is the tag.
        word_idx: Mapping from word to integer index.
        tag_idx: Mapping from tag to integer index.

    Returns:
        ``(X_data, Y_data)``: two lists with one inner list per sentence,
        holding word indices and tag indices respectively.
    """
    # Words missing from the vocabulary map to <UNK> (index 1 if that key is absent).
    fallback_word = word_idx.get('<UNK>', 1)
    # Tags missing from the tag vocabulary map to index 0.
    fallback_tag = 0

    X_data, Y_data = [], []
    for sentence in data:
        X_data.append([word_idx.get(pair[0], fallback_word) for pair in sentence])
        Y_data.append([tag_idx.get(pair[1], fallback_tag) for pair in sentence])

    return X_data, Y_data

dataset class

In [6]:
from torch.utils.data import DataLoader , Dataset
import torch
class POSDataset(Dataset):
    """Dataset over parallel lists of word-index and tag-index sequences."""

    def __init__(self, X_data , Y_data):
        """Store the word-index sequences (X_data) and tag-index sequences (Y_data)."""
        self.X_data, self.Y_data = X_data, Y_data

    def __len__(self):
        """Number of sentences, taken from X_data."""
        return len(self.X_data)

    def __getitem__(self, idx):
        """Return ``(word indices, tag indices, sentence length)`` for item ``idx``."""
        tokens = self.X_data[idx]
        return tokens, self.Y_data[idx], len(tokens)


Padding sentences

In [7]:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a batch of variable-length sentences into rectangular tensors.

    Args:
        batch: Sequence of items whose first element is the word-index
            sequence and whose second element is the tag-index sequence
            (plain lists or tensors). Any further elements are ignored.

    Returns:
        ``(padded_sentences, padded_tags, lengths)`` where the first two are
        batch-first tensors padded with 0 and ``lengths`` is a 1-D long tensor
        holding each sentence's length before padding.
    """
    def _as_long(seq):
        # Tensors pass through untouched; anything else becomes a long tensor.
        return seq if torch.is_tensor(seq) else torch.tensor(seq, dtype=torch.long)

    sentences, tags = [], []
    for item in batch:
        sentences.append(_as_long(item[0]))
        tags.append(_as_long(item[1]))

    # True lengths must be captured before padding hides them.
    lengths = torch.tensor([len(s) for s in sentences], dtype=torch.long)

    pad_value = 0
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=pad_value)
    padded_tags = pad_sequence(tags, batch_first=True, padding_value=pad_value)

    return padded_sentences, padded_tags, lengths

In [8]:
# Training split: parse -> index -> wrap in a Dataset -> batch with padding.
train_file = r'/home/manh/code/nlp/src/data/UD_English-EWT/en_ewt-ud-train.conllu'
train_data = load_conllu(train_file)
X_train, Y_train = transform_data(train_data, word_idx, tag_idx)
train_dataset = POSDataset(X_train, Y_train)
# Batches are reshuffled every epoch and padded by collate_fn.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

val_loader

In [9]:
# Validation split, indexed with the vocabularies built from the training data.
# (The path previously started with a stray double slash.)
dev_file = r'/home/manh/code/nlp/src/data/UD_English-EWT/en_ewt-ud-dev.conllu'
dev_data = load_conllu(dev_file)
X_dev , Y_dev = transform_data(dev_data , word_idx, tag_idx)
dev_dataset = POSDataset(X_dev , Y_dev)
dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size = 64 ,
    # No shuffling for evaluation: the reported loss is a mean of per-batch
    # means, so a fixed batch composition keeps it reproducible across runs.
    shuffle = False,
    collate_fn = collate_fn
)

test_loader

In [10]:
# Test split, indexed with the vocabularies built from the training data.
test_file = r'/home/manh/code/nlp/src/data/UD_English-EWT/en_ewt-ud-test.conllu'
test_data = load_conllu(test_file)
X_test, Y_test = transform_data(test_data , word_idx, tag_idx)
test_dataset = POSDataset(X_test , Y_test)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size = 64 ,
    # No shuffling for evaluation: the reported loss is a mean of per-batch
    # means, so a fixed batch composition keeps it reproducible across runs.
    shuffle = False,
    collate_fn = collate_fn
)

## build model

In [11]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier applied at every time step."""

    def __init__(self, vocab_size , embedding_dim, hidden_size, output_size):
        """
        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each word embedding.
            hidden_size: Size of the RNN hidden state.
            output_size: Number of output classes per token.
        """
        super().__init__()
        # Index 0 is the padding token; its embedding is excluded from gradient updates.
        self.embedding = nn.Embedding(vocab_size , embedding_dim , padding_idx=0)
        self.rnn = nn.RNN(embedding_dim , hidden_size , batch_first = True)
        self.fc = nn.Linear(hidden_size , output_size)

    def forward(self , x , length):
        """Compute per-token logits.

        Args:
            x: Batch-first tensor of word indices, shape (batch, seq_len).
            length: True (unpadded) length of each sequence, as a 1-D tensor
                or a list of ints.

        Returns:
            Logits of shape (batch, seq_len, output_size). The time dimension
            always matches ``x``, even when the batch is padded beyond its
            longest sequence.
        """
        # pack_padded_sequence requires tensor lengths to live on the CPU.
        if torch.is_tensor(length):
            length = length.cpu()
        embedded = self.embedding(x)
        packed = pack_padded_sequence(embedded, length, batch_first=True, enforce_sorted=False)
        packed_output , _ = self.rnn(packed)
        # total_length keeps the output aligned with the (padded) targets;
        # without it the output is truncated to the longest sequence in the batch.
        rnn_out, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=x.size(1))
        return self.fc(rnn_out)



train model

In [12]:
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter

# Module-level TensorBoard writer used by fit(); event files go under run/posTagging.
writer = SummaryWriter(log_dir="run/posTagging")
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: a training pass if ``optimizer`` is given, else evaluation.

    Returns:
        ``(mean per-batch loss, accuracy over positions whose target is not 0)``.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    total = 0
    # Gradients are tracked only while training; evaluation skips the autograd graph.
    with torch.set_grad_enabled(training):
        for x, y, length in loader:
            x, y = x.to(device), y.to(device)
            if training:
                optimizer.zero_grad()

            out = model(x, length)  # (batch, seq_len, num_class)
            # The criterion expects (batch * seq_len, num_class) logits and
            # (batch * seq_len,) targets.
            loss = criterion(out.reshape(-1, out.shape[-1]), y.reshape(-1))

            if training:
                loss.backward()
                optimizer.step()

            predicted = out.argmax(dim=2)
            # Target 0 marks padding: exclude it from BOTH the numerator and the
            # denominator, otherwise padded positions predicted as 0 inflate accuracy.
            mask = y != 0
            correct += ((predicted == y) & mask).sum().item()
            total += mask.sum().item()
            loss_sum += loss.item()

    # max(..., 1) avoids ZeroDivisionError on an empty loader or an all-padding split.
    return loss_sum / max(len(loader), 1), correct / max(total, 1)


def fit(model , train_loader , val_loader , optimizer , criterion  , device = "cuda" , epochs = 200):
    """Train ``model`` and validate it after every epoch.

    Per-epoch loss and accuracy are printed and logged to the module-level
    TensorBoard ``writer`` under ``Loss/{train,val}`` and ``Accuracy/{train,val}``.

    Args:
        model: Module called as ``model(x, length)`` returning per-token logits.
        train_loader: Iterable of ``(x, y, length)`` training batches.
        val_loader: Iterable of ``(x, y, length)`` validation batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking flattened logits and flattened targets.
        device: Device the model and batches are moved to.
        epochs: Number of passes over ``train_loader``.
    """
    model.to(device)
    for epoch in range(epochs):
        loss_avg, acc_avg = _run_epoch(model, train_loader, criterion, device, optimizer)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {loss_avg:.4f} | Acc: {acc_avg:.4f}")
        writer.add_scalar("Loss/train", loss_avg, epoch)
        writer.add_scalar("Accuracy/train", acc_avg, epoch)

        avg_loss_val, avg_acc_val = _run_epoch(model, val_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} | Loss_val: {avg_loss_val:.4f} | Acc_val: {avg_acc_val:.4f}")
        writer.add_scalar("Loss/val", avg_loss_val, epoch)
        writer.add_scalar("Accuracy/val", avg_acc_val, epoch)





Hàm evaluate cho tập test tính accuracy và loss 

In [30]:
def evaluate(model , dev_loader , optimizer , criterion , device = "cuda" ):
    """Compute loss and token accuracy of ``model`` over ``dev_loader``.

    Args:
        model: Module called as ``model(x, length)`` returning per-token logits.
        dev_loader: Iterable of ``(x, y, length)`` batches.
        optimizer: Unused; kept so existing call sites keep working.
        criterion: Loss taking flattened logits and flattened targets.
        device: Device the model and batches are moved to.

    Returns:
        ``(avg_loss, avg_acc)``: the mean per-batch loss and the accuracy over
        positions whose target is not 0. The same values are printed.
    """
    total_cor = 0
    loss_test = 0.0
    total = 0

    model.to(device)
    model.eval()
    with torch.no_grad():
        for x , y , length in dev_loader:
            x , y = x.to(device), y.to(device)
            out = model(x , length)
            loss = criterion(out.reshape(-1 , out.shape[-1]), y.reshape(-1))

            predicted = out.argmax(dim=2)
            # Target 0 marks padding: exclude it from BOTH the numerator and the
            # denominator, otherwise padded positions predicted as 0 inflate accuracy.
            mask = y != 0
            total_cor += ((predicted == y) & mask).sum().item()
            total += mask.sum().item()
            loss_test += loss.item()

    # max(..., 1) avoids ZeroDivisionError on an empty loader or an all-padding split.
    avg_loss = loss_test / max(len(dev_loader), 1)
    avg_acc = total_cor / max(total, 1)
    print(f"test loss  {avg_loss:.4f} | test acc: {avg_acc:.4f}")
    return avg_loss, avg_acc




In [14]:
# Size the embedding table and the output layer from the vocabularies instead of
# hard-coded numbers, so the model stays consistent if the data changes.
model = RNN(
    vocab_size=len(word_idx),
    embedding_dim=128,
    hidden_size=128,
    output_size=len(tag_idx),
)
optimizer = Adam(model.parameters())
# Padded target positions carry the <PAD> index and must not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tag_idx['<PAD>'])
fit(model , train_loader ,dev_loader , optimizer , criterion , device = "cuda" , epochs=30)

Epoch 1/30 | Loss: 1.2593 | Acc: 0.6267
Epoch 1/30 | Loss_val: 0.8874 | Acc_val: 0.7286
Epoch 2/30 | Loss: 0.6873 | Acc: 0.7826
Epoch 2/30 | Loss_val: 0.7154 | Acc_val: 0.7881
Epoch 3/30 | Loss: 0.5201 | Acc: 0.8358
Epoch 3/30 | Loss_val: 0.6351 | Acc_val: 0.8184
Epoch 4/30 | Loss: 0.4169 | Acc: 0.8679
Epoch 4/30 | Loss_val: 0.5974 | Acc_val: 0.8343
Epoch 5/30 | Loss: 0.3435 | Acc: 0.8909
Epoch 5/30 | Loss_val: 0.5750 | Acc_val: 0.8463
Epoch 6/30 | Loss: 0.2882 | Acc: 0.9085
Epoch 6/30 | Loss_val: 0.5838 | Acc_val: 0.8562
Epoch 7/30 | Loss: 0.2447 | Acc: 0.9222
Epoch 7/30 | Loss_val: 0.5825 | Acc_val: 0.8609
Epoch 8/30 | Loss: 0.2088 | Acc: 0.9344
Epoch 8/30 | Loss_val: 0.5897 | Acc_val: 0.8658
Epoch 9/30 | Loss: 0.1798 | Acc: 0.9430
Epoch 9/30 | Loss_val: 0.6187 | Acc_val: 0.8662
Epoch 10/30 | Loss: 0.1551 | Acc: 0.9512
Epoch 10/30 | Loss_val: 0.6447 | Acc_val: 0.8685
Epoch 11/30 | Loss: 0.1347 | Acc: 0.9576
Epoch 11/30 | Loss_val: 0.7015 | Acc_val: 0.8705
Epoch 12/30 | Loss: 0.1164 |

In [31]:
# Report loss and accuracy of the trained model on the test loader.
evaluate(model , test_loader , optimizer , criterion , device = "cuda")

test loss  1.1952 | test acc: 0.8667


In [27]:
sentence = "i love nlp".split()

# 1) Map words to indices; out-of-vocabulary words fall back to <UNK>
unk_idx = word_idx['<UNK>']
idxs = [word_idx.get(w, unk_idx) for w in sentence]

# 2) Index tensor with a batch dimension, on whatever device the model lives on
device = next(model.parameters()).device
tensor = torch.tensor(idxs, dtype=torch.long).unsqueeze(0).to(device)

# 3) Lengths tensor (1-D, kept on the CPU as pack_padded_sequence requires)
lengths = torch.tensor([len(idxs)], dtype=torch.long)

# 4) Run the model in inference mode: no autograd graph is needed for prediction
model.eval()
with torch.no_grad():
    output = model(tensor, lengths)

# `indices` holds the predicted tag index for each word (look it up in tag_idx)
y = torch.max(output , dim=2)
y


torch.return_types.max(
values=tensor([[16.3601, 21.0545, 13.9500]], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([[9, 5, 4]], device='cuda:0'))

Mapping kết quả ta được:

| Từ       | POS (nhãn)     | Index trong tag_idx |
| -------- | -------------- | ------------------- |
| **i**    | PRON (đại từ)  | **9**               |
| **love** | VERB (động từ) | **5**               |
| **nlp**  | NOUN (danh từ) | **4**               |

vậy kết quả thu được là hoàn toàn chính xác 


Đánh giá kết quả  mô hình 

| Tập dữ liệu             | Loss       | Accuracy   |
| ----------------------- | ---------- | ---------- |
| **Train (Epoch 30/30)** | **0.0149** | **0.9952** |
| **Validation**          | **1.1780** | **0.8667** |
| **Test**                | **1.1952** | **0.8667** |
