In [11]:
# Các thư viện, package cần thiết:
!pip install conllu
!pip install torch



In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset
from conllu import parse
from conllu import parse_incr
from collections import defaultdict
from sklearn.model_selection import train_test_split
import numpy as np
import os
import random

In [30]:
# Tổng hợp dữ liệu từ file .conllu trả về một danh sách các câu.
def load_conllu_data(data_path):
    all_sentences = []
    for file_name in os.listdir(data_path):
        if file_name.endswith('.conllu'):
            with open(os.path.join(data_path, file_name), 'r', encoding='utf-8') as file:
                for sentence in parse_incr(file):
                    all_sentences.append(sentence)
    return all_sentences

data_path = './data/UD_Vietnamese-VTB'
all_sentences = load_conllu_data(data_path)

print(f"Total sentences collected: {len(all_sentences)}")


Total sentences collected: 3323


In [31]:
# Xây dựng từ vựng và tập hợp nhãn từ các câu đã đọc.
def build_vocab(sentences):
    word_vocab = defaultdict(lambda: len(word_vocab))
    tag_vocab = defaultdict(lambda: len(tag_vocab))

    for sentence in sentences:
        for token in sentence:
            word = token['form']
            tag = token['upostag']
            word_vocab[word]
            tag_vocab[tag]

    # Đóng băng từ vựng để tránh thêm từ mới
    word_vocab.default_factory = None
    tag_vocab.default_factory = None

    return word_vocab, tag_vocab

word_vocab, tag_vocab = build_vocab(all_sentences)
tag2idx = {tag: idx for idx, tag in enumerate(tag_vocab)}

print(f"Vocabulary size: {len(word_vocab)}, Tag set size: {len(tag_vocab)}")


Vocabulary size: 7488, Tag set size: 17


In [32]:
#Chuyển đổi mỗi câu thành một tensor, bao gồm các từ và nhãn tương ứng.
def sentence_to_tensor(sentence, word_vocab, tag_vocab):
    words = [word_vocab[token['form']] for token in sentence]
    tags = [tag_vocab[token['upostag']] for token in sentence]
    return torch.tensor(words, dtype=torch.long), torch.tensor(tags, dtype=torch.long)

sentences_tensors = [sentence_to_tensor(sentence, word_vocab, tag_vocab) for sentence in all_sentences]
train_data, test_data = train_test_split(sentences_tensors, test_size=0.2, random_state=26)

print(f"Training data size: {len(train_data)}, Testing data size: {len(test_data)}")


Training data size: 2658, Testing data size: 665


In [33]:
# Tạo hàm collate_fn tùy chỉnh để padding các tensor
def collate_fn(batch):
    words, tags = zip(*batch)
    words_padded = pad_sequence(words, batch_first=True, padding_value=0)
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=-1)  # -1 cho padding
    lengths = [len(seq) for seq in words]
    return words_padded, tags_padded, lengths

# Tạo Dataset tùy chỉnh
class DependencyParsingDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Tạo DataLoader với hàm collate_fn tùy chỉnh
train_dataset = DependencyParsingDataset(train_data)
test_dataset = DependencyParsingDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


In [34]:
# Xây dựng mô hình BiLSTMParser
class BiLSTMParser(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMParser, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        x = self.embedding(x)
        x = self.dropout(x)
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.fc(output)
        return output

In [35]:
# Hàm huấn luyện mô hình
def train_model(model, train_loader, criterion, optimizer, num_epochs=20):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for words, tags, lengths in train_loader:
            optimizer.zero_grad()
            outputs = model(words, lengths)

            # Chuyển đổi các tensor để tính toán loss chính xác
            outputs = outputs.view(-1, output_dim)
            tags = tags.view(-1)

            loss = criterion(outputs, tags)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")
        scheduler.step()
        

In [36]:
def calculate_metrics(predicted, tags):
    true_positives = (predicted == tags).sum().item()
    false_positives = (predicted != tags).sum().item()
    false_negatives = (predicted != tags).sum().item()
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1_score


In [None]:
# Khởi tạo mô hình, hàm loss và optimizer
vocab_size = len(word_vocab)
embedding_dim = 200
hidden_dim = 256
output_dim = len(tag_vocab)

model = BiLSTMParser(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Bỏ qua giá trị padding
optimizer = optim.Adam(model.parameters(),lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1) #cập nhật learning rate tăng tính hội tụ

# Huấn luyện mô hình
train_model(model, train_loader, criterion, optimizer)


Epoch 1, Loss: 0.9288103168918973
Epoch 2, Loss: 0.33178286751111347
Epoch 3, Loss: 0.18008885780970255
Epoch 4, Loss: 0.10787290228264672
Epoch 5, Loss: 0.06870698946572486


In [26]:
def evaluate_model(model, test_loader):
    model.eval()
    total_correct = 0
    total_tags = 0
    with torch.no_grad():
        for words, tags, lengths in test_loader:
            outputs = model(words, lengths)
            outputs = outputs.view(-1, output_dim)
            tags = tags.view(-1)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == tags).sum().item()
            total_tags += tags.size(0)
    accuracy = total_correct / total_tags
    precision, recall, f1_score = calculate_metrics(predicted, tags)
    return accuracy, precision, recall, f1_score
accuracy, precision, recall, f1_score = evaluate_model(model, test_loader)
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}")


Accuracy: 0.2593, Precision: 0.4507, Recall: 0.4507, F1-score: 0.4507


In [27]:
import matplotlib
matplotlib.use('Qt5Agg')  
import matplotlib.pyplot as plt
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
values = [accuracy, precision, recall, f1_score]

# Vẽ đồ thị
plt.bar(metrics, values)
plt.xlabel('Metrics')
plt.ylabel('Value')
plt.title('Evaluation Metrics')
plt.ylim(0, 1)
plt.show()

In [28]:
# Vẽ cây
import nltk
from nltk import Tree

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def conllu_to_nltk_tree(sentence):
    def token_to_nltk_tree(token):
        word = token['form']
        head = int(token['head'])
        deprel = token['deprel']
        upos = token['upostag']  # Thêm nhãn từ loại (POS tag)
        return (word, upos, head, deprel)

    def build_tree(tokens):
        root = None
        token_dict = {token['id']: token for token in tokens}
        children = {token['id']: [] for token in tokens}

        for token in tokens:
            head_id = token['head']
            if head_id == 0:
                root = token['id']
            else:
                children[head_id].append(token['id'])

        def create_tree_node(token_id):
            token = token_dict[token_id]
            word, upos, _, _ = token_to_nltk_tree(token)  # Lấy thông tin từ loại (POS tag)
            subtree = [create_tree_node(child_id) for child_id in children[token_id]]
            return Tree(upos + "(" + word + ")", subtree)  # Chèn nhãn từ loại vào cây

        return create_tree_node(root)

    nltk_tree = build_tree([token for token in sentence])
    return nltk_tree

# Vẽ cây cho câu đầu tiên
import random
example_sentence = all_sentences[random.randint(1, 1000)]
nltk_tree = conllu_to_nltk_tree(example_sentence)

print("Câu mẫu:", " " .join([token['form'] for token in example_sentence]))

# Vẽ cây
nltk_tree.pretty_print()
plt.figure(figsize=(7,7))

nltk_tree.draw()
plt.savefig("dependency_tree.png") 

Câu mẫu: Các loại thú rừng được nhốt trong bao , lồng sắt .
                                         VERB(nhốt)                                        
              _______________________________|________________________________________      
         NOUN(loại)                |                NOUN(bao)                         |    
    _________|__________           |          __________|_________                    |     
   |                NOUN(thú)      |         |                NOUN(lồng)              |    
   |                    |          |         |           _________|__________         |     
DET(Các)            NOUN(rừng) AUX(được) ADP(trong)  PUNCT(,)            NOUN(sắt) PUNCT(.)
   |                    |          |         |          |                    |        |     
  ...                  ...        ...       ...        ...                  ...      ...   

