In [47]:
import os
import torch
import torch.nn as nn
from TorchCRF import CRF

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [33]:
def read_data(file_path):
    sentences, sentence = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                word, pos, chunk, ner = line.split()
                sentence.append((word, pos, chunk, ner))
        if sentence:
            sentences.append(sentence)  # Thêm câu cuối cùng
    return sentences

In [34]:
class NERDataset(Dataset):
    def __init__(self, sentences, word2idx, tag2idx, max_len):
        self.sentences = sentences
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        words = [word for word, _, _, _ in sentence]
        tags = [tag for _, _, _, tag in sentence]

        word_indices = [self.word2idx.get(w, self.word2idx["<UNK>"]) for w in words]
        tag_indices = [self.tag2idx.get(t, self.tag2idx["O"]) for t in tags]

        # Padding
        word_indices = word_indices[:self.max_len] + [self.word2idx["<PAD>"]] * (self.max_len - len(word_indices))
        tag_indices = tag_indices[:self.max_len] + [self.tag2idx["O"]] * (self.max_len - len(tag_indices))

        mask = [1 if i != self.word2idx["<PAD>"] else 0 for i in word_indices]

        return torch.tensor(word_indices), torch.tensor(tag_indices), torch.tensor(mask)

In [35]:
def create_vocab(sentences):
    words = set()
    tags = set()
    for sentence in sentences:
        for word, _, _, tag in sentence:
            words.add(word)
            tags.add(tag)
    word2idx = {word: idx for idx, word in enumerate(sorted(words), start=2)}
    word2idx["<PAD>"] = 0
    word2idx["<UNK>"] = 1
    tag2idx = {tag: idx for idx, tag in enumerate(sorted(tags))}
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

In [36]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.hidden_dim = hidden_dim
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, words, tags=None, mask=None):
        embeddings = self.embedding(words)
        lstm_out, _ = self.lstm(embeddings)
        logits = self.hidden2tag(lstm_out)
        
        if mask is not None:
            mask = mask.bool()
        
        if tags is not None:
            loss = -self.crf(logits, tags, mask=mask, reduction="mean")
            return loss
        else:
            predictions = self.crf.decode(logits, mask=mask)
            return predictions

In [37]:
def train_model(model, train_loader, val_loader, optimizer, epochs, device):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for words, tags, mask in tqdm(train_loader, unit='batch', desc=f"Training {epoch + 1}/{epochs}"):
            words, tags, mask = words.to(device), tags.to(device), mask.to(device)
            loss = model(words, tags, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"|----> Loss: {total_loss:.4f}")
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for words, tags, mask in val_loader:
                words, tags, mask = words.to(device), tags.to(device), mask.to(device)
                val_loss += model(words, tags, mask).item()
        print(f"   |----> Validation Loss: {val_loss:.4f}")

In [38]:
def evaluate_model(model, test_loader, idx2tag, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for words, tags, mask in test_loader:
            words, tags, mask = words.to(device), tags.to(device), mask.to(device)
            predictions = model(words, mask=mask)
            for pred, true, m in zip(predictions, tags, mask):
                all_preds.extend([idx2tag[p] for p, mask_val in zip(pred, m) if mask_val])
                all_labels.extend([idx2tag[t.item()] for t, mask_val in zip(true, m) if mask_val])
    print(classification_report(all_labels, all_preds))

In [39]:
train_file = 'data/eng/eng.train'
val_file = 'data/eng/eng.testa'
test_file = 'data/eng/eng.testb'

train_sentences = read_data(train_file)
val_sentences = read_data(val_file)
test_sentences = read_data(test_file)


word2idx, tag2idx, idx2tag = create_vocab(train_sentences)

max_len = 50
batch_size = 32

train_dataset = NERDataset(train_sentences, word2idx, tag2idx, max_len)
val_dataset = NERDataset(val_sentences, word2idx, tag2idx, max_len)
test_dataset = NERDataset(test_sentences, word2idx, tag2idx, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
embedding_dim=100
hidden_dim=128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word2idx), len(tag2idx), embedding_dim=embedding_dim, hidden_dim=hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, val_loader, optimizer, epochs=10, device=device)

evaluate_model(model, test_loader, idx2tag, device)


Training 1/10: 100%|██████████| 469/469 [00:34<00:00, 13.40batch/s]


|----> Loss: 2961.7138
   |----> Validation Loss: 476.8134


Training 2/10: 100%|██████████| 469/469 [00:36<00:00, 12.77batch/s]


|----> Loss: 1314.9378
   |----> Validation Loss: 341.3157


Training 3/10: 100%|██████████| 469/469 [00:35<00:00, 13.20batch/s]


|----> Loss: 734.3369
   |----> Validation Loss: 280.7171


Training 4/10: 100%|██████████| 469/469 [00:36<00:00, 12.97batch/s]


|----> Loss: 411.0350
   |----> Validation Loss: 263.7713


Training 5/10: 100%|██████████| 469/469 [00:35<00:00, 13.09batch/s]


|----> Loss: 218.3527
   |----> Validation Loss: 269.7187


Training 6/10: 100%|██████████| 469/469 [00:37<00:00, 12.61batch/s]


|----> Loss: 108.8634
   |----> Validation Loss: 275.0793


Training 7/10: 100%|██████████| 469/469 [00:36<00:00, 12.80batch/s]


|----> Loss: 55.2397
   |----> Validation Loss: 298.9442


Training 8/10: 100%|██████████| 469/469 [00:36<00:00, 12.82batch/s]


|----> Loss: 27.4958
   |----> Validation Loss: 308.8414


Training 9/10: 100%|██████████| 469/469 [00:36<00:00, 12.74batch/s]


|----> Loss: 15.9794
   |----> Validation Loss: 354.0932


Training 10/10: 100%|██████████| 469/469 [00:36<00:00, 12.99batch/s]


|----> Loss: 10.4698
   |----> Validation Loss: 360.9075
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         6
      B-MISC       0.33      0.11      0.17         9
       B-ORG       0.00      0.00      0.00         5
       I-LOC       0.82      0.67      0.74      1905
      I-MISC       0.81      0.61      0.69       908
       I-ORG       0.43      0.80      0.56      2480
       I-PER       0.87      0.70      0.77      2691
           O       0.98      0.95      0.96     38378

    accuracy                           0.91     46382
   macro avg       0.53      0.48      0.49     46382
weighted avg       0.93      0.91      0.92     46382



In [None]:
# torch.save(model.state_dict(), "save/models/bilstm_crf.pth")

model = BiLSTM_CRF(len(word2idx), len(tag2idx), embedding_dim=100, hidden_dim=128)
model.load_state_dict(torch.load("save/models/bilstm_crf.pth"))
model.to(device)


BiLSTM_CRF(
  (embedding): Embedding(23626, 100)
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (hidden2tag): Linear(in_features=256, out_features=8, bias=True)
  (crf): CRF(num_tags=8)
)

In [None]:
def evaluate_model_by_accuracy(model, test_loader, idx2tag, device):
    model.to(device)
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for words, tags, mask in test_loader:
            words, tags, mask = words.to(device), tags.to(device), mask.to(device)
            mask = mask.bool() 
            predictions = model(words, mask=mask)
            
            for i in range(len(words)):
                true_tags = tags[i][mask[i]].cpu().numpy()
                pred_tags = predictions[i]
                
                all_labels.extend(true_tags)
                all_preds.extend(pred_tags)

    all_labels_filtered = [l for l in all_labels if idx2tag[l] != 'O']
    all_preds_filtered = [p for l, p in zip(all_labels, all_preds) if idx2tag[l] != 'O']

    overall_accuracy = accuracy_score(all_labels, all_preds)
    filtered_accuracy = accuracy_score(all_labels_filtered, all_preds_filtered)

    return overall_accuracy, filtered_accuracy


In [None]:
overall_acc, filtered_acc = evaluate_model_by_accuracy(model, test_loader, idx2tag, device)
print(f"Overall Accuracy: {overall_acc * 100:.2f}%")
print(f"Filtered Accuracy (excluding 'O' tags): {filtered_acc * 100:.2f}%")


Overall Accuracy: 90.94%
Filtered Accuracy (excluding 'O' tags): 71.25%
