In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from collections import defaultdict
import numpy as np


In [4]:
def load_conll(path):
    sentences, tags = [], []
    sentence, tag_seq = [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # ngắt câu
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_seq)
                    sentence, tag_seq = [], []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, label = parts
                    sentence.append(token)
                    tag_seq.append(label)
    return sentences, tags

train_sentences, train_tags = load_conll("D:/Code/Week_3/train_word.conll")
test_sentences, test_tags = load_conll("D:/Code/Week_3/test_word.conll")

print(f"Train samples: {len(train_sentences)}, Test samples: {len(test_sentences)}")


Train samples: 5027, Test samples: 3000


In [9]:
tags = set()
for tag_seq in train_tags:
    for t in tag_seq:
        tags.add(t)

print(tags)

{'B-LOCATION', 'I-ORGANIZATION', 'B-DATE', 'B-GENDER', 'B-ORGANIZATION', 'B-PATIENT_ID', 'B-NAME', 'I-LOCATION', 'I-DATE', 'I-AGE', 'I-PATIENT_ID', 'B-JOB', 'I-NAME', 'B-TRANSPORTATION', 'B-AGE', 'I-JOB', 'I-TRANSPORTATION', 'O', 'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE'}


In [10]:
def build_vocab(sentences, tags):
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    tag2idx = {"<PAD>": 0}

    for s in sentences:
        for w in s:
            if w not in word2idx:
                word2idx[w] = len(word2idx)

    for tg_seq in tags:
        for t in tg_seq:
            if t not in tag2idx:
                tag2idx[t] = len(tag2idx)

    idx2tag = {v: k for k, v in tag2idx.items()}
    return word2idx, tag2idx, idx2tag

word2idx, tag2idx, idx2tag = build_vocab(train_sentences + test_sentences, train_tags + test_tags)
print("Vocab size:", len(word2idx), "  Tag size:", len(tag2idx))


Vocab size: 7254   Tag size: 21


In [11]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, word2idx, tag2idx, max_len=50):
        self.sentences = sentences
        self.tags = tags
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def encode_sentence(self, sentence):
        ids = [self.word2idx.get(w, 1) for w in sentence]  # 1 = <UNK>
        if len(ids) < self.max_len:
            ids += [0] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return ids

    def encode_tags(self, tags):
        ids = [self.tag2idx[t] for t in tags]
        if len(ids) < self.max_len:
            ids += [0] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return ids

    def __getitem__(self, idx):
        x = torch.tensor(self.encode_sentence(self.sentences[idx]), dtype=torch.long)
        y = torch.tensor(self.encode_tags(self.tags[idx]), dtype=torch.long)
        return x, y

train_data = NERDataset(train_sentences, train_tags, word2idx, tag2idx)
test_data = NERDataset(test_sentences, test_tags, word2idx, tag2idx)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)


In [12]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=300, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        emb = self.embedding(x)
        output, _ = self.lstm(emb)
        logits = self.fc(output)
        return logits

model = LSTMTagger(len(word2idx), len(tag2idx))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


LSTMTagger(
  (embedding): Embedding(7254, 300, padding_idx=0)
  (lstm): LSTM(300, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=21, bias=True)
)

In [13]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs.view(-1, len(tag2idx)), y_batch.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 0.8393
Epoch 2, Loss: 0.2356
Epoch 3, Loss: 0.1346
Epoch 4, Loss: 0.0948
Epoch 5, Loss: 0.0706


In [14]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        outputs = model(x_batch)
        preds = outputs.argmax(dim=-1)

        for i in range(len(x_batch)):
            for j in range(len(x_batch[i])):
                if y_batch[i, j] != 0:  # bỏ PAD
                    y_true.append(idx2tag[y_batch[i, j].item()])
                    y_pred.append(idx2tag[preds[i, j].item()])

print(classification_report(y_true, y_pred))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                       precision    recall  f1-score   support

                <PAD>       0.00      0.00      0.00         0
                B-AGE       0.93      0.81      0.86       554
               B-DATE       0.95      0.94      0.94      1617
             B-GENDER       0.93      0.84      0.89       438
                B-JOB       0.76      0.38      0.51       172
           B-LOCATION       0.88      0.85      0.87      4327
               B-NAME       0.92      0.41      0.57       304
       B-ORGANIZATION       0.86      0.73      0.79       761
         B-PATIENT_ID       0.94      0.88      0.91      1913
B-SYMPTOM_AND_DISEASE       0.94      0.71      0.81      1113
     B-TRANSPORTATION       0.89      0.73      0.80       191
                I-AGE       0.00      0.00      0.00         6
               I-DATE       0.96      0.97      0.97      1680
                I-JOB       1.00      0.03      0.05       114
           I-LOCATION       0.90      0.81      0.86  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
def predict_sentence(model, sentence):
    model.eval()
    x = torch.tensor([word2idx.get(w, 1) for w in sentence], dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        preds = model(x).argmax(dim=-1).squeeze(0)
    return [(w, idx2tag[p.item()]) for w, p in zip(sentence, preds)]

test_sentence = ["bệnh_nhân", "được", "đưa", "đến", "bệnh_viện", "Bạch_Mai", "ngày", "24", "-", "7"]
print(predict_sentence(model, test_sentence))


[('bệnh_nhân', 'O'), ('được', 'O'), ('đưa', 'O'), ('đến', 'O'), ('bệnh_viện', 'O'), ('Bạch_Mai', 'I-LOCATION'), ('ngày', 'O'), ('24', 'B-DATE'), ('-', 'I-DATE'), ('7', 'I-DATE')]
