In [64]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.metrics import classification_report
from tqdm import tqdm
import pickle

import warnings
warnings.filterwarnings('ignore')

In [65]:
def build_vocab(file_path):
    word2idx = defaultdict(lambda: len(word2idx))
    tag2idx = defaultdict(lambda: len(tag2idx))
    word2idx["<PAD>"] = 0
    word2idx["<UNK>"] = 1
    # tag2idx["O"] = 0
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip():
                parts = line.strip().split(' ')
                word, tag = parts[0], parts[3]
                word2idx[word]
                tag2idx[tag]
    return dict(word2idx), dict(tag2idx)

In [66]:
# Build vocab for POS and Chunk
def build_aux_vocab(data_file):
    pos_vocab, chunk_vocab = {"<PAD>": 0, "<UNK>": 1}, {"<PAD>": 0, "<UNK>": 1}
    with open(data_file, 'r') as f:
        for line in f:
            if line.strip():
                _, pos, chunk, _ = line.split()
                if pos not in pos_vocab:
                    pos_vocab[pos] = len(pos_vocab)
                if chunk not in chunk_vocab:
                    chunk_vocab[chunk] = len(chunk_vocab)
    return pos_vocab, chunk_vocab


In [67]:
class NERDataset(Dataset):
    def __init__(self, file_path, word2idx, tag2idx, pos2idx, chunk2idx, max_len):
        self.sentences, self.labels, self.pos_tags, self.chunks = [], [], [], []
        sentence, labels, pos_tags, chunks = [], [], [], []

        with open(file_path, 'r') as f:
            for line in f:
                if line.strip():
                    word, pos, chunk, tag = line.split()
                    sentence.append(word2idx.get(word, word2idx["<UNK>"]))
                    labels.append(tag2idx[tag])
                    pos_tags.append(pos2idx.get(pos, pos2idx["<UNK>"]))
                    chunks.append(chunk2idx.get(chunk, chunk2idx["<UNK>"]))
                else:
                    if sentence:
                        self.sentences.append(self.pad_or_truncate(sentence, max_len))
                        self.labels.append(self.pad_or_truncate(labels, max_len))
                        self.pos_tags.append(self.pad_or_truncate(pos_tags, max_len))
                        self.chunks.append(self.pad_or_truncate(chunks, max_len))
                    sentence, labels, pos_tags, chunks = [], [], [], []

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx])
        labels = torch.tensor(self.labels[idx])
        pos_tags = torch.tensor(self.pos_tags[idx])
        chunks = torch.tensor(self.chunks[idx])
        return sentence, pos_tags, chunks, labels

    def pad_or_truncate(self, sequence, max_len):
        # Padding
        if len(sequence) < max_len:
            sequence = sequence + [0] * (max_len - len(sequence))
        # Truncation
        if len(sequence) > max_len:
            sequence = sequence[:max_len]
        return sequence


In [68]:
# class NERDataset(Dataset):
#     def __init__(self, file_path, word2idx, tag2idx, max_len):
#         self.sentences, self.labels = self._read_data(file_path)
#         self.word2idx = word2idx
#         self.tag2idx = tag2idx
#         self.max_len = max_len

#     def _read_data(self, file_path):
#         sentences, labels = [], []
#         sentence, label = [], []
#         with open(file_path, 'r') as f:
#             for line in f:
#                 if line.strip() == "":
#                     if sentence:
#                         sentences.append(sentence)
#                         labels.append(label)
#                         sentence, label = [], []
#                 else:
#                     parts = line.strip().split()
#                     word, tag = parts[0], parts[3]
#                     sentence.append(word)
#                     label.append(tag)
#         if sentence:
#             sentences.append(sentence)
#             labels.append(label)
#         return sentences, labels

#     def __len__(self):
#         return len(self.sentences)

#     def __getitem__(self, idx):
#         sentence = self.sentences[idx]
#         label = self.labels[idx]

#         # Convert to indices
#         word_indices = [self.word2idx.get(w, self.word2idx["<PAD>"]) for w in sentence]
#         tag_indices = [self.tag2idx.get(t, self.tag2idx["O"]) for t in label]

#         # Pad sequences
#         word_indices = word_indices[:self.max_len] + [self.word2idx["<PAD>"]] * (self.max_len - len(word_indices))
#         tag_indices = tag_indices[:self.max_len] + [self.tag2idx["O"]] * (self.max_len - len(tag_indices))

#         return torch.tensor(word_indices), torch.tensor(tag_indices)


In [69]:
# 2. Load Pretrained Embeddings and Prepare Dictionaries
def load_pretrained_embeddings(file_path, word2idx):
    embedding_matrix = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            if word in word2idx:
                embedding_matrix.append(vector)
    return np.array(embedding_matrix)


def load_pretrained_embeddings(file_path, embedding_type='glove'):
    embeddings = {}

    if embedding_type == 'glove':  # GloVe Format
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector

    elif embedding_type == 'word2vec':  # Word2Vec Text Format
        with open(file_path, 'r', encoding='utf-8') as f:
            next(f)  # Skip the first line (header line with vocab size, dimensions)
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector

    elif embedding_type == 'fasttext':  # FastText Format
        with open(file_path, 'r', encoding='utf-8') as f:
            next(f)  # Skip the first line (header line)
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector

    else:
        raise ValueError(f"Unsupported embedding type: {embedding_type}")

    return embeddings

def create_embedding_matrix(vocab, pretrained_embeddings, embedding_dim, unk_token='<UNK>'):
    vocab_size = len(vocab)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, idx in vocab.items():
        if word in pretrained_embeddings:
            embedding_matrix[idx] = pretrained_embeddings[word]
        else:
            embedding_matrix[idx] = np.random.normal(size=(embedding_dim,))  # Random for unknowns

    # Handle the unknown token if present
    if unk_token in vocab:
        embedding_matrix[vocab[unk_token]] = np.random.normal(size=(embedding_dim,))

    return torch.tensor(embedding_matrix, dtype=torch.float32)

In [70]:
# class BiLSTMNER(nn.Module):
#     def __init__(self, vocab_size, tag_size, embed_dim, hidden_dim):
#         super(BiLSTMNER, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim * 2, tag_size)

#     def forward(self, x):
#         embeds = self.embedding(x)
#         lstm_out, _ = self.lstm(embeds)
#         logits = self.fc(lstm_out)
#         return logits

In [71]:
class BiLSTMNER(nn.Module):
    def __init__(self, vocab_size, pos_size, chunk_size, tagset_size, embed_dim, pos_dim, chunk_dim, hidden_dim):
        super(BiLSTMNER, self).__init__()
        self.word_embeds = nn.Embedding(vocab_size, embed_dim)
        self.pos_embeds = nn.Embedding(pos_size, pos_dim)
        self.chunk_embeds = nn.Embedding(chunk_size, chunk_dim)
        self.lstm = nn.LSTM(embed_dim + pos_dim + chunk_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, words, pos_tags, chunks):
        word_embeds = self.word_embeds(words)
        pos_embeds = self.pos_embeds(pos_tags)
        chunk_embeds = self.chunk_embeds(chunks)
        combined = torch.cat((word_embeds, pos_embeds, chunk_embeds), dim=-1)
        lstm_out, _ = self.lstm(combined)
        logits = self.fc(lstm_out)
        return logits


In [72]:
def train_model(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    
    for batch in tqdm(loader, desc="Training", unit="batch"):
        words, pos_tags, chunks, labels = batch
        
        words = words.to(device)
        pos_tags = pos_tags.to(device)
        chunks = chunks.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()

        # Forward pass
        outputs = model(words, pos_tags, chunks)
        
        # Flatten outputs and labels for loss calculation
        loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    return avg_loss


In [73]:
def evaluate_model(model, loader, idx2tag, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            words, pos_tags, chunks, labels = batch
            words = words.to(device)
            pos_tags = pos_tags.to(device)
            chunks = chunks.to(device)
            labels = labels.to(device)
            
            outputs = model(words, pos_tags, chunks)
            predictions = torch.argmax(outputs, dim=-1)
            all_preds.extend(predictions.view(-1).tolist())
            all_labels.extend(labels.view(-1).tolist())
    
    # Remove padding tokens
    valid_preds = []
    valid_labels = []
    
    for p, l in zip(all_preds, all_labels):
        # Kiểm tra xem nhãn có trong idx2tag không trước khi sử dụng
        # if l in idx2tag and idx2tag[l] != "O":
        valid_preds.append(idx2tag[p])
        valid_labels.append(idx2tag[l])
    
    return classification_report(valid_labels, valid_preds, output_dict=False)


In [74]:
def save_model(model, path, word2idx, pos2idx, chunk2idx, tag2idx):
    torch.save(model.state_dict(), f"{path}.pth")
    with open(f"{path}_vocab.pkl", "wb") as f:
        pickle.dump({"word2idx": word2idx, "pos2idx": pos2idx, "chunk2idx": chunk2idx, "tag2idx": tag2idx}, f)
    print(f"Model and vocab saved to {path}.pth and {path}_vocab.pkl")

def load_model(model_class, path, vocab_path, embed_dim, pos_dim, chunk_dim, hidden_dim, device):
    # Load vocab
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    word2idx, pos2idx, chunk2idx, tag2idx = vocab["word2idx"], vocab["pos2idx"], vocab["chunk2idx"], vocab["tag2idx"]
    
    # Khởi tạo lại mô hình với đầy đủ tham số
    model = model_class(len(word2idx), len(pos2idx), len(chunk2idx), len(tag2idx), embed_dim, pos_dim, chunk_dim, hidden_dim)
    
    # Load state_dict
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    
    return model, word2idx, pos2idx, chunk2idx, tag2idx


def predict(model, text, word2idx, pos2idx, chunk2idx, idx2tag, max_len, device):
    words = text.split()
    # Map words to indices
    word_indices = [word2idx.get(w, word2idx["<PAD>"]) for w in words]
    word_indices = word_indices[:max_len] + [word2idx["<PAD>"]] * (max_len - len(word_indices))

    # Generate dummy POS tags and chunks (e.g., assuming "NN" and "B-NP" for simplicity)
    pos_indices = [pos2idx.get("NN", pos2idx["<PAD>"])] * len(words)
    chunk_indices = [chunk2idx.get("B-NP", chunk2idx["<PAD>"])] * len(words)

    # Pad POS and chunks
    pos_indices = pos_indices[:max_len] + [pos2idx["<PAD>"]] * (max_len - len(pos_indices))
    chunk_indices = chunk_indices[:max_len] + [chunk2idx["<PAD>"]] * (max_len - len(chunk_indices))

    # Prepare tensors
    word_tensor = torch.tensor([word_indices]).to(device)
    pos_tensor = torch.tensor([pos_indices]).to(device)
    chunk_tensor = torch.tensor([chunk_indices]).to(device)

    # Make predictions
    with torch.no_grad():
        outputs = model(word_tensor, pos_tensor, chunk_tensor)
        predictions = torch.argmax(outputs, dim=-1).squeeze(0).tolist()

    # Convert predictions to tags
    tags = [idx2tag[idx] for idx in predictions[:len(words)]]
    return list(zip(words, tags))


In [75]:
# Paths
train_file = "data/eng/eng.train"
val_file = "data/eng/eng.testa"
test_file = "data/eng/eng.testb"

# Build vocabulary
pos2idx, chunk2idx = build_aux_vocab("data/eng/eng.train")
word2idx, tag2idx = build_vocab(train_file)
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [87]:
pos2idx

{'<PAD>': 0,
 '<UNK>': 1,
 'NNP': 2,
 'VBZ': 3,
 'JJ': 4,
 'NN': 5,
 'TO': 6,
 'VB': 7,
 '.': 8,
 'CD': 9,
 'DT': 10,
 'VBD': 11,
 'IN': 12,
 'PRP': 13,
 'NNS': 14,
 'VBP': 15,
 'MD': 16,
 'VBN': 17,
 'POS': 18,
 'JJR': 19,
 '"': 20,
 'RB': 21,
 ',': 22,
 'FW': 23,
 'CC': 24,
 'WDT': 25,
 '(': 26,
 ')': 27,
 ':': 28,
 'PRP$': 29,
 'RBR': 30,
 'VBG': 31,
 'EX': 32,
 'WP': 33,
 'WRB': 34,
 '-X-': 35,
 '$': 36,
 'RP': 37,
 'NNPS': 38,
 'SYM': 39,
 'RBS': 40,
 'UH': 41,
 'PDT': 42,
 "''": 43,
 'LS': 44,
 'JJS': 45,
 'WP$': 46,
 'NN|SYM': 47}

In [76]:
# Hyperparameters
embed_dim = 100
pos_dim = 50
chunk_dim = 50
hidden_dim = 128
max_len = 50
batch_size = 32
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [77]:
train_dataset = NERDataset(train_file, word2idx, tag2idx, pos2idx, chunk2idx, max_len)
val_dataset = NERDataset(val_file, word2idx, tag2idx, pos2idx, chunk2idx, max_len)
test_dataset = NERDataset(test_file, word2idx, tag2idx, pos2idx, chunk2idx, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [78]:
# # Calculate class weights
# num_tags = len(tag2idx)
# tag_counts = [0] * num_tags
# for _, labels in train_dataset:
#     for tag in labels.tolist():
#         tag_counts[tag] += 1
# total_tags = sum(tag_counts)
# class_weights = [total_tags / count if count > 0 else 0.0 for count in tag_counts]

# # Convert to tensor and move to device
# weights = torch.tensor(class_weights).to(device)

# # Define loss function with weights
# criterion = nn.CrossEntropyLoss(ignore_index=0, weight=weights)

In [79]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, ignore_index=-1):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(ignore_index=self.ignore_index, reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)  # Probabilities of the true class
        focal_loss = self.alpha * ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean()

# Replace criterion with FocalLoss
criterion = FocalLoss(alpha=1, gamma=2, ignore_index=0)


In [80]:
# Model, optimizer, loss
model = BiLSTMNER(len(word2idx), len(pos2idx), len(chunk2idx), len(tag2idx), embed_dim, pos_dim, chunk_dim, hidden_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [81]:
# Train and validate
for epoch in range(epochs):
    print(f"Epoch [{epoch + 1}/{epochs}]")
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f"Training Loss: {train_loss:.4f}")

    print("Validation:")
    val_report = evaluate_model(model, val_loader, idx2tag, device)
    print(val_report)
    print("===================================================================================")

# Final test evaluation
print("Final Test Evaluation:")
test_report = evaluate_model(model, test_loader, idx2tag, device)
print(test_report)

Epoch [1/5]


Training: 100%|██████████| 469/469 [00:03<00:00, 155.56batch/s]


Training Loss: 0.0237
Validation:
              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.02      0.79      0.04      2088
      I-MISC       0.58      0.52      0.55      1258
       I-ORG       0.00      0.00      0.00    124126
       I-PER       0.67      0.91      0.77      3053
           O       0.54      0.99      0.70     42721

    accuracy                           0.27    173250
   macro avg       0.30      0.53      0.34    173250
weighted avg       0.15      0.27      0.19    173250

Epoch [2/5]


Training: 100%|██████████| 469/469 [00:02<00:00, 170.43batch/s]


Training Loss: 0.0091
Validation:
              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.68      0.87      0.76      2088
      I-MISC       0.60      0.75      0.66      1258
       I-ORG       0.00      0.00      0.00    124126
       I-PER       0.75      0.89      0.81      3053
           O       0.26      0.99      0.41     42721

    accuracy                           0.28    173250
   macro avg       0.38      0.58      0.44    173250
weighted avg       0.09      0.28      0.13    173250

Epoch [3/5]


Training: 100%|██████████| 469/469 [00:02<00:00, 169.73batch/s]


Training Loss: 0.0050
Validation:
              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.80      0.80      0.80      2088
      I-MISC       0.69      0.74      0.71      1258
       I-ORG       0.00      0.00      0.00    124126
       I-PER       0.67      0.94      0.78      3053
           O       0.26      0.99      0.41     42721

    accuracy                           0.28    173250
   macro avg       0.40      0.58      0.45    173250
weighted avg       0.09      0.28      0.13    173250

Epoch [4/5]


Training: 100%|██████████| 469/469 [00:02<00:00, 170.13batch/s]


Training Loss: 0.0025
Validation:
              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.69      0.78      0.73      2088
      I-MISC       0.61      0.81      0.70      1258
       I-ORG       0.00      0.00      0.00    124126
       I-PER       0.58      0.95      0.72      3053
           O       0.26      0.99      0.41     42721

    accuracy                           0.28    173250
   macro avg       0.36      0.59      0.43    173250
weighted avg       0.09      0.28      0.13    173250

Epoch [5/5]


Training: 100%|██████████| 469/469 [00:02<00:00, 168.82batch/s]


Training Loss: 0.0011
Validation:
              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         4
       I-LOC       0.63      0.87      0.74      2088
      I-MISC       0.44      0.75      0.55      1258
       I-ORG       0.00      0.00      0.00    124126
       I-PER       0.61      0.94      0.74      3053
           O       0.26      0.99      0.41     42721

    accuracy                           0.28    173250
   macro avg       0.32      0.59      0.41    173250
weighted avg       0.09      0.28      0.13    173250

Final Test Evaluation:
              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         6
      B-MISC       0.00      0.00      0.00         9
       B-ORG       0.00      0.00      0.00         5
       I-LOC       0.55      0.82      0.66      1905
      I-MISC       0.34      0.66      0.45       908
       I-ORG       0.00      0.00      0.00    140249
       I-PER       0.

In [82]:
# Save model
model_path = "save/models/bilstm"
save_model(model, model_path, word2idx, pos2idx, chunk2idx, tag2idx)

# Load model and predict
# Load model and predict
loaded_model, loaded_word2idx, loaded_pos2idx, loaded_chunk2idx, loaded_tag2idx = load_model(
    BiLSTMNER, f"{model_path}.pth", f"{model_path}_vocab.pkl", embed_dim, pos_dim, chunk_dim, hidden_dim, device
)
loaded_idx2tag = {idx: tag for tag, idx in loaded_tag2idx.items()}


Model and vocab saved to save/models/bilstm.pth and save/models/bilstm_vocab.pkl


In [83]:
text = "The European Union is headquartered in Brussels"
result = predict(model, text, word2idx, pos2idx, chunk2idx, idx2tag, max_len=50, device=device)
print(result)

[('The', 'O'), ('European', 'I-MISC'), ('Union', 'O'), ('is', 'O'), ('headquartered', 'O'), ('in', 'O'), ('Brussels', 'O')]
