In [21]:
# ! pip install -U datasets torch transformers
# ! pip install torchcrf


In [22]:
from datasets import load_dataset
from collections import Counter
from itertools import chain

# Load the `surrey-nlp/PLOD-CW-25` dataset and keep handles to the two splits
# that the rest of the notebook reads from.
dataset = load_dataset("surrey-nlp/PLOD-CW-25")
train_data, test_data = dataset["train"], dataset["test"]


In [36]:
# Flatten the per-sentence token / tag lists of the training split.
all_tokens = list(chain.from_iterable(train_data["tokens"]))
all_tags = list(chain.from_iterable(train_data["ner_tags"]))

# Ids 0 and 1 are reserved for <PAD> and <UNK>, so real tokens start at 2.
# The vocabulary is sorted before enumeration: iterating a bare set of strings
# follows hash order, which differs between interpreter runs, so without the
# sort the token -> id mapping would not be reproducible across kernel restarts.
token2idx = {tok: i for i, tok in enumerate(sorted(set(all_tokens)), start=2)}
token2idx["<PAD>"] = 0
token2idx["<UNK>"] = 1

# Tags are mapped to consecutive ids in sorted order; idx2tag is the inverse.
tag_set = sorted(set(all_tags))
tag2idx = {tag: i for i, tag in enumerate(tag_set)}
idx2tag = {i: tag for tag, i in tag2idx.items()}


In [37]:
def encode_and_pad(data, pad_token_id=0, pad_label_id=-100):
    """Convert a split into id sequences right-padded to a common length.

    Tokens are looked up in the module-level ``token2idx`` (unknown tokens fall
    back to the ``"<UNK>"`` id) and tags in the module-level ``tag2idx``.

    Args:
        data: Object indexable by ``"tokens"`` and ``"ner_tags"``, each giving
            parallel per-sentence sequences.
        pad_token_id: Id appended to token sequences as padding.
        pad_label_id: Id appended to label sequences as padding.

    Returns:
        ``(inputs, labels)``: two lists of integer lists. Every inner list is
        padded to the length of the longest sentence in ``data``. Both lists
        are empty when ``data`` contains no sentences.

    Raises:
        KeyError: If a tag is not present in ``tag2idx``.
    """
    inputs, labels = [], []
    for tokens, tags in zip(data["tokens"], data["ner_tags"]):
        inputs.append([token2idx.get(tok, token2idx["<UNK>"]) for tok in tokens])
        labels.append([tag2idx[tag] for tag in tags])

    # default=0 keeps an empty split from raising ValueError inside max().
    max_len = max((len(seq) for seq in inputs), default=0)
    inputs = [seq + [pad_token_id] * (max_len - len(seq)) for seq in inputs]
    labels = [seq + [pad_label_id] * (max_len - len(seq)) for seq in labels]
    return inputs, labels

# Encode both splits with the padding ids spelled out: token padding is 0 and
# label padding is -100.
train_inputs, train_labels = encode_and_pad(
    train_data, pad_token_id=0, pad_label_id=-100
)
test_inputs, test_labels = encode_and_pad(
    test_data, pad_token_id=0, pad_label_id=-100
)


In [38]:
from torch.utils.data import Dataset, DataLoader
import torch

class NERDataset(Dataset):
    """Map-style dataset pairing encoded token ids with their label ids."""

    def __init__(self, encodings, labels):
        # Both arguments are indexable collections of integer sequences; they
        # are stored as given and only converted to tensors on access.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of two ``torch.long`` tensors."""
        token_ids = torch.tensor(self.encodings[idx], dtype=torch.long)
        label_ids = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'input_ids': token_ids, 'labels': label_ids}

# Wrap the encoded splits and batch them 32 at a time; only the training
# loader shuffles.
train_dataset = NERDataset(encodings=train_inputs, labels=train_labels)
test_dataset = NERDataset(encodings=test_inputs, labels=test_labels)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)


In [39]:
import torch.nn as nn
from sklearn.metrics import f1_score
import time

class BiLSTMTagger(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> per-token linear scorer."""

    def __init__(self, vocab_size, tagset_size, embedding_dim=32, hidden_dim=64):
        super().__init__()
        # Each LSTM direction gets half of hidden_dim; for an even hidden_dim
        # the concatenated output therefore matches the linear layer's input.
        per_direction = hidden_dim // 2
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=per_direction,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, input_ids):
        """Map a batch of token ids to unnormalised tag scores per position."""
        hidden_states, _ = self.lstm(self.embedding(input_ids))
        return self.fc(hidden_states)

# Softmax baseline: the loss skips every position whose label is -100, the
# value used for label padding when the splits were encoded.
model = BiLSTMTagger(vocab_size=len(token2idx), tagset_size=len(tag2idx))
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-3)


In [40]:
def evaluate(model, loader):
    """Macro-averaged F1 of argmax predictions over non-padding positions.

    Args:
        model: Callable module mapping a batch of token ids to per-position
            tag scores (the tag axis is last).
        loader: Iterable of batches, each a dict with "input_ids" and "labels".

    Returns:
        The macro F1 computed by ``f1_score`` over every position whose label
        is not -100.

    The model is switched to eval mode for the pass and put back into train
    mode before returning.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in loader:
            preds = model(batch["input_ids"]).argmax(dim=-1).cpu()
            labels = batch["labels"].cpu()
            # Boolean-mask indexing flattens row by row, so the kept positions
            # come out in the same order as a per-sentence loop would yield.
            keep = labels != -100
            predicted.extend(preds[keep].tolist())
            gold.extend(labels[keep].tolist())
    model.train()
    return f1_score(gold, predicted, average="macro")


In [41]:
# Train the softmax BiLSTM baseline for 20 epochs, reporting after each epoch
# the summed batch loss, the macro-F1 on the test loader, and wall-clock time.
# NOTE(review): `evaluate` is redefined further down for the CRF model; if that
# cell has been run, this loop will call the CRF version instead.
for epoch in range(20):
    start_time = time.time()
    total_loss = 0  # sum of per-batch losses, not an average
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        labels = batch["labels"]

        logits = model(input_ids)
        # Flatten to (positions, tags) vs (positions,) so every token is one
        # classification example; positions labelled -100 are skipped by loss_fn.
        loss = loss_fn(logits.view(-1, len(tag2idx)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # The reported time covers both the training pass and this evaluation.
    test_f1 = evaluate(model, test_loader)
    duration = time.time() - start_time
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, Test F1: {test_f1:.4f}, Time: {duration:.2f}s")


Epoch 1, Loss: 56.3071, Test F1: 0.2198, Time: 12.94s
Epoch 2, Loss: 37.6807, Test F1: 0.5458, Time: 8.76s
Epoch 3, Loss: 29.0124, Test F1: 0.6614, Time: 6.32s
Epoch 4, Loss: 24.4965, Test F1: 0.7166, Time: 5.54s
Epoch 5, Loss: 21.4510, Test F1: 0.7089, Time: 6.37s
Epoch 6, Loss: 18.8487, Test F1: 0.7322, Time: 5.57s
Epoch 7, Loss: 16.6470, Test F1: 0.7243, Time: 6.36s
Epoch 8, Loss: 14.7942, Test F1: 0.7561, Time: 5.53s
Epoch 9, Loss: 13.0917, Test F1: 0.7417, Time: 6.38s
Epoch 10, Loss: 11.4056, Test F1: 0.7392, Time: 5.59s
Epoch 11, Loss: 9.9521, Test F1: 0.7323, Time: 6.16s
Epoch 12, Loss: 8.6873, Test F1: 0.7421, Time: 5.72s
Epoch 13, Loss: 7.3983, Test F1: 0.7284, Time: 5.84s
Epoch 14, Loss: 6.3146, Test F1: 0.7425, Time: 6.07s
Epoch 15, Loss: 5.4538, Test F1: 0.7260, Time: 6.63s
Epoch 16, Loss: 4.7362, Test F1: 0.7450, Time: 6.16s
Epoch 17, Loss: 4.0168, Test F1: 0.7313, Time: 5.59s
Epoch 18, Loss: 3.4042, Test F1: 0.7362, Time: 6.34s
Epoch 19, Loss: 2.9285, Test F1: 0.7403, Tim

In [29]:
# ! pip install -U TorchCRF

In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
from sklearn.metrics import f1_score
import time

# Rebuild the vocabulary and tag maps for the CRF experiment.
# The vocabulary is sorted before enumeration: iterating a bare set of strings
# follows hash order, which differs between interpreter runs, so without the
# sort the token -> id mapping would not be reproducible across kernel restarts.
token2idx = {tok: i for i, tok in enumerate(sorted(set(all_tokens)), start=2)}
token2idx["<PAD>"] = 0
token2idx["<UNK>"] = 1

tag_set  = sorted(set(all_tags))
tag2idx  = {tag: i for i, tag in enumerate(tag_set)}
idx2tag  = {i: tag for tag, i in tag2idx.items()}

PAD_TOKEN_ID = token2idx["<PAD>"]
# Label padding uses the id of the "O" tag rather than -100, so every padded
# label is a valid tag index (presumably required by the CRF layer; confirm
# against the TorchCRF docs).
PAD_LABEL_ID = tag2idx["O"]

# Re-encode both splits with the CRF-specific padding ids; this overwrites the
# train/test inputs and labels produced for the softmax baseline.
train_inputs, train_labels = encode_and_pad(train_data, PAD_TOKEN_ID, PAD_LABEL_ID)
test_inputs,  test_labels  = encode_and_pad(test_data,  PAD_TOKEN_ID, PAD_LABEL_ID)

class NERDataset(Dataset):
    """Dataset view over parallel lists of encoded tokens and labels."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Size of the dataset, measured on the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Fetch one example as ``torch.long`` tensors keyed by field name."""
        sample = {}
        sample["input_ids"] = torch.tensor(self.encodings[idx], dtype=torch.long)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Rebuild the loaders over the re-encoded splits (batches of 32; only the
# training loader shuffles).
train_loader = DataLoader(
    dataset=NERDataset(train_inputs, train_labels),
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=NERDataset(test_inputs, test_labels),
    batch_size=32,
)

class BiLSTM_CRF(nn.Module):
    """BiLSTM encoder whose per-token scores are fed to a CRF layer.

    Positions whose token id equals the embedding's padding index are masked
    out for both decoding and the loss.
    """

    def __init__(self, vocab_size, tagset_size,
                 embedding_dim=32, hidden_dim=64,
                 pad_token_id=0):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_token_id,
        )
        # Each direction gets half of hidden_dim.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tagset_size)
        self.crf = CRF(tagset_size)

    def _pad_mask(self, input_ids):
        """Boolean tensor that is True wherever the token is not padding."""
        return input_ids != self.embedding.padding_idx

    def _get_emissions(self, input_ids):
        """Per-position tag scores produced by embedding -> LSTM -> linear."""
        embedded = self.embedding(input_ids)
        encoded, _ = self.lstm(embedded)
        return self.fc(encoded)

    def forward(self, input_ids):
        """Return the CRF's Viterbi decoding for a batch of token ids."""
        emissions = self._get_emissions(input_ids)
        return self.crf.viterbi_decode(emissions, self._pad_mask(input_ids))

    def loss(self, input_ids, tags):
        """Negated mean of the CRF's output for the gold ``tags``."""
        emissions = self._get_emissions(input_ids)
        log_likelihood = self.crf.forward(emissions, tags, self._pad_mask(input_ids))
        return -log_likelihood.mean()


def evaluate(model, loader):
    """Macro-averaged F1 of a decoding model over non-padding tokens.

    Args:
        model: Callable that, given a batch of token ids, returns one
            predicted tag sequence per sentence.
        loader: Iterable of batches, each a dict with "input_ids" and "labels".

    Returns:
        The macro F1 computed by ``f1_score``. For each sentence only the
        first ``k`` predictions and labels are scored, where ``k`` is the
        number of token ids different from ``PAD_TOKEN_ID``.

    The model is switched to eval mode for the pass and put back into train
    mode before returning.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids, labels = batch["input_ids"], batch["labels"]
            decoded = model(input_ids)
            # Count the real (non-padding) tokens of every sentence at once.
            lengths = (input_ids != PAD_TOKEN_ID).sum(dim=1).tolist()
            for pred_seq, label_seq, n_real in zip(decoded, labels, lengths):
                predicted.extend(pred_seq[:n_real])
                gold.extend(label_seq[:n_real].tolist())
    model.train()
    return f1_score(gold, predicted, average="macro")



In [34]:
# Build the BiLSTM-CRF model and train it for 20 epochs, printing the summed
# batch loss and the test-loader macro-F1 after each epoch.
crf_model = BiLSTM_CRF(len(token2idx), len(tag2idx), pad_token_id=PAD_TOKEN_ID)
# Rebinds the module-level `optimizer` name to an Adam instance over the CRF
# model's parameters.
optimizer = torch.optim.Adam(crf_model.parameters(), lr=2e-3)

for epoch in range(1, 21):
    total_loss = 0.0  # sum of per-batch losses, not an average
    for batch in train_loader:
        optimizer.zero_grad()
        # BiLSTM_CRF.loss returns the negated batch mean of the CRF output.
        loss = crf_model.loss(batch["input_ids"], batch["labels"])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    test_f1 = evaluate(crf_model, test_loader)
    print(f"Epoch {epoch:2d} | Loss: {total_loss:.4f} | Test F1: {test_f1:.4f}")


Epoch  1 | Loss: 2238.0042 | Test F1: 0.2652
Epoch  2 | Loss: 1492.8622 | Test F1: 0.5512
Epoch  3 | Loss: 1132.5842 | Test F1: 0.6690
Epoch  4 | Loss: 946.1746 | Test F1: 0.7211
Epoch  5 | Loss: 821.9778 | Test F1: 0.7436
Epoch  6 | Loss: 715.2990 | Test F1: 0.7514
Epoch  7 | Loss: 629.1771 | Test F1: 0.7623
Epoch  8 | Loss: 548.8198 | Test F1: 0.7655
Epoch  9 | Loss: 480.3982 | Test F1: 0.7662
Epoch 10 | Loss: 417.0347 | Test F1: 0.7686
Epoch 11 | Loss: 358.6615 | Test F1: 0.7634
Epoch 12 | Loss: 309.1900 | Test F1: 0.7620
Epoch 13 | Loss: 263.8379 | Test F1: 0.7580
Epoch 14 | Loss: 223.4335 | Test F1: 0.7615
Epoch 15 | Loss: 191.0393 | Test F1: 0.7540
Epoch 16 | Loss: 162.8920 | Test F1: 0.7470
Epoch 17 | Loss: 138.7516 | Test F1: 0.7484
Epoch 18 | Loss: 128.8066 | Test F1: 0.7475
Epoch 19 | Loss: 102.0246 | Test F1: 0.7496
Epoch 20 | Loss: 84.0187 | Test F1: 0.7464
