# Assignment 2: Named Entity Recognition (NER)

This notebook implements and compares two traditional NER approaches on a benchmark dataset.

## Goals
- Choose a benchmark NER dataset (WikiAnn English)
- Train a CRF model with handcrafted features
- Train a BiLSTM model as a neural baseline
- Tune hyperparameters on validation data
- Evaluate with entity-level precision, recall, and F1
- Analyze common error patterns


In [1]:
# Install dependencies (run once)
!pip -q install datasets seqeval sklearn-crfsuite torch tqdm pandas

In [None]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from datasets import load_dataset, load_from_disk
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

import sklearn_crfsuite
from sklearn_crfsuite import metrics as crf_metrics

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x112eb6db0>

In [None]:
# Load WikiAnn English from local disk (fallback to Hugging Face if missing)
local_path = Path("Assignment2_Name_Entity_Recognition/wikiann-en")
if local_path.exists():
    raw = load_from_disk(str(local_path))
    dataset_name = "wikiann-en (local)"
else:
    raw = load_dataset("wikiann", "en")
    dataset_name = "wikiann-en"

label_list = raw["train"].features["ner_tags"].feature.names
pos_list = raw["train"].features["pos_tags"].feature.names if "pos_tags" in raw["train"].features else None

print("Dataset:", dataset_name)
print("Labels:", label_list)
print("Train size:", len(raw["train"]))
print("Validation size:", len(raw["validation"]))
print("Test size:", len(raw["test"]))

# Convert a record to (token, pos) sequence and label sequence
def to_sent(record):
    tokens = record["tokens"]
    if pos_list is None:
        pos = ["X"] * len(tokens)
    else:
        pos = [pos_list[i] for i in record["pos_tags"]]
    labels = [label_list[i] for i in record["ner_tags"]]
    return list(zip(tokens, pos)), labels

train_sents = [to_sent(r) for r in raw["train"]]
val_sents = [to_sent(r) for r in raw["validation"]]
test_sents = [to_sent(r) for r in raw["test"]]

Generating validation split: 100%|██████████| 10000/10000 [00:00<00:00, 658374.12 examples/s]
Generating test split: 100%|██████████| 10000/10000 [00:00<00:00, 928580.22 examples/s]
Generating train split: 100%|██████████| 20000/20000 [00:00<00:00, 850702.58 examples/s]


Dataset: wikiann-en
Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
Train size: 20000
Validation size: 10000
Test size: 10000


In [6]:
# CRF feature extraction

def word2features(sent, i):
    word, pos = sent[i]
    features = {
        "bias": 1.0,
        "word.lower": word.lower(),
        "word.isupper": word.isupper(),
        "word.istitle": word.istitle(),
        "word.isdigit": word.isdigit(),
        "pos": pos,
        "suffix3": word[-3:],
        "suffix2": word[-2:],
        "prefix1": word[:1],
        "prefix2": word[:2],
    }

    if i > 0:
        prev_word, prev_pos = sent[i - 1]
        features.update({
            "-1:word.lower": prev_word.lower(),
            "-1:word.istitle": prev_word.istitle(),
            "-1:word.isupper": prev_word.isupper(),
            "-1:pos": prev_pos,
        })
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        next_word, next_pos = sent[i + 1]
        features.update({
            "+1:word.lower": next_word.lower(),
            "+1:word.istitle": next_word.istitle(),
            "+1:word.isupper": next_word.isupper(),
            "+1:pos": next_pos,
        })
    else:
        features["EOS"] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(labels):
    return labels

X_train = [sent2features(s) for s, _ in train_sents]
y_train = [sent2labels(l) for _, l in train_sents]
X_val = [sent2features(s) for s, _ in val_sents]
y_val = [sent2labels(l) for _, l in val_sents]
X_test = [sent2features(s) for s, _ in test_sents]
y_test = [sent2labels(l) for _, l in test_sents]

# Hyperparameter tuning for CRF
labels_no_o = [l for l in label_list if l != "O"]

crf_params = [
    {"c1": 0.01, "c2": 0.01},
    {"c1": 0.1, "c2": 0.01},
    {"c1": 0.1, "c2": 0.1},
    {"c1": 0.5, "c2": 0.1},
    {"c1": 0.5, "c2": 0.5},
]

best_crf = None
best_f1 = -1.0
for p in crf_params:
    crf = sklearn_crfsuite.CRF(
        algorithm="lbfgs",
        c1=p["c1"],
        c2=p["c2"],
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)
    val_pred = crf.predict(X_val)
    f1 = crf_metrics.flat_f1_score(
        y_val, val_pred, average="weighted", labels=labels_no_o
    )
    print("CRF params", p, "val F1", round(f1, 4))
    if f1 > best_f1:
        best_f1 = f1
        best_crf = crf

print("Best CRF val F1:", round(best_f1, 4))

# Evaluate CRF on test
crf_test_pred = best_crf.predict(X_test)
print("CRF test metrics")
print(classification_report(y_test, crf_test_pred, digits=4))
crf_test_f1 = f1_score(y_test, crf_test_pred)
crf_test_p = precision_score(y_test, crf_test_pred)
crf_test_r = recall_score(y_test, crf_test_pred)
print("CRF test P/R/F1:", round(crf_test_p, 4), round(crf_test_r, 4), round(crf_test_f1, 4))

CRF params {'c1': 0.01, 'c2': 0.01} val F1 0.7665
CRF params {'c1': 0.1, 'c2': 0.01} val F1 0.7727
CRF params {'c1': 0.1, 'c2': 0.1} val F1 0.7776
CRF params {'c1': 0.5, 'c2': 0.1} val F1 0.7766
CRF params {'c1': 0.5, 'c2': 0.5} val F1 0.7717
Best CRF val F1: 0.7776
CRF test metrics
              precision    recall  f1-score   support

         LOC     0.7070    0.7275    0.7171      4657
         ORG     0.6469    0.5633    0.6022      4745
         PER     0.7987    0.8071    0.8028      4556

   micro avg     0.7198    0.6977    0.7086     13958
   macro avg     0.7175    0.6993    0.7074     13958
weighted avg     0.7165    0.6977    0.7060     13958

CRF test P/R/F1: 0.7198 0.6977 0.7086


In [7]:
# BiLSTM model

# Build vocab from training data
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
PAD_TAG = "<PAD>"

word_counts = {}
for sent, _ in train_sents:
    for w, _ in sent:
        word_counts[w] = word_counts.get(w, 0) + 1

vocab = [PAD_TOKEN, UNK_TOKEN] + sorted(word_counts.keys())
word2id = {w: i for i, w in enumerate(vocab)}
id2word = {i: w for w, i in word2id.items()}

label_list_bilstm = label_list + [PAD_TAG]
label2id = {l: i for i, l in enumerate(label_list_bilstm)}
id2label = {i: l for l, i in label2id.items()}

pad_id = word2id[PAD_TOKEN]
pad_tag_id = label2id[PAD_TAG]

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, items, word2id, label2id):
        self.items = items
        self.word2id = word2id
        self.label2id = label2id

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        sent, labels = self.items[idx]
        tokens = [w for w, _ in sent]
        x = [self.word2id.get(w, self.word2id[UNK_TOKEN]) for w in tokens]
        y = [self.label2id[l] for l in labels]
        return {"tokens": tokens, "input_ids": x, "tag_ids": y}


def collate_batch(batch):
    max_len = max(len(b["input_ids"]) for b in batch)
    input_ids = []
    tag_ids = []
    lengths = []
    tokens = []
    for b in batch:
        l = len(b["input_ids"])
        lengths.append(l)
        tokens.append(b["tokens"])
        input_ids.append(b["input_ids"] + [pad_id] * (max_len - l))
        tag_ids.append(b["tag_ids"] + [pad_tag_id] * (max_len - l))
    return {
        "tokens": tokens,
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "tag_ids": torch.tensor(tag_ids, dtype=torch.long),
        "lengths": torch.tensor(lengths, dtype=torch.long),
    }


def build_loaders(batch_size):
    train_ds = NERDataset(train_sents, word2id, label2id)
    val_ds = NERDataset(val_sents, word2id, label2id)
    test_ds = NERDataset(test_sents, word2id, label2id)
    return (
        DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch),
        DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch),
        DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch),
    )


class BiLSTMTagger(nn.Module):
    def __init__(self, vocab_size, tag_size, embed_dim, hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim, batch_first=True, bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim * 2, tag_size)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        return self.classifier(x)


def eval_bilstm(model, loader, device):
    model.eval()
    all_true = []
    all_pred = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            lengths = batch["lengths"].to(device)
            logits = model(input_ids)
            preds = logits.argmax(-1).cpu().tolist()
            for i, pred_seq in enumerate(preds):
                length = lengths[i].item()
                true_seq = batch["tag_ids"][i][:length].tolist()
                all_true.append([id2label[t] for t in true_seq])
                all_pred.append([id2label[p] for p in pred_seq[:length]])
    return all_true, all_pred


def train_bilstm(config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, _ = build_loaders(config["batch_size"])
    model = BiLSTMTagger(
        vocab_size=len(vocab),
        tag_size=len(label_list_bilstm),
        embed_dim=config["embed_dim"],
        hidden_dim=config["hidden_dim"],
        dropout=config["dropout"],
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_tag_id)

    for epoch in range(config["epochs"]):
        model.train()
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            tag_ids = batch["tag_ids"].to(device)
            logits = model(input_ids)
            loss = loss_fn(logits.view(-1, logits.size(-1)), tag_ids.view(-1))
            loss.backward()
            optimizer.step()

        val_true, val_pred = eval_bilstm(model, val_loader, device)
        val_f1 = f1_score(val_true, val_pred)
        print("Val F1:", round(val_f1, 4))

    return model

bilstm_configs = [
    {"embed_dim": 100, "hidden_dim": 128, "dropout": 0.2, "lr": 1e-3, "batch_size": 32, "epochs": 3},
    {"embed_dim": 100, "hidden_dim": 256, "dropout": 0.3, "lr": 5e-4, "batch_size": 32, "epochs": 3},
]

best_bilstm = None
best_bilstm_f1 = -1.0
best_config = None

for cfg in bilstm_configs:
    print("Training config:", cfg)
    model = train_bilstm(cfg)
    _, val_loader, _ = build_loaders(cfg["batch_size"])
    val_true, val_pred = eval_bilstm(
        model, val_loader, torch.device("cuda" if torch.cuda.is_available() else "cpu")
    )
    val_f1 = f1_score(val_true, val_pred)
    if val_f1 > best_bilstm_f1:
        best_bilstm_f1 = val_f1
        best_bilstm = model
        best_config = cfg

print("Best BiLSTM val F1:", round(best_bilstm_f1, 4))
print("Best config:", best_config)

# Evaluate BiLSTM on test
_, _, test_loader = build_loaders(best_config["batch_size"])
test_true, test_pred = eval_bilstm(
    best_bilstm, test_loader, torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
print("BiLSTM test metrics")
print(classification_report(test_true, test_pred, digits=4))

bilstm_test_f1 = f1_score(test_true, test_pred)
bilstm_test_p = precision_score(test_true, test_pred)
bilstm_test_r = recall_score(test_true, test_pred)
print("BiLSTM test P/R/F1:", round(bilstm_test_p, 4), round(bilstm_test_r, 4), round(bilstm_test_f1, 4))

Training config: {'embed_dim': 100, 'hidden_dim': 128, 'dropout': 0.2, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}


Epoch 1: 100%|██████████| 625/625 [00:12<00:00, 51.11it/s]


Val F1: 0.381


Epoch 2: 100%|██████████| 625/625 [00:11<00:00, 54.06it/s]


Val F1: 0.4502


Epoch 3: 100%|██████████| 625/625 [00:11<00:00, 54.59it/s]


Val F1: 0.4678
Training config: {'embed_dim': 100, 'hidden_dim': 256, 'dropout': 0.3, 'lr': 0.0005, 'batch_size': 32, 'epochs': 3}


Epoch 1: 100%|██████████| 625/625 [00:17<00:00, 35.93it/s]


Val F1: 0.2794


Epoch 2: 100%|██████████| 625/625 [00:17<00:00, 35.60it/s]


Val F1: 0.3684


Epoch 3: 100%|██████████| 625/625 [00:17<00:00, 35.91it/s]


Val F1: 0.4162
Best BiLSTM val F1: 0.4678
Best config: {'embed_dim': 100, 'hidden_dim': 128, 'dropout': 0.2, 'lr': 0.001, 'batch_size': 32, 'epochs': 3}
BiLSTM test metrics
              precision    recall  f1-score   support

         LOC     0.3771    0.6618    0.4804      4657
         ORG     0.3800    0.4145    0.3965      4745
         PER     0.5924    0.4482    0.5103      4556

   micro avg     0.4222    0.5080    0.4611     13958
   macro avg     0.4498    0.5082    0.4624     13958
weighted avg     0.4484    0.5080    0.4617     13958

BiLSTM test P/R/F1: 0.4222 0.508 0.4611


In [8]:
# Compare model metrics
results_df = pd.DataFrame(
    [
        {"model": "CRF", "precision": crf_test_p, "recall": crf_test_r, "f1": crf_test_f1},
        {"model": "BiLSTM", "precision": bilstm_test_p, "recall": bilstm_test_r, "f1": bilstm_test_f1},
    ]
)
results_df

Unnamed: 0,model,precision,recall,f1
0,CRF,0.71984,0.697664,0.708579
1,BiLSTM,0.422184,0.508024,0.461143


In [9]:
# Error analysis on CRF predictions

def collect_errors(tokens_list, y_true, y_pred, max_examples=20):
    errors = []
    type_counts = {}
    for tokens, true_seq, pred_seq in zip(tokens_list, y_true, y_pred):
        for tok, t, p in zip(tokens, true_seq, pred_seq):
            if t != p:
                errors.append((tok, t, p))
                t_type = t.split("-")[-1] if "-" in t else t
                type_counts[t_type] = type_counts.get(t_type, 0) + 1
    return errors[:max_examples], type_counts

# Use CRF outputs for analysis
crf_tokens_test = [[w for w, _ in sent] for sent, _ in test_sents]
error_examples, error_counts = collect_errors(crf_tokens_test, y_test, crf_test_pred)

print("Top error types:")
print(sorted(error_counts.items(), key=lambda x: x[1], reverse=True)[:10])

print("Sample misclassified tokens:")
for tok, t, p in error_examples:
    print(f"{tok:>15}  true={t:>7}  pred={p:>7}")

Top error types:
[('ORG', 4315), ('LOC', 2382), ('PER', 2051), ('O', 1606)]
Sample misclassified tokens:
            Eli  true=  B-ORG  pred=  B-PER
          Stone  true=  I-ORG  pred=  I-PER
             in  true=  I-ORG  pred=      O
            the  true=  I-ORG  pred=      O
         United  true=  I-ORG  pred=  B-LOC
         States  true=  I-ORG  pred=  I-LOC
          Willy  true=  B-PER  pred=      O
          Unger  true=  I-PER  pred=      O
        Palermo  true=  B-ORG  pred=  B-PER
      Cathedral  true=  I-ORG  pred=  I-PER
           List  true=  B-PER  pred=  B-ORG
             of  true=  I-PER  pred=  I-ORG
            The  true=  I-PER  pred=  I-ORG
           O.C.  true=  I-PER  pred=  I-ORG
     characters  true=  I-PER  pred=  I-ORG
           Jack  true=  B-ORG  pred=  B-PER
            and  true=  I-ORG  pred=  I-PER
          Diane  true=  I-ORG  pred=  I-PER
        2008–09  true=  B-ORG  pred=  B-LOC
       Beşiktaş  true=  I-ORG  pred=  I-LOC


## Notes for the report
- Record the best validation and test metrics from both models.
- Summarize common error types and give plausible reasons.
- Mention limitations (no character-level features, small hyperparameter sweep).
- Suggest future work (BiLSTM-CRF, pretrained embeddings, contextual encoders).