In [None]:
# Build a Named Entity Recognition (NER) system
# Measure Accuracy, Precision, Recall, and F1-score


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Sample real-world (news-style) NER dataset.
# Each sentence is a list of tokens; `labels` holds one BIO tag per token at
# the same position in the matching sentence.
sentences = [
    "John lives in New York".split(),
    "Apple was founded by Steve Jobs".split(),
    "India won the cricket match".split(),
]

labels = [
    "B-PER O O B-LOC I-LOC".split(),
    "B-ORG O O O B-PER I-PER".split(),
    "B-LOC O O O O".split(),
]


In [None]:
# Build the token and tag lookup tables.
# Index 0 is reserved up front for the "<PAD>" token and for the "O" tag;
# every other entry gets the next free index in order of first appearance.
word2idx = {"<PAD>": 0}
tag2idx = {"O": 0}

for sent in sentences:
    for word in sent:
        # setdefault leaves existing entries alone, so a repeated token keeps
        # the index it was given the first time it was seen.
        word2idx.setdefault(word, len(word2idx))

for tag_seq in labels:
    for tag in tag_seq:
        tag2idx.setdefault(tag, len(tag2idx))

# Reverse mapping: tag index -> tag string.
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))


In [None]:
# Custom Dataset class
class NERDataset(Dataset):
    """Token-level NER dataset of index-encoded sentences and tag sequences.

    Tokens and tags are encoded once, at construction time, using the
    module-level ``word2idx`` and ``tag2idx`` lookup tables.

    Args:
        sentences: Iterable of sentences, each an iterable of token strings.
            Every token must be a key of ``word2idx``.
        labels: Iterable of tag sequences aligned with ``sentences``. Every
            tag must be a key of ``tag2idx``.

    Raises:
        ValueError: If the number of sentences and tag sequences differ, or
            if a sentence and its tag sequence have different lengths.
        KeyError: If a token or tag is missing from its lookup table.
    """

    def __init__(self, sentences, labels):
        # Materialise first so generators are accepted and can be measured.
        sentences = [list(sent) for sent in sentences]
        labels = [list(tag_seq) for tag_seq in labels]

        # Fail early on misaligned data; otherwise the mismatch only shows up
        # later as a shape error inside the loss computation.
        if len(sentences) != len(labels):
            raise ValueError(
                f"Got {len(sentences)} sentences but {len(labels)} tag sequences"
            )
        for i, (sent, tag_seq) in enumerate(zip(sentences, labels)):
            if len(sent) != len(tag_seq):
                raise ValueError(
                    f"Sentence {i} has {len(sent)} tokens but {len(tag_seq)} tags"
                )

        self.X = [[word2idx[w] for w in sent] for sent in sentences]
        self.y = [[tag2idx[t] for t in tag_seq] for tag_seq in labels]

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(token_ids, tag_ids)`` for sentence ``idx`` as 1-D long tensors."""
        # dtype is explicit because torch.tensor([]) would otherwise default to
        # float32, which nn.Embedding and CrossEntropyLoss do not accept.
        return (
            torch.tensor(self.X[idx], dtype=torch.long),
            torch.tensor(self.y[idx], dtype=torch.long),
        )


In [None]:
# NER Model using BiLSTM
class NERModel(nn.Module):
    """Bidirectional-LSTM tagger: embedding -> BiLSTM -> per-token linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_size: Number of output scores produced for each token.
        embed_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM in each direction.
    """

    def __init__(self, vocab_size, tag_size, embed_dim=64, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward hidden states are concatenated, so the
        # classifier input is twice the per-direction hidden size.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=tag_size)

    def forward(self, x):
        """Map a batch-first tensor of token ids to per-token tag scores."""
        embedded = self.embedding(x)
        # The LSTM also returns its final (hidden, cell) state; only the
        # per-token outputs are needed here.
        token_states, _ = self.lstm(embedded)
        tag_scores = self.fc(token_states)
        return tag_scores


In [None]:
# Training the model
# Hyperparameters are named here so they are easy to find and tune.
SEED = 42
NUM_EPOCHS = 20
LEARNING_RATE = 0.01

# Seed PyTorch's global RNG before the model and the shuffling DataLoader are
# created, so weight initialisation and batch order repeat on every
# Restart & Run All (on the same machine and library versions).
torch.manual_seed(SEED)

dataset = NERDataset(sentences, labels)
# batch_size=1 lets sentences of different lengths be fed without padding.
loader = DataLoader(dataset, batch_size=1, shuffle=True)

num_tags = len(tag2idx)
model = NERModel(len(word2idx), num_tags)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    model.train()
    for X, y in loader:
        optimizer.zero_grad()
        outputs = model(X)
        # Flatten to (tokens, num_tags) vs. (tokens,) so every token
        # contributes one classification term to the loss.
        loss = criterion(outputs.view(-1, num_tags), y.view(-1))
        loss.backward()
        optimizer.step()

print("Training completed")


In [None]:
# Evaluation: token-level Accuracy, Precision, Recall, F1-score
# NOTE: `loader` is the same loader the model was trained on, so these numbers
# measure fit to the training sentences rather than generalisation.
model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for X, y in loader:
        outputs = model(X)
        # Pick the highest-scoring tag index for every token.
        predictions = torch.argmax(outputs, dim=-1)

        true_labels.extend(y.view(-1).tolist())
        pred_labels.extend(predictions.view(-1).tolist())

accuracy = accuracy_score(true_labels, pred_labels)
# Macro averaging gives every tag equal weight regardless of frequency.
# zero_division=0 scores a tag with no predicted (or no true) samples as 0,
# the same value the default produces, but without the undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, pred_labels, average="macro", zero_division=0
)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1 Score :", f1)
