### Import Libraries

In [397]:
# use for file handling, downloading and extracting data
import os
import requests
import zipfile
# data processing and numerical computations
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from collections import defaultdict

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# TensorFlow Keras, use pad_sequences to ensure uniform input sizes
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Data Downloading

#### Upload dataset to Google Drive, then download, extract and load data

In [398]:
# download file from Google Drive link and extract it
def download_and_extract_zip(drive_link, save_path="dataset.zip", extract_to="dataset"):

    # get the file ID from the Google Drive link
    file_id = drive_link.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # download the file
    response = requests.get(download_url, stream=True)
    if response.status_code == 200:
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print("Download complete")

    # extract the ZIP file
    with zipfile.ZipFile(save_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Extraction complete")

In [399]:
# load data from the extracted file, handling different encodings
def load_data(file_path):

    encodings = ["utf-8", "ISO-8859-1", "windows-1252"]

    for enc in encodings:
        try:
            with open(file_path, "r", encoding=enc) as file:
                lines = file.readlines()
            print(f"File loaded successfully using encoding: {enc}")
            return lines
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {enc}")

    raise ValueError("Unable to read file")


In [400]:
# get Google Drive link, extract and take the dataset
drive_link = "https://drive.google.com/file/d/1AhESU6mPPW_UeRG4y2ojlAuH8Vyf4NlB/view?usp=sharing"
download_and_extract_zip(drive_link)

# load dataset
dataset_path = "dataset/GMB_dataset.txt"
data = load_data(dataset_path)

Download complete
Extraction complete
Failed to load with encoding: utf-8
File loaded successfully using encoding: ISO-8859-1


### Data Preprocessing

In [401]:
with open("dataset/GMB_dataset.txt", "r", encoding="ISO-8859-1") as file:
    lines = file.readlines()

# split lines into columns
data = [line.strip().split() for line in lines if line.strip()]  # remove empty lines

# convert data to DataFrame
df = pd.DataFrame(data, columns=["Index", "Sentence #", "Word", "POS", "Tag"])

# remove the first row as the header row from file
df = df.iloc[1:].reset_index(drop=True)

# check format
df

Unnamed: 0,Index,Sentence #,Word,POS,Tag
0,0,1.0,Thousands,NNS,O
1,1,1.0,of,IN,O
2,2,1.0,demonstrators,NNS,O
3,3,1.0,have,VBP,O
4,4,1.0,marched,VBN,O
...,...,...,...,...,...
66156,66156,2999.0,be,VB,O
66157,66157,2999.0,announced,VBN,O
66158,66158,2999.0,within,IN,B-tim
66159,66159,2999.0,days,NNS,O


In [402]:
# print unique tag
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', None, 'B-nat',
       'B-eve', 'I-eve', 'I-nat'], dtype=object)

In [403]:
print(df.isnull().sum())  # check if any columns have missing values

# fill missing values
df = df.fillna(method = 'ffill')

# check for missing values after processing
print("Missing values per column:\n", df.isnull().sum())

# print few head rows
print(df.head())

Index         0
Sentence #    0
Word          0
POS           0
Tag           1
dtype: int64
Missing values per column:
 Index         0
Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64
  Index Sentence #           Word  POS Tag
0     0        1.0      Thousands  NNS   O
1     1        1.0             of   IN   O
2     2        1.0  demonstrators  NNS   O
3     3        1.0           have  VBP   O
4     4        1.0        marched  VBN   O


  df = df.fillna(method = 'ffill')


In [404]:
# group words and tags by sentence
sentence_dict = defaultdict(list)
for _, row in df.iterrows():
    sentence_id = row["Sentence #"]
    sentence_dict[sentence_id].append((row["Word"], row["Tag"]))

In [405]:
# extract aligned sentences and tags
sentences, tags = [], []
for sentence_id, values in sentence_dict.items():
    words, ner_tags = zip(*values)  # unpack word-tag pairs
    sentences.append(list(words))
    tags.append(list(ner_tags))

# check alignment
assert len(sentences) == len(tags), "Sentences and tags mismatch!"
print(f"Total sentences: {len(sentences)} (Tags: {len(tags)})")

Total sentences: 2999 (Tags: 2999)


In [406]:
# create vocabulary mappings

# Word vocabulary
all_words = set(word for sentence in sentences for word in sentence)
word_to_index = {
    "PAD": 0,
    "UNK": 1,
    **{word: idx+2 for idx, word in enumerate(all_words)}
}

# Tag vocabulary
all_tags = set(tag for tag_list in tags for tag in tag_list)
tag_to_index = {
    "PAD": 0,
    **{tag: idx+1 for idx, tag in enumerate(all_tags)}
}

In [407]:
# convert to numerical indices
X = [
    [word_to_index.get(word, word_to_index["UNK"]) for word in sentence]
    for sentence in sentences
]
y = [
    [tag_to_index[tag] for tag in tag_list]
    for tag_list in tags
]

In [408]:
# split dataset before padding
# use last 10% of data as test set
split_idx = int(0.9 * len(sentences))  # 90% for training+validation, 10% for test
X_train_val, X_test_val = X[:split_idx], X[split_idx:]
y_train_val, y_test_val = y[:split_idx], y[split_idx:]
# X_test_val, y_test_val: the last 10% of sentences

X_train, X_test, y_train, y_test = train_test_split(
    X_train_val, y_train_val,
    test_size=0.2,
    random_state=42, shuffle = False
)

In [409]:
# pad sequences after splitting
max_len = max(len(s) for s in X_train)  # set max length

# padding="post": pad at the end of sequences
# value=word_to_index["PAD"]: use the padding token's index for padding

X_train = pad_sequences(
    X_train, maxlen=max_len, padding="post", value=word_to_index["PAD"]
)
X_test = pad_sequences(
    X_test, maxlen=max_len, padding="post", value=word_to_index["PAD"]
)

# pad the tag sequences in y_train to match the length of the input sequences
y_train = pad_sequences(
    y_train, maxlen=max_len, padding="post", value=tag_to_index["PAD"]
)
y_test = pad_sequences(
    y_test, maxlen=max_len, padding="post", value=tag_to_index["PAD"]
)

In [410]:
# check shapes and indices
print("\nFinal shapes:")
X_train_val = pad_sequences(X_train_val, padding='post', maxlen=100)
X_test_val = pad_sequences(X_test_val, padding='post', maxlen=100)
y_train_val = pad_sequences(y_train_val, padding='post', maxlen=100)
y_test_val = pad_sequences(y_test_val, padding='post', maxlen=100)

X_train_val = np.array(X_train_val)
y_train_val = np.array(y_train_val)
X_test_val = np.array(X_test_val)
y_test_val = np.array(y_test_val)

print(f"X_train_val: {X_train_val.shape}, y_train_val: {y_train_val.shape}")
print(f"X_test_val: {X_test_val.shape}, y_test_val: {y_test_val.shape}")

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

print("\nSample sentence:", sentences[0])

print("Mapped indices:", X_train[0])
print("Sample tags:", tags[0])
print("Mapped tags:", y_train[0])


Final shapes:
X_train_val: (2699, 100), y_train_val: (2699, 100)
X_test_val: (300, 100), y_test_val: (300, 100)
X_train: (2159, 62), y_train: (2159, 62)
X_test: (540, 62), y_test: (540, 62)

Sample sentence: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
Mapped indices: [3186 3670 5078  653 7384 4796 3407  561 8688 2447  504 1585 3996 2037
  735 2447 1540 3670 2275 4910 1258 7986 2617 1315    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]
Sample tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
Mapped tags: [ 1  1  1  1  1  1 10  1  1  1  1  1 10  1  1  1  1  1 11  1  1  1  1  1
  0  0  

### Build Model

#### Bidirectional LSTM-CRF Models for Sequence Tagging

In [411]:
!pip install pytorch-crf



In [412]:
from torchcrf import CRF

# define the BiLSTM-CRF model class
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()
        # embedding layer to convert word indices to word embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # LSTM layer: processes the embedded input sequences
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,  # bidirectional split
            bidirectional=True,
            batch_first=True
        )

        # fully connected layer: maps the LSTM output to tag predictions
        self.fc = nn.Linear(hidden_dim, num_tags)

        # CRF layer: performs sequence labeling with a CRF layer on top of LSTM output
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        # forward pass through embedding layer
        x = self.embedding(x)

        x, _ = self.lstm(x)
        # pass the LSTM output through the fully connected layer
        emissions = self.fc(x)

        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask)
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)

In [413]:
# intialize model with hyperparameter
vocab_size = len(word_to_index)  # from preprocessing
num_tags = len(tag_to_index)     # from preprocessing
model = BiLSTM_CRF(
    vocab_size=vocab_size,
    embedding_dim=100,
    hidden_dim=50,
    num_tags=num_tags
)

### Train Model

#### Prepare Data

In [414]:
from torchcrf import CRF

# convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# create mask tensor (1 for real tokens, 0 for padding)
mask = (X_train_tensor != word_to_index["PAD"]).bool()

# create dataset and dataloader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor, mask)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [415]:
# define optimizer
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

# training loop
num_epochs = 15

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (inputs, targets, mask_batch) in enumerate(train_loader):
        # zero gradients
        optimizer.zero_grad()

        # forward pass
        loss = model(inputs, tags=targets, mask=mask_batch)

        # backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # batch progress
        if batch_idx % 10 == 0:
            print(f'Epoch: {epoch+1:02} | Batch: {batch_idx:03} | Loss: {loss.item():.3f}')

    # print epoch statistics
    avg_loss = total_loss / len(train_loader)
    print(f'\nEpoch: {epoch+1:02} | Average Loss: {avg_loss:.3f}\n')


Epoch: 01 | Batch: 000 | Loss: 1779.817
Epoch: 01 | Batch: 010 | Loss: 508.060
Epoch: 01 | Batch: 020 | Loss: 492.444
Epoch: 01 | Batch: 030 | Loss: 476.062
Epoch: 01 | Batch: 040 | Loss: 379.368
Epoch: 01 | Batch: 050 | Loss: 336.338
Epoch: 01 | Batch: 060 | Loss: 440.214

Epoch: 01 | Average Loss: 600.885

Epoch: 02 | Batch: 000 | Loss: 446.505
Epoch: 02 | Batch: 010 | Loss: 318.414
Epoch: 02 | Batch: 020 | Loss: 404.325
Epoch: 02 | Batch: 030 | Loss: 317.095
Epoch: 02 | Batch: 040 | Loss: 326.448
Epoch: 02 | Batch: 050 | Loss: 307.499
Epoch: 02 | Batch: 060 | Loss: 442.742

Epoch: 02 | Average Loss: 387.846

Epoch: 03 | Batch: 000 | Loss: 347.380
Epoch: 03 | Batch: 010 | Loss: 295.722
Epoch: 03 | Batch: 020 | Loss: 370.117
Epoch: 03 | Batch: 030 | Loss: 286.520
Epoch: 03 | Batch: 040 | Loss: 353.608
Epoch: 03 | Batch: 050 | Loss: 271.258
Epoch: 03 | Batch: 060 | Loss: 272.825

Epoch: 03 | Average Loss: 324.130

Epoch: 04 | Batch: 000 | Loss: 297.850
Epoch: 04 | Batch: 010 | Loss: 26

In [416]:
# save model
torch.save(model.state_dict(), "bilstm_crf_ner_model.pt")

In [417]:
# predict function
def predict(sentence, model, word_to_index, tag_to_index):
    # convert sentence to indices
    indexed_sentence = [word_to_index.get(word, word_to_index["UNK"]) for word in sentence]

    # convert to tensor and add batch dimension
    inputs = torch.tensor([indexed_sentence], dtype=torch.long)
    mask = (inputs != word_to_index["PAD"]).bool()

    # predice
    model.eval()
    with torch.no_grad():
        tags = model(inputs, mask=mask)

    # convert indices to tags
    index_to_tag = {v: k for k, v in tag_to_index.items()}
    return [(word, index_to_tag[tag]) for word, tag in zip(sentence, tags[0])]

In [418]:
# convert test data to tensors
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# check
test_mask = (X_test_tensor != word_to_index["PAD"]).bool()

### Evaluation

##### Use Precision, Recall and F1 to evaluate model

In [419]:
# import necessary metrics for evaluation
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, tag_to_index):
    model.eval()  # set the model to evaluation mode
    total_correct = 0  # store the count of correctly predicted tokens
    total_tokens = 0  # store the total number of tokens

    # map from index to tag
    index_to_tag = {v: k for k, v in tag_to_index.items()}

    # list to store all predictions and true tags
    all_preds = []
    all_true = []

    with torch.no_grad():
        for inputs, targets, mask in dataloader:
            # get predictions from the model (list of lists)
            preds = model(inputs, mask=mask)

            # iterate through each sample in the batch
            for i in range(inputs.size(0)):
                # mask for current sample to handle padding
                sample_mask = mask[i]

                # true tags for the current sample (filtered by mask)
                true_tags = targets[i][sample_mask]

                # predicted tags (converted from list to tensor)
                pred_tags = torch.tensor(preds[i][:len(true_tags)], device=inputs.device)

                # calculate correct predictions (count matching tags)
                total_correct += (pred_tags == true_tags).sum().item()
                total_tokens += len(true_tags)

                # collect predictions and true tags for precision, recall, and F1 score calculation
                all_preds.extend(pred_tags.cpu().numpy())
                all_true.extend(true_tags.cpu().numpy())

    # calculate token accuracy: correct predictions / total tokens
    accuracy = total_correct / total_tokens
    print(f"Token Accuracy: {accuracy:.4f}")

    # compute Precision, Recall, and F1 Score with weighted average

    precision = precision_score(all_true, all_preds, average='weighted', zero_division=1)
    recall = recall_score(all_true, all_preds, average='weighted', zero_division=1)
    f1 = f1_score(all_true, all_preds, average='weighted', zero_division=1)

    # print the result
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1  # return the calculated metrics


In [420]:
# ensure test_loader is properly defined:
test_loader = DataLoader(
    TensorDataset(X_test_tensor, y_test_tensor, test_mask),
    batch_size=64
)

# run evaluation
results = evaluate_model(model, test_loader, tag_to_index)

Token Accuracy: 0.9204
Precision: 0.9146
Recall: 0.9204
F1 Score: 0.9148


In [421]:
# check an example
test_sentence = ["Apple", "Inc.", "is", "based", "in", "California"]
# apply the predict function
prediction = predict(test_sentence, model, word_to_index, tag_to_index)

# print results
print("Predicted Tags:")
for word, tag in prediction:
    print(f"{word}: {tag}")

Predicted Tags:
Apple: O
Inc.: O
is: O
based: O
in: O
California: B-geo
