### Import Libraries

In [None]:
import os
import requests
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import torch
import torch.nn as nn
import torch.optim as optim
# from torchcrf import CRF
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np


### Data Downloading

In [None]:
# Download file from Google Drive link and extract it
def download_and_extract_zip(drive_link, save_path="dataset.zip", extract_to="dataset"):

    # get the file ID from the Google Drive link
    file_id = drive_link.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # download the file
    response = requests.get(download_url, stream=True)
    if response.status_code == 200:
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print("Download complete")

    # extract the ZIP file
    with zipfile.ZipFile(save_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Extraction complete")

In [None]:
# Load data from the extracted file, handling different encodings
def load_data(file_path):
    
    encodings = ["utf-8", "ISO-8859-1", "windows-1252"]

    for enc in encodings:
        try:
            with open(file_path, "r", encoding=enc) as file:
                lines = file.readlines()
            print(f"File loaded successfully using encoding: {enc}")
            return lines
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {enc}")

    raise ValueError("Unable to read file")


In [None]:
# provide google drive link, extract and take the dataset
drive_link = "https://drive.google.com/file/d/1AhESU6mPPW_UeRG4y2ojlAuH8Vyf4NlB/view?usp=sharing"

download_and_extract_zip(drive_link)
dataset_path = "dataset/GMB_dataset.txt"

data = load_data(dataset_path)

Download complete.
Extraction complete.
Failed to load with encoding: utf-8, trying next...
File loaded successfully using encoding: ISO-8859-1


### Data Preprocessing

In [None]:
with open("dataset/GMB_dataset.txt", "r", encoding="ISO-8859-1") as file:
    lines = file.readlines()

# split lines into columns 
data = [line.strip().split() for line in lines if line.strip()]  # remove empty lines

# convert data to DataFrame
df = pd.DataFrame(data, columns=["Index", "Sentence #", "Word", "POS", "Tag"])

# remove the first row as the header row from file
df = df.iloc[1:].reset_index(drop=True)

# check
df

Unnamed: 0,Index,Sentence #,Word,POS,Tag
0,0,1.0,Thousands,NNS,O
1,1,1.0,of,IN,O
2,2,1.0,demonstrators,NNS,O
3,3,1.0,have,VBP,O
4,4,1.0,marched,VBN,O
...,...,...,...,...,...
66156,66156,2999.0,be,VB,O
66157,66157,2999.0,announced,VBN,O
66158,66158,2999.0,within,IN,B-tim
66159,66159,2999.0,days,NNS,O


In [None]:
# print unique tag
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', None, 'B-nat',
       'B-eve', 'I-eve', 'I-nat'], dtype=object)

In [None]:
print(df.isnull().sum())  # check if any columns have missing values

# fill missing values
df = df.fillna(method = 'ffill')

# check for missing values after processing
print("Missing values per column:\n", df.isnull().sum())

# print few head rows
print(df.head())

Index         0
Sentence #    0
Word          0
POS           0
Tag           1
dtype: int64
Missing values per column:
 Index         0
Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64
  Index Sentence #           Word  POS Tag
0     0        1.0      Thousands  NNS   O
1     1        1.0             of   IN   O
2     2        1.0  demonstrators  NNS   O
3     3        1.0           have  VBP   O
4     4        1.0        marched  VBN   O


  df = df.fillna(method = 'ffill')


In [None]:
# group words and tags by sentence
sentences = []
tags = []
sentence_dict = defaultdict(list)

for _, row in df.iterrows():
    sentence_dict[row["Sentence #"]].append((row["Word"], row["Tag"]))

# convert to list of sentences
for sentence_id, values in sentence_dict.items():
    words, ner_tags = zip(*values)
    sentences.append(list(words))
    tags.append(list(ner_tags))

# print the converted lists
print("\nTotal sentences:", len(sentences))
print("first 3 sentences and their tags:")
for i, (sentence, tag_list) in enumerate(zip(sentences[:3], tags[:3])):
    print(f"\nSentence {i+1}:", sentence)
    for word, tag in zip(sentence, tag_list):
        print(f"  Word: {word.ljust(15)} | Tag: {tag}")


Total sentences: 2999
First 3 sentences and their tags:

Sentence 1: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
  Word: Thousands       | Tag: O
  Word: of              | Tag: O
  Word: demonstrators   | Tag: O
  Word: have            | Tag: O
  Word: marched         | Tag: O
  Word: through         | Tag: O
  Word: London          | Tag: B-geo
  Word: to              | Tag: O
  Word: protest         | Tag: O
  Word: the             | Tag: O
  Word: war             | Tag: O
  Word: in              | Tag: O
  Word: Iraq            | Tag: B-geo
  Word: and             | Tag: O
  Word: demand          | Tag: O
  Word: the             | Tag: O
  Word: withdrawal      | Tag: O
  Word: of              | Tag: O
  Word: British         | Tag: B-gpe
  Word: troops          | Tag: O
  Word: from            | Tag: O
  Word

In [None]:
# check length of sentences and tags
print("Sentences count:", len(sentences))
print("Tags count:", len(tags))

Sentences count: 2999
Tags count: 2999


In [None]:
# get unique words and labels from data
words = list(df['Word'].unique())
unique_tags = list(df['Tag'].unique())
# dictionary word:index pair
# word is key and its value is corresponding index
word_to_index = {w : i + 2 for i, w in enumerate(words)}
word_to_index["UNK"] = 1
word_to_index["PAD"] = 0

# dictionary label:index pair
# label is key and value is index.
tag_to_index = {t : i + 1 for i, t in enumerate(unique_tags)}
tag_to_index["PAD"] = 0

idx2word = {i: w for w, i in word_to_index.items()}
idx2tag = {i: w for w, i in tag_to_index.items()}

In [None]:
# check all unique tags in your dataset
all_tags = set(tag for tag_list in tags for tag in tag_list)
print("Unique tags in dataset:", all_tags)

# Check keys in your tag_to_index dictionary
print("Keys in tag_to_index:", tag_to_index.keys())

Unique tags in dataset: {'B-per', 'I-tim', 'B-art', 'I-art', 'B-eve', 'I-gpe', 'I-eve', 'I-geo', 'B-tim', 'I-per', 'I-org', 'B-nat', 'I-nat', 'B-org', 'O', 'B-geo', 'B-gpe'}
Keys in tag_to_index: dict_keys(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat', 'PAD'])


In [None]:
all_tags = sorted(set(tag for tag_list in tags for tag in tag_list))
all_tags.append("PAD")  # add padding token
tag_to_index = {tag: idx for idx, tag in enumerate(all_tags)}

max_len = 100
# convert tags to indices
y = [[tag_to_index[tag] for tag in sentence_tags] for sentence_tags in tags]

# pad sequences
y = pad_sequences(
    maxlen=max_len,
    sequences=y,
    padding="post",
    value=tag_to_index["PAD"]
)

In [None]:
# convert sentences to word indices
X = [[word_to_index.get(word, word_to_index["UNK"]) for word in sentence] for sentence in sentences]

# convert tags to tag indices
y = [[tag_to_index[tag] for tag in tag_list] for tag_list in tags]

# check size
print("Number of samples in X:", len(X))
print("Number of samples in y:", len(y))

Number of samples in X: 2999
Number of samples in y: 2999


In [None]:
# split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Size of training input data : ", X_train.shape)
print("Size of training output data : ", np.array(y_train).shape)
print("Size of testing input data : ", X_test.shape)
print("Size of testing output data : ", np.array(y_test).shape)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2699,) + inhomogeneous part.

### Build Model

### Train Model

### Conclusion

In [49]:
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load your dataset
# df = pd.read_csv("your_data.csv")  # Uncomment if loading from file

# 1. Group words and tags by sentence
sentence_dict = defaultdict(list)
for _, row in df.iterrows():
    sentence_id = row["Sentence #"]
    sentence_dict[sentence_id].append((row["Word"], row["Tag"]))

# 2. Extract aligned sentences and tags
sentences, tags = [], []
for sentence_id, values in sentence_dict.items():
    words, ner_tags = zip(*values)  # Unpack word-tag pairs
    sentences.append(list(words))
    tags.append(list(ner_tags))

# Verify alignment
assert len(sentences) == len(tags), "Sentences and tags mismatch!"
print(f"Total sentences: {len(sentences)} (Tags: {len(tags)})")

# 3. Create vocabulary mappings
# Word vocabulary
all_words = set(word for sentence in sentences for word in sentence)
word_to_index = {
    "PAD": 0,
    "UNK": 1,
    **{word: idx+2 for idx, word in enumerate(all_words)}
}

# Tag vocabulary
all_tags = set(tag for tag_list in tags for tag in tag_list)
tag_to_index = {
    "PAD": 0,
    **{tag: idx+1 for idx, tag in enumerate(all_tags)}
}

# 4. Convert to numerical indices
X = [
    [word_to_index.get(word, word_to_index["UNK"]) for word in sentence]
    for sentence in sentences
]
y = [
    [tag_to_index[tag] for tag in tag_list]
    for tag_list in tags
]

# 5. Split dataset FIRST (before padding)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42
)

# 6. Pad sequences AFTER splitting
max_len = max(len(s) for s in X_train)  # Or set your preferred max length

X_train = pad_sequences(
    X_train, maxlen=max_len, padding="post", value=word_to_index["PAD"]
)
X_test = pad_sequences(
    X_test, maxlen=max_len, padding="post", value=word_to_index["PAD"]
)

y_train = pad_sequences(
    y_train, maxlen=max_len, padding="post", value=tag_to_index["PAD"]
)
y_test = pad_sequences(
    y_test, maxlen=max_len, padding="post", value=tag_to_index["PAD"]
)

# 7. Final verification
print("\nFinal shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print("\nSample sentence:", sentences[0])
print("Mapped indices:", X_train[0])
print("Sample tags:", tags[0])
print("Mapped tags:", y_train[0])

Total sentences: 2999 (Tags: 2999)

Final shapes:
X_train: (2699, 70), y_train: (2699, 70)
X_test: (300, 70), y_test: (300, 70)

Sample sentence: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
Mapped indices: [4058 8052 2420  777  896  667 4196 2186 1819 5455  985 1819 4638 6619
 5872 5131 7613 6750  804 7042 3476 4830   50 7028 6480 3892 1129  490
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
Sample tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
Mapped tags: [15 15 15 15 15 15 15 15 15  1 10 15 15 15 14 11 15 15  1 15 15 15 15 15
  9  2 15 15  0  0  0  0  0  0 

In [50]:
!pip install pytorch-crf  # Correct package name  # Run this in Google Colab or your environment



In [51]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,  # Bidirectional split
            bidirectional=True,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, tags=None, mask=None):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        emissions = self.fc(x)

        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask)
            return loss
        else:
            return self.crf.decode(emissions, mask=mask)

# Initialize model
vocab_size = len(word_to_index)  # From preprocessing
num_tags = len(tag_to_index)     # From preprocessing
model = BiLSTM_CRF(
    vocab_size=vocab_size,
    embedding_dim=100,
    hidden_dim=50,
    num_tags=num_tags
)

In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchcrf import CRF

# Assuming you have these from preprocessing
# X_train_padded, y_train_padded (numpy arrays)
# word_to_index, tag_to_index (dictionaries)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# Create mask tensor (1 for real tokens, 0 for padding)
mask = (X_train_tensor != word_to_index["PAD"]).bool()

# Create dataset and dataloader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor, mask)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define model parameters
vocab_size = len(word_to_index)
num_tags = len(tag_to_index)
embedding_dim = 100
hidden_dim = 50

# Initialize model, loss and optimizer
model = BiLSTM_CRF(vocab_size, embedding_dim, hidden_dim, num_tags)
optimizer = optim.RMSprop(model.parameters(), lr=0.001)

# Training loop
num_epochs = 15

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (inputs, targets, mask_batch) in enumerate(train_loader):
        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        loss = model(inputs, tags=targets, mask=mask_batch)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print batch progress
        if batch_idx % 10 == 0:
            print(f'Epoch: {epoch+1:02} | Batch: {batch_idx:03} | Loss: {loss.item():.3f}')

    # Print epoch statistics
    avg_loss = total_loss / len(train_loader)
    print(f'\nEpoch: {epoch+1:02} | Average Loss: {avg_loss:.3f}\n')

# Save model
torch.save(model.state_dict(), "bilstm_crf_ner_model.pt")

# Test prediction example
def predict(sentence, model, word_to_index, tag_to_index):
    # Convert sentence to indices
    indexed_sentence = [word_to_index.get(word, word_to_index["UNK"]) for word in sentence]

    # Convert to tensor and add batch dimension
    inputs = torch.tensor([indexed_sentence], dtype=torch.long)
    mask = (inputs != word_to_index["PAD"]).bool()

    # Predict
    model.eval()
    with torch.no_grad():
        tags = model(inputs, mask=mask)

    # Convert indices to tags
    index_to_tag = {v: k for k, v in tag_to_index.items()}
    return [(word, index_to_tag[tag]) for word, tag in zip(sentence, tags[0])]

# Test with sample input
test_sentence = ["EU", "rejects", "German", "call"]
print("\nSample prediction:", predict(test_sentence, model, word_to_index, tag_to_index))

Epoch: 01 | Batch: 000 | Loss: 1852.913
Epoch: 01 | Batch: 010 | Loss: 600.430
Epoch: 01 | Batch: 020 | Loss: 571.082
Epoch: 01 | Batch: 030 | Loss: 459.790
Epoch: 01 | Batch: 040 | Loss: 474.414
Epoch: 01 | Batch: 050 | Loss: 473.104
Epoch: 01 | Batch: 060 | Loss: 432.323
Epoch: 01 | Batch: 070 | Loss: 450.706
Epoch: 01 | Batch: 080 | Loss: 372.644

Epoch: 01 | Average Loss: 559.420

Epoch: 02 | Batch: 000 | Loss: 467.115
Epoch: 02 | Batch: 010 | Loss: 388.867
Epoch: 02 | Batch: 020 | Loss: 469.439
Epoch: 02 | Batch: 030 | Loss: 317.274
Epoch: 02 | Batch: 040 | Loss: 386.415
Epoch: 02 | Batch: 050 | Loss: 359.081
Epoch: 02 | Batch: 060 | Loss: 358.313
Epoch: 02 | Batch: 070 | Loss: 455.720
Epoch: 02 | Batch: 080 | Loss: 355.959

Epoch: 02 | Average Loss: 372.726

Epoch: 03 | Batch: 000 | Loss: 346.928
Epoch: 03 | Batch: 010 | Loss: 353.975
Epoch: 03 | Batch: 020 | Loss: 226.234
Epoch: 03 | Batch: 030 | Loss: 264.546
Epoch: 03 | Batch: 040 | Loss: 356.678
Epoch: 03 | Batch: 050 | Loss:

In [53]:
# Convert test data to tensors
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_mask = (X_test_tensor != word_to_index["PAD"]).bool()

# # Create test DataLoader
# test_dataset = TensorDataset(X_test_tensor, y_test_tensor, test_mask)
# test_loader = DataLoader(test_dataset, batch_size=32)

In [55]:
def evaluate(model, dataloader, tag_to_index):
    model.eval()
    total_correct = 0
    total_tokens = 0

    index_to_tag = {v: k for k, v in tag_to_index.items()}

    with torch.no_grad():
        for inputs, targets, mask in dataloader:
            # Get predictions (list of lists)
            preds = model(inputs, mask=mask)

            # Iterate through each sample in batch
            for i in range(inputs.size(0)):
                # Mask for current sample
                sample_mask = mask[i]

                # True tags (already indices)
                true_tags = targets[i][sample_mask]

                # Predicted tags (convert from list to tensor)
                pred_tags = torch.tensor(preds[i][:len(true_tags)], device=inputs.device)

                # Calculate correct predictions
                total_correct += (pred_tags == true_tags).sum().item()
                total_tokens += len(true_tags)

    accuracy = total_correct / total_tokens
    print(f"Token Accuracy: {accuracy:.4f}")
    return accuracy

In [56]:
# Ensure test_loader is properly defined:
test_loader = DataLoader(
    TensorDataset(X_test_tensor, y_test_tensor, test_mask),
    batch_size=64
)

# Run evaluation
results = evaluate(model, test_loader, tag_to_index)

Token Accuracy: 0.9325


In [57]:
test_sentence = ["Apple", "Inc.", "is", "based", "in", "California"]
# Use the predict function
prediction = predict(test_sentence, model, word_to_index, tag_to_index)

# Print results
print("Predicted Tags:")
for word, tag in prediction:
    print(f"{word}: {tag}")

Predicted Tags:
Apple: B-per
Inc.: I-per
is: O
based: O
in: O
California: B-geo
