# Window size model (approach 2) Baseline

This notebook implements the sliding window baseline for near real-time Named Entity Recognition (NER).

## 1. Setup and preparation

First, let's import the necessary libraries and set up our environment.

In [33]:
%pip install datasets transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [34]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import random
from tqdm import tqdm
import numpy as np
import sys

In [35]:
sys.path.append("src")
from utils import convert_ids_to_bio

## 2. Loading Dataset

In [36]:
# Load the English portion of OntoNotes 5.0
ontonotes = load_dataset(
    "conll2012_ontonotesv5",
    "english_v12",
    cache_dir="./dataset/ontonotes",
)
print(f"Dataset loaded with splits: {ontonotes.keys()}")

Dataset loaded with splits: dict_keys(['train', 'validation', 'test'])


## 3. Creating all window sizes of size 6 accross test split

In [37]:
windows = []
bio_windows = []
SPAN_LENGTH = 6

# Iterate through the test split
for doc in ontonotes["test"]:
    curr_window = []
    curr_bio_window = []
    # Fix: Sometimes doc['sentences'] is a list of lists, so we need to flatten it
    if isinstance(doc['sentences'], list) and isinstance(doc['sentences'][0], list):
        doc['sentences'] = [sentence for sublist in doc['sentences'] for sentence in sublist]
    for sentence in doc["sentences"]:
        for idx, word in enumerate(sentence['words']):
            curr_window.append(word)
            curr_bio_window.append(sentence['named_entities'][idx])
            # If the current window reaches the defined span length, add it to the list
            if len(curr_window) == SPAN_LENGTH:
                windows.append(curr_window.copy())
                bio_windows.append(convert_ids_to_bio(curr_bio_window))
                # Slide the window by one position
                curr_window = curr_window[1:]
                curr_bio_window = curr_bio_window[1:]

print(f"Total windows created: {len(windows)}")
ix = random.randint(0, len(windows) - 1)
print(f"Example window: {windows[ix]}, BIO: {bio_windows[ix]}")


Total windows created: 224128
Example window: ['equipment', 'and', 'the', 'engineers', 'and', 'they'], BIO: ['O', 'O', 'O', 'O', 'O', 'O']


In [38]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModel.from_pretrained("dslim/bert-base-NER")

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.mps.is_available():
    device = 'mps'
print(f"Using device: {device}")

bert_model.to(device)
bert_model.eval()

Using device: mps


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [39]:
# Calculate all embeddings for the windows in a batch-wise manner
batch_size = 128  # Adjust batch size based on your GPU memory
embeddings = []
for i in tqdm(range(0, len(windows), batch_size), desc="Computing CLS token for windows"):
    batch = windows[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = bert_model(**inputs)
    
    cls_token = outputs.last_hidden_state[:, 0, :]
    embeddings.append(cls_token.cpu().numpy())

embeddings = np.concatenate(embeddings, axis=0)

Computing CLS token for windows: 100%|██████████| 1751/1751 [01:56<00:00, 15.07it/s]


In [42]:
np.savez("data/ner_trigger_dataset_test_embeddings.npz", windows=windows, bio_windows=bio_windows, embeddings=embeddings)

In [43]:
data = np.load("data/ner_trigger_dataset_test_embeddings.npz")
windows, bio_windows, embeddings = data['windows'], data['bio_windows'], data['embeddings']
print(f"Shape of data: {windows.shape}; {bio_windows.shape}; {embeddings.shape}")

Shape of data: (224128, 6); (224128, 6); (224128, 768)
