# Confidence Score Baseline

This notebook implements the confidence score (approach 1) baseline for near real-time Named Entity Recognition (NER).

## 1. Setup and imports

In [2]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset
from tqdm import tqdm
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Add the relative "src" directory to the module search path so the following
# import can find `utils` (presumably src/utils.py -- verify the kernel's working
# directory is the project root, since the path is relative).
sys.path.append("src")
from utils import convert_ids_to_bio

## 2. Load OntoNotes dataset

In [5]:
# Fetch the "english_v12" configuration of the conll2012_ontonotesv5 dataset
# (English portion of OntoNotes 5.0); files are cached under ./dataset/ontonotes.
ontonotes = load_dataset(
    path="conll2012_ontonotesv5",
    name="english_v12",
    cache_dir="./dataset/ontonotes",
)

# Report which splits were returned.
available_splits = ontonotes.keys()
print(f"Dataset loaded with splits: {available_splits}")

Dataset loaded with splits: dict_keys(['train', 'validation', 'test'])


## 3. Preprocessing
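Here we create all window prefixes — every incremental word-level prefix of each sentence, paired with that sentence's gold BIO tags — and pre-compute the CLS token over the OntoNotes test split.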

In [12]:
# Build, for every sentence in the test split, the list of its incremental
# word-level prefixes together with the tag sequence for the whole sentence.
#
# Results:
#   prefixes     -- list of (true_bio, sentence_prefixes) tuples, one per sentence;
#                   sentence_prefixes[i] holds the first i + 1 words of the sentence
#                   and true_bio is whatever convert_ids_to_bio returns for the
#                   sentence's 'named_entities' field.
#   prefix_count -- total number of prefixes across all sentences.
prefixes = []
prefix_count = 0

# Iterate through the test split
for doc in tqdm(ontonotes["test"], desc="Processing documents", unit="doc"):
    sentences = doc['sentences']
    # Sometimes the sentences come back as a list of lists, so flatten them.
    # The emptiness check avoids an IndexError on a document with no sentences.
    if isinstance(sentences, list) and sentences and isinstance(sentences[0], list):
        sentences = [sentence for sublist in sentences for sentence in sublist]

    for sentence in sentences:
        words = list(sentence['words'])
        # Each slice is a fresh list, so the stored prefixes never alias each other.
        sentence_prefixes = [words[:end] for end in range(1, len(words) + 1)]
        prefix_count += len(sentence_prefixes)

        true_bio = convert_ids_to_bio(sentence['named_entities'])

        # Store the sentence's tags alongside all of its prefixes
        prefixes.append((true_bio, sentence_prefixes))

print(f"Total prefixes created: {prefix_count}")
if prefixes:
    example_bio, example_prefixes = prefixes[0]
    # Join outside the f-string: reusing the enclosing quote character and a
    # backslash inside a replacement field is only valid on Python 3.12+.
    example_lines = "\n".join(str(prefix) for prefix in example_prefixes)
    print(f"Example prefix: {example_bio}\n{example_lines}")

Processing documents: 100%|██████████| 1200/1200 [00:00<00:00, 1336.49doc/s]

Total prefixes created: 230118
Example prefix: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['--']
['--', 'basically']
['--', 'basically', ',']
['--', 'basically', ',', 'it']
['--', 'basically', ',', 'it', 'was']
['--', 'basically', ',', 'it', 'was', 'unanimously']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various', 'relevant']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various', 'relevant', 'parties']
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various'


