In [2]:
from datasets import load_dataset
from tqdm import tqdm
import torch
import sys
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the "english_v12" configuration of the conll2012_ontonotesv5 dataset
# (OntoNotes 5.0, English portion). Files are cached under ./dataset/ontonotes.
ontonotes_load_options = {
    "path": "conll2012_ontonotesv5",
    "name": "english_v12",
    "cache_dir": "./dataset/ontonotes",
}
ontonotes = load_dataset(**ontonotes_load_options)

In [26]:
def generate_span_labels(words, named_entities, span_length=6):
    """
    Slide a fixed-size window over a token sequence and label each window.

    A window is labelled 1 when at least one named entity both starts and
    ends inside it, and 0 otherwise (no entity, or only truncated entities).

    Tag encoding expected in ``named_entities`` (integer BIO-style ids):
        0            -> token is outside any entity
        odd  (> 0)   -> first token of an entity
        even (> 0)   -> continuation token of the current entity

    Args:
        words: List of words in the sentence/document.
        named_entities: List of integer entity tags aligned with ``words``.
        span_length: Fixed length of the spans to generate (expected to be a
            positive integer).

    Returns:
        List of tuples ``(span_words, label)``, one per window start position,
        where ``span_words`` is ``words[i:i + span_length]`` and ``label`` is
        1 (contains a complete entity) or 0. The list is empty when the input
        is shorter than ``span_length``.
    """
    total_tags = len(named_entities)

    # Map every entity start index to the exclusive end index of that entity.
    # The end is the first position after the start whose tag is not a
    # continuation (i.e. is 0, a new entity start, or the end of the input).
    entity_ends = {}
    for start, tag in enumerate(named_entities):
        if tag > 0 and tag % 2 == 1:
            end = start + 1
            while (
                end < total_tags
                and named_entities[end] > 0
                and named_entities[end] % 2 == 0
            ):
                end += 1
            entity_ends[start] = end

    spans_and_labels = []

    # Only generate spans of exactly span_length
    for i in range(len(words) - span_length + 1):
        span_end = i + span_length

        # An entity is complete within the window only if it starts at or
        # after i and its own end does not run past span_end. Checking each
        # entity's real end (rather than just the tag after the window) keeps
        # short entities early in the window from being rejected because a
        # different entity happens to straddle the right edge.
        has_complete_entity = any(
            position in entity_ends and entity_ends[position] <= span_end
            for position in range(i, span_end)
        )

        label = 1 if has_complete_entity else 0
        spans_and_labels.append((words[i:span_end], label))

    return spans_and_labels

In [28]:
SPAN_LENGTH = 6  # number of tokens in every generated span
span_rows, span_targets = [], []

for doc in tqdm(ontonotes['train'], desc="Processing Documents"):
    sentences = doc['sentences']

    # Flatten the document: spans are taken over the concatenation of all of
    # its sentences, so a span may cross a sentence boundary.
    words = [token for sentence in sentences for token in sentence['words']]
    named_entities = [tag for sentence in sentences for tag in sentence['named_entities']]

    # One (span, label) pair per window position in the flattened document.
    for span_words, label in generate_span_labels(words, named_entities, SPAN_LENGTH):
        span_rows.append(span_words)
        span_targets.append(label)

# Convert the accumulated Python lists to arrays and persist them together.
X = np.array(span_rows)
y = np.array(span_targets)

np.savez("ner_trigger_dataset.npz", X=X, y=y)
print(f"\nSaved dataset with {len(X)} examples to ner_trigger_dataset.npz")

Processing Documents: 100%|██████████| 10539/10539 [00:09<00:00, 1170.13it/s] 
Processing Documents: 100%|██████████| 10539/10539 [00:09<00:00, 1170.13it/s] 



Saved dataset with 2148223 examples to ner_trigger_dataset.npz


In [29]:
# Tell the reader where to fetch the generated archive, since it is not checked into git.
dataset_location_notice = "ner_trigger_dataset.npz is too large for git, therefore it's available online at https://drive.google.com/drive/folders/1ykTaDLdHIEmZQYN0b1Hr9hkOYjgMshSa?usp=sharing"
print(dataset_location_notice)

ner_trigger_dataset.npz is too large for git, therefore it's available online at https://drive.google.com/drive/folders/1ykTaDLdHIEmZQYN0b1Hr9hkOYjgMshSa?usp=sharing
