# Scratch notebook: builds per-prefix BERT embeddings and entity-completion labels from OntoNotes

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

In [None]:
# Load the OntoNotes v5 dataset (CoNLL-2012 release, English v12) and keep the training split
ontonotes = load_dataset("conll2012_ontonotesv5", "english_v12")
train_data = ontonotes["train"]

# Load BERT tokenizer and encoder; eval() switches the model to inference mode
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# NER label list, indexed by the integer class ids found in each sentence's
# "named_entities" field. The ORDER matters: position i must be the name of
# class id i. The order below follows the label mapping published on the
# conll2012_ontonotesv5 dataset card.
# NOTE(review): confirm against the loaded dataset's ClassLabel names.
label_list = [
    "O",
    "B-PERSON", "I-PERSON",
    "B-NORP", "I-NORP",
    "B-FAC", "I-FAC",
    "B-ORG", "I-ORG",
    "B-GPE", "I-GPE",
    "B-LOC", "I-LOC",
    "B-PRODUCT", "I-PRODUCT",
    "B-DATE", "I-DATE",
    "B-TIME", "I-TIME",
    "B-PERCENT", "I-PERCENT",
    "B-MONEY", "I-MONEY",
    "B-QUANTITY", "I-QUANTITY",
    "B-ORDINAL", "I-ORDINAL",
    "B-CARDINAL", "I-CARDINAL",
    "B-EVENT", "I-EVENT",
    "B-WORK_OF_ART", "I-WORK_OF_ART",
    "B-LAW", "I-LAW",
    "B-LANGUAGE", "I-LANGUAGE",
]


In [None]:

def extract_entities(words, ner_tags):
    """
    Compute one binary label per word marking where a BIO entity is completed.

    A position gets label 1 when it is the final token of an entity span
    (a "B-X" tag followed by zero or more "I-X" tags of the same type) AND the
    (tokens, type) pair has not already been completed earlier in the same
    call. Every other position gets 0. An "I-" tag that does not continue an
    open span of the same type is ignored, just like "O".

    Args:
        words: Tokens of one sentence.
        ner_tags: BIO tag strings, indexed in parallel with `words`.

    Returns:
        A list of 0/1 ints with one entry per word.
    """
    emitted = set()      # (token tuple, type) pairs that already produced a 1
    labels = []

    span_tokens = []     # tokens of the entity currently being built
    span_type = None     # its type, or None when no entity is open
    tag_count = len(ner_tags)

    for pos, token in enumerate(words):
        tag = ner_tags[pos]
        # Look one tag ahead; past the end of the tags behaves like "O".
        following = ner_tags[pos + 1] if pos + 1 < tag_count else 'O'

        opens = tag.startswith("B-")
        continues = (not opens) and tag.startswith("I-") and span_type == tag[2:]

        if not (opens or continues):
            # "O", or an "I-" tag with no matching open span: drop any state.
            span_tokens, span_type = [], None
            labels.append(0)
            continue

        if opens:
            span_tokens, span_type = [token], tag[2:]
        else:
            span_tokens.append(token)

        flag = 0
        span_goes_on = following.startswith("I-") and following[2:] == span_type
        if not span_goes_on:
            # This token closes the entity; only its first occurrence scores.
            key = (tuple(span_tokens), span_type)
            if key not in emitted:
                emitted.add(key)
                flag = 1
            span_tokens, span_type = [], None

        labels.append(flag)

    return labels



def process_sentence(words, ner_ids, *, verbose=True):
    """
    Build one (embedding, label) pair for every prefix of a sentence.

    For timestep i (1-based) the first i words are joined with single spaces,
    run through the module-level `tokenizer` and `model`, and the hidden state
    at token position 0 of the last layer is kept as the feature vector. The
    label for that timestep is taken from `extract_entities`.

    Args:
        words: Tokens of one sentence.
        ner_ids: Integer NER class ids, one per word, used to index `label_list`.
        verbose: When True (the default, which matches the previous behaviour)
            print the prefix, its tags and its label at every timestep.

    Returns:
        (x_list, y_list): parallel lists holding one NumPy vector and one 0/1
        label per word.
    """
    ner_tags = [label_list[i] for i in ner_ids]

    # Labels for every timestep are computed once for the whole sentence.
    timestep_labels = extract_entities(words, ner_tags)

    x_list, y_list = [], []

    with torch.no_grad():
        for i, y in enumerate(timestep_labels, start=1):
            partial = words[:i]

            # Keep the inputs on the same device as the model so the forward
            # pass also works after the model has been moved to a GPU.
            inputs = tokenizer(" ".join(partial), return_tensors="pt").to(model.device)
            outputs = model(**inputs)

            # .numpy() returns a view that shares memory with the full
            # hidden-state tensor; copy the vector so that tensor can be freed
            # instead of being kept alive by every stored sample.
            cls = outputs.last_hidden_state[0, 0].cpu().numpy().copy()

            x_list.append(cls)
            y_list.append(y)

            if verbose:
                print(f"Timestep {i}")
                print(f"Partial sentence: {' '.join(partial)}")
                print(f"NER tags so far: {ner_tags[:i]}")
                print(f"Label Y: {y}")
                print("-" * 60)

    return x_list, y_list


  torch.utils._pytree._register_pytree_node(


In [3]:
# Report how many examples the training split contains.
print(len(train_data))

10539


In [None]:
# Embed every sentence prefix in the training split, gathering the feature
# vectors in X_all and the matching binary labels in Y_all.
X_all = []
Y_all = []

for ex in tqdm(train_data, desc="Test run"):
    for sentence in ex["sentences"]:
        x_seq, y_seq = process_sentence(sentence["words"], sentence["named_entities"])
        X_all += x_seq
        Y_all += y_seq

print(f" Collected {len(X_all)} samples.")

# Store both arrays in a single .npz archive under the keys "X" and "Y".
X_array, Y_array = np.array(X_all), np.array(Y_all)
np.savez("ontonotes_embeddings.npz", X=X_array, Y=Y_array)
print("Saved to ontonotes_embeddings.npz")

Test run:   0%|          | 0/1 [00:00<?, ?it/s]

Timestep 1
Partial sentence: What
NER tags so far: ['O']
Label Y: 0
------------------------------------------------------------
Timestep 2
Partial sentence: What kind
NER tags so far: ['O', 'O']
Label Y: 0
------------------------------------------------------------
Timestep 3
Partial sentence: What kind of
NER tags so far: ['O', 'O', 'O']
Label Y: 0
------------------------------------------------------------
Timestep 4
Partial sentence: What kind of memory
NER tags so far: ['O', 'O', 'O', 'O']
Label Y: 0
------------------------------------------------------------
Timestep 5
Partial sentence: What kind of memory ?
NER tags so far: ['O', 'O', 'O', 'O', 'O']
Label Y: 0
------------------------------------------------------------
Timestep 1
Partial sentence: We
NER tags so far: ['O']
Label Y: 0
------------------------------------------------------------
Timestep 2
Partial sentence: We respectfully
NER tags so far: ['O', 'O']
Label Y: 0
-------------------------------------------------

Test run: 100%|██████████| 1/1 [04:45<00:00, 285.79s/it]

Timestep 6
Partial sentence: This case will not be prosecuted
NER tags so far: ['O', 'O', 'O', 'O', 'O', 'O']
Label Y: 0
------------------------------------------------------------
Timestep 7
Partial sentence: This case will not be prosecuted .
NER tags so far: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
Label Y: 0
------------------------------------------------------------
 Collected 4579 samples.
Saved to ontonotes_embeddings.npz



