# **Using Encoder-only LLM to perform Named Entity Extraction**

**Author: Partha Seetala**

Video Tutorial: [https://youtu.be/UJZ4HGLnSMU](https://youtu.be/UJZ4HGLnSMU)

# **Import required modules**

In [None]:
import os
os.environ["WANDB_MODE"] = "offline"   # disable W&B prompts

import torch
import torch.nn.functional as F
import numpy as np
from datasets import Dataset
from datasets import load_dataset
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import random
import json
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
import ast

# **Mount Google Drive in Colab to load and save our models**

In [None]:
from google.colab import drive
import os
from pathlib import Path

# Mount Google Drive first: every path configured below lives under the mount point.
drive.mount('/content/drive')

# Name of this use case; used as a sub-directory of the fine-tuned model path.
USECASE_NAME = "s3e2-extract-entities"

# File names of the two supported NER datasets (resolved against DATASET_DIR later).
mit_ner_data = "mit-ner-dataset.csv"
wikipedia_ner_data = "wikipedia-ner-dataset.tsv"

DATASET_TYPE = "wikipedia" # "mit" | "wikipedia"

# Pick the dataset file for the selected type; fail fast on an unknown type.
if DATASET_TYPE == "mit":
    selected_dataset = mit_ner_data
elif DATASET_TYPE == "wikipedia":
    selected_dataset = wikipedia_ner_data
else:
    raise ValueError(f"Unknown dataset type: {DATASET_TYPE}")

# Dataset file name without its extension; used in the fine-tuned model path.
DATASET_NAME = Path(selected_dataset).stem

# Identifier of the pretrained encoder, and the token length passed to the dataset-preparation step.
MODEL_NAME = "google/electra-base-discriminator" # "bert-base-uncased"
max_seq_len = 128

# Root of all notebook artefacts on the mounted Drive.
ROOTDIR = '/content/drive/MyDrive/cidl'

HF_TOKEN_FILEPATH = os.path.join(ROOTDIR, "hf.token")
DATASET_DIR = os.path.join(ROOTDIR, 'datasets')  # ~/cidl/datasets
PRETRAINED_MODEL_DIR = os.path.join(ROOTDIR, 'models', 'pretrained', 'bert')  # ~/cidl/models/pretrained/bert
FINETUNED_MODEL_DIR = os.path.join(ROOTDIR, 'models', 'finetuned', 'bert', USECASE_NAME, DATASET_NAME)  # ~/cidl/models/finetuned/bert/<usecase-name>/<dataset-name>

def load_hugging_face_token(hf_token_filepath=HF_TOKEN_FILEPATH):
    """Export the Hugging Face token stored in a file as the HF_TOKEN env variable.

    Best-effort: a missing file or a token that does not start with "hf_" is
    reported with a printed message and otherwise ignored.

    Args:
        hf_token_filepath: Path of the text file holding the token.
    """
    if not os.path.exists(hf_token_filepath):
        print(f"Hugging Face token file not found at: {hf_token_filepath}")
        return

    with open(hf_token_filepath, "r") as token_file:
        token = token_file.read().strip()

    if not token.startswith("hf_"):
        print(f"Malformed Hugging Face token file at: {hf_token_filepath}")
        return

    # Only the environment variable is set; no explicit hub login is performed.
    os.environ["HF_TOKEN"] = token

# Create the working directories up front so later cells can read/write them.
for dirpath in [DATASET_DIR, PRETRAINED_MODEL_DIR, FINETUNED_MODEL_DIR]:
    os.makedirs(dirpath, exist_ok=True)

# Export HF_TOKEN from the token file if one is present (prints a message otherwise).
load_hugging_face_token(HF_TOKEN_FILEPATH)

# Echo the resolved locations for the reader.
print("Dataset directory .................................... ", DATASET_DIR)
print("Location where pretrained model will be downloaded ... ", PRETRAINED_MODEL_DIR)
print("Location where finetuned model will be stored ........ ", FINETUNED_MODEL_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset directory ....................................  /content/drive/MyDrive/cidl/datasets
Location where pretrained model will be downloaded ...  /content/drive/MyDrive/cidl/models/pretrained/bert
Location where finetuned model will be stored ........  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-extract-entities/wikipedia-ner-dataset


# **STEP 1: Build our custom Entity Extraction Model**

In [None]:
class ExtractEntityModel(torch.nn.Module):
    """Pretrained encoder with a token-level classification head for NER.

    The encoder's last hidden state is fed, token by token, through
    Linear(D, D) -> ReLU -> Linear(D, N), where D is the encoder hidden size
    and N is ``num_ner_tags``.
    """

    # File names written by save_pretrained() and read back by from_pretrained().
    HEAD_WEIGHTS_FILE = "custom_classifier_heads.bin"
    TAG2ID_FILE = "ner_tag2id.json"
    ID2TAG_FILE = "ner_id2tag.json"

    def __init__(self, model_name, cache_dir, num_ner_tags, device=None):
        """Load the encoder and create a freshly initialised classifier head.

        Args:
            model_name: Hub identifier or local directory of the encoder.
            cache_dir: Cache directory handed to ``AutoModel.from_pretrained``.
            num_ner_tags: Number of output classes (N).
            device: Target device; defaults to CUDA when available, else CPU.
        """
        super().__init__()
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        from transformers import AutoModel
        self.model_name = model_name
        self.num_ner_tags = num_ner_tags  # N

        self.base_model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
        self.base_model = self.base_model.to(device)

        hidden_size = self.base_model.config.hidden_size  # D

        # [SxD] -> Linear(DxD) -> ReLU -> Linear(DxN) => [SxN]
        self.ner_classifier = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, num_ner_tags)
        ).to(device)
        # Label -100 marks positions that must not contribute to the loss.
        self.ner_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and the classifier head.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional tag ids, flattened for the loss; -100 is ignored.

        Returns:
            dict with "loss" (None when ``labels`` is None) and "logits"
            (classifier output for every token position).
        """
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = output.last_hidden_state
        ner_logits = self.ner_classifier(last_hidden)

        loss = None
        if labels is not None:
            # Flatten every token position into one row for cross entropy.
            loss = self.ner_loss_fn(ner_logits.view(-1, ner_logits.shape[-1]), labels.view(-1))

        return {
            "loss": loss,
            "logits": ner_logits
        }

    def save_pretrained(self, save_directory, tag2id=None, id2tag=None):
        """Write encoder, classifier head, config and optional tag maps to disk.

        Args:
            save_directory: Target directory (created if missing).
            tag2id: Optional mapping tag -> id, saved as JSON.
            id2tag: Optional mapping id -> tag, saved as JSON.

        Raises:
            ValueError: If a file that should have been written is missing.
        """
        import copy
        import json
        import os

        os.makedirs(save_directory, exist_ok=True)
        self.base_model.save_pretrained(save_directory)
        torch.save({
            "ner_classifier": self.ner_classifier.state_dict()
        }, os.path.join(save_directory, self.HEAD_WEIGHTS_FILE))

        # Extend a copy of the in-memory encoder config instead of fetching the
        # config from the Hub again: saving then works offline and the live
        # model's config object is left untouched.
        hf_config = copy.deepcopy(self.base_model.config)
        hf_config.model_name = self.model_name
        hf_config.num_ner_tags = self.num_ner_tags
        hf_config.save_pretrained(save_directory)

        expected_files = [self.HEAD_WEIGHTS_FILE]

        if tag2id and id2tag:
            serializable_tag2id = {str(k): int(v) for k, v in tag2id.items()}
            print(serializable_tag2id)
            with open(os.path.join(save_directory, self.TAG2ID_FILE), "w") as f:
                json.dump(serializable_tag2id, f)
            with open(os.path.join(save_directory, self.ID2TAG_FILE), "w") as f:
                # JSON object keys must be strings; from_pretrained() converts back.
                json.dump({str(k): v for k, v in id2tag.items()}, f)
            # Only verify the mapping files when they were actually requested;
            # previously omitting the optional maps always raised ValueError.
            expected_files += [self.TAG2ID_FILE, self.ID2TAG_FILE]

        for filename in expected_files:
            if not os.path.exists(os.path.join(save_directory, filename)):
                raise ValueError(f"Missing file: {filename}")

    @classmethod
    def from_pretrained(cls, load_directory):
        """Rebuild a model saved with :meth:`save_pretrained`.

        Args:
            load_directory: Directory previously passed to ``save_pretrained``.

        Returns:
            Tuple ``(model, info)``; ``model`` is in eval mode and ``info`` has
            the keys "model_name", "num_ner_tags", "id2tag" (int keys) and
            "tag2id". The tag maps are empty dicts if they were never saved.
        """
        import json
        import os
        from transformers import AutoConfig

        hf_config = AutoConfig.from_pretrained(load_directory)

        # Build directly from the saved directory: the fine-tuned encoder is
        # loaded once and placed on the same device as the classifier head,
        # rather than downloading the original weights and swapping them out.
        model = cls(
            model_name=load_directory,
            cache_dir=None,
            num_ner_tags=hf_config.num_ner_tags)
        # Keep reporting the original encoder identifier stored in the config.
        model.model_name = hf_config.model_name

        head_weights = torch.load(
            os.path.join(load_directory, cls.HEAD_WEIGHTS_FILE), map_location="cpu")
        model.ner_classifier.load_state_dict(head_weights["ner_classifier"])

        model.eval()

        id2tag = {}
        id2tag_path = os.path.join(load_directory, cls.ID2TAG_FILE)
        if os.path.exists(id2tag_path):
            with open(id2tag_path) as f:
                id2tag = {int(k): v for k, v in json.load(f).items()}

        tag2id = {}
        tag2id_path = os.path.join(load_directory, cls.TAG2ID_FILE)
        if os.path.exists(tag2id_path):
            with open(tag2id_path) as f:
                tag2id = json.load(f)

        return model, {
            "model_name": hf_config.model_name,
            "num_ner_tags": hf_config.num_ner_tags,
            "id2tag": id2tag,
            "tag2id": tag2id
        }

def download_pretrained_encoder_only_model(model_dir, model_name):
    """Fetch the pretrained tokenizer and encoder into a cache directory.

    Both objects are loaded with ``cache_dir=model_dir`` and then discarded;
    nothing is returned.

    Args:
        model_dir: Directory passed as ``cache_dir`` to both loaders.
        model_name: Model identifier passed to both loaders.
    """
    # Download the pre-trained Encoder-only Transformer model and the pre-trained Tokenizer for that model
    AutoTokenizer.from_pretrained(model_name, cache_dir=model_dir)
    AutoModel.from_pretrained(model_name, cache_dir=model_dir)

def build_entity_extraction_model(model_dir, model_name, num_ner_tags):
    """Create an ExtractEntityModel for ``model_name`` with ``num_ner_tags`` classes.

    Args:
        model_dir: Forwarded as the model's ``cache_dir``.
        model_name: Forwarded as the model's ``model_name``.
        num_ner_tags: Number of output classes of the classifier head.

    Returns:
        The newly constructed ExtractEntityModel.
    """
    return ExtractEntityModel(model_name=model_name, cache_dir=model_dir, num_ner_tags=num_ner_tags)

def finetune_entity_extraction_model(model, model_dir, train_dataset, val_dataset, epochs=3):
    """Fine-tune ``model`` with the Hugging Face Trainer.

    Checkpoints go to ``<model_dir>/checkpoints`` and logs to ``<model_dir>/logs``.
    Evaluation and checkpointing happen once per epoch, and the checkpoint
    selected by ``eval_loss`` is loaded back at the end of training.

    Args:
        model: Model to train; must accept input_ids / attention_mask / labels.
        model_dir: Base directory for checkpoints and logs.
        train_dataset: Rows with "input_ids", "attention_mask" and "ner_tags".
        val_dataset: Same row layout, used for evaluation.
        epochs: Number of training epochs.
    """
    # Batch key expected by the model -> column name in the dataset rows.
    column_for_key = {
        "input_ids": "input_ids",
        "attention_mask": "attention_mask",
        "labels": "ner_tags",
    }

    def collate_fn(batch):
        # Stack each column of the batch into one LongTensor.
        return {
            key: torch.tensor([row[column] for row in batch], dtype=torch.long)
            for key, column in column_for_key.items()
        }

    checkpoint_dir = os.path.join(model_dir, "checkpoints")
    tensorboard_dir = os.path.join(model_dir, "logs")
    for directory in (checkpoint_dir, tensorboard_dir):
        os.makedirs(directory, exist_ok=True)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            run_name="extract-entities",
            output_dir=checkpoint_dir,
            logging_dir=tensorboard_dir,
            logging_steps=10,
            num_train_epochs=epochs,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=min(epochs, 3),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            # Keep "ner_tags": the collator reads it even though forward() has no such argument.
            remove_unused_columns=False,
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collate_fn,
    )
    trainer.train()

def display_entity_extraction_model(model):
    """Print a per-parameter summary table followed by totals.

    Each row shows the parameter name, its shape, its element count and
    whether it requires gradients. The footer reports total, trainable and
    non-trainable counts plus the in-memory size of all parameters.

    Args:
        model: Any object exposing ``named_parameters()``.
    """
    rule = "=" * 115
    print("\n{:<60} {:<20} {:>15} {:>12}".format("Layer (type)", "Shape (S×D)", "Param #", "Trainable"))
    print(rule)

    all_count = 0
    grad_count = 0
    byte_count = 0

    for layer_name, tensor in model.named_parameters():
        count = tensor.numel()
        dims = " × ".join(str(dim) for dim in tensor.shape)
        needs_grad = tensor.requires_grad

        all_count += count
        byte_count += tensor.element_size() * count
        if needs_grad:
            grad_count += count

        print(f"{layer_name:<60} {dims:<20} {count:>15,} {str(needs_grad):>12}")

    mib = byte_count / (1024**2)
    gib = byte_count / (1024**3)

    print(rule)
    print(f"Total Parameters         : {all_count:,}")
    print(f"Trainable Parameters     : {grad_count:,}")
    print(f"Non-trainable Parameters : {all_count - grad_count:,}")
    print(f"Model Size (approx)      : {mib:.2f} MiB  ({gib:.2f} GiB)")

def save_finetuned_entity_extraction_model(tokenizer, model, finetuned_model_dir, tags2id, id2tags):
    """Save the fine-tuned model, its tag maps and the tokenizer to one directory.

    Args:
        tokenizer: Tokenizer exposing ``save_pretrained(directory)``.
        model: Model exposing ``save_pretrained(directory, tag2id, id2tag)``.
        finetuned_model_dir: Destination directory.
        tags2id: Mapping tag -> id, forwarded to the model's ``save_pretrained``.
        id2tags: Mapping id -> tag, forwarded to the model's ``save_pretrained``.
    """
    print("Saving finetuned model to dir: ", finetuned_model_dir)
    model.save_pretrained(finetuned_model_dir, tags2id, id2tags)
    tokenizer.save_pretrained(finetuned_model_dir)

def load_finetuned_entity_extraction_model(finetuned_model_dir):
    """Load the tokenizer and fine-tuned model stored in ``finetuned_model_dir``.

    Args:
        finetuned_model_dir: Directory the model and tokenizer were saved to.

    Returns:
        Tuple ``(tokenizer, model, config)`` where ``model`` and ``config`` are
        the two values returned by ``ExtractEntityModel.from_pretrained``.
    """
    print("Loading model from:", finetuned_model_dir)
    model, config = ExtractEntityModel.from_pretrained(finetuned_model_dir)
    tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
    return tokenizer, model, config

def _collect_entity_spans(tags, offsets, word_ids):
    """Decode per-token BIO tags into ``(label, start, end)`` character spans."""
    spans = []
    label = None
    start = end = None

    def close():
        # Emit the span currently being built, if any.
        nonlocal label
        if label is not None:
            spans.append((label, start, end))
        label = None

    for tag, (tok_start, tok_end), word_idx in zip(tags, offsets, word_ids):
        if word_idx is None or tok_start == tok_end:
            continue  # special / padding token: carries no text

        prefix, sep, tag_label = tag.partition("-")
        if not sep or tag_label == "O":
            # "O", un-prefixed tags and "B-O"/"I-O" style tags are all "outside".
            close()
        elif prefix == "B":
            close()
            label, start, end = tag_label, tok_start, tok_end
        elif prefix == "I" and label == tag_label:
            end = tok_end  # continuation of the open span
        else:
            # "I-" tag that does not continue the open span: drop it.
            close()

    close()
    return spans


def _merge_adjacent_spans(spans, text):
    """Fuse consecutive same-label spans separated by nothing but whitespace."""
    merged = []
    for label, start, end in spans:
        if merged:
            prev_label, prev_start, prev_end = merged[-1]
            if prev_label == label and not text[prev_end:start].strip():
                merged[-1] = (label, prev_start, end)
                continue
        merged.append((label, start, end))
    return merged


def extract_entities_from_text(tokenizer, model, text, id2tag, max_seqlen=128):
    """Run the NER model on ``text`` and group the entities found by type.

    The text is tokenized once (truncated/padded to ``max_seqlen``), the most
    likely tag is taken for every token, BIO tags are decoded into character
    spans, and same-type spans that are adjacent in the text (only whitespace
    between them) are fused into one phrase. Phrases are cut from the original
    text with runs of whitespace collapsed to single spaces.

    Args:
        tokenizer: Fast tokenizer; its output must provide ``word_ids()`` and
            an ``offset_mapping``.
        model: Model whose output dict has a "logits" entry.
        text: Input string.
        id2tag: Mapping predicted id (int) -> tag; unknown ids count as "O".
        max_seqlen: Token length the input is truncated/padded to.

    Returns:
        dict mapping entity type -> list of phrases in order of appearance.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_seqlen,
        return_offsets_mapping=True,
        return_token_type_ids=False
    )

    # The same encoding supplies word ids and offsets; no second tokenizer pass.
    word_ids = encoded.word_ids(0)
    # The offsets are not a model input, so take them out before the forward pass.
    offset_mapping = encoded.pop("offset_mapping")[0].tolist()

    inputs = {k: v.to(device) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Index the single batch row explicitly rather than squeeze(), which would
    # also collapse a length-1 sequence into a scalar.
    predictions = torch.argmax(outputs["logits"], dim=-1)[0].tolist()
    tags = [id2tag.get(p, "O") for p in predictions]

    spans = _collect_entity_spans(tags, offset_mapping, word_ids)

    grouped = {}
    for label, start, end in _merge_adjacent_spans(spans, text):
        phrase = " ".join(text[start:end].split())
        if phrase:
            grouped.setdefault(label, []).append(phrase)
    return grouped


In [None]:
# Fetch the pretrained tokenizer and encoder into the Drive-backed cache directory.
# (The message previously lacked the closing quote after the directory.)
print("Downloading pretrained Encoder-only LLM model '{}' into '{}'".format(MODEL_NAME, PRETRAINED_MODEL_DIR))
download_pretrained_encoder_only_model(model_dir=PRETRAINED_MODEL_DIR, model_name=MODEL_NAME)

Downloading pretrained Encoder-only LLM model 'google/electra-base-discriminator' into '/content/drive/MyDrive/cidl/models/pretrained/bert


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# **STEP 2: Finetune our custom model by training with task-specific training data**

**Prepare Dataset for finetuning our model**

In [None]:
from datasets import Dataset
import pandas as pd
import ast

def parse_tsv_fileformat(filepath):
    """Read a token-per-line TSV file into a DataFrame of sentences.

    Each non-blank line is split on tabs; column 1 is the token and column 2
    its tag, and lines with fewer than three columns are ignored. A blank
    line ends the current sentence.

    Args:
        filepath: Path of the UTF-8 encoded TSV file.

    Returns:
        DataFrame with one row per sentence: "text" is the tokens joined by
        single spaces and "entities" holds one dict per token with the keys
        "text", "type", "start" and "end" (character offsets into "text").
    """
    def to_record(pairs):
        # Build the sentence text and per-token character spans in one pass.
        fragments = []
        spans = []
        cursor = 0  # always equals the length of the text built so far
        for word, label in pairs:
            if cursor:
                fragments.append(" ")
                cursor += 1
            fragments.append(word)
            spans.append({
                "text": word,
                "type": label,
                "start": cursor,
                "end": cursor + len(word)
            })
            cursor += len(word)
        return {"text": "".join(fragments), "entities": spans}

    records = []
    pending = []  # (token, tag) pairs of the sentence being read

    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            if stripped:
                columns = stripped.split("\t")
                if len(columns) >= 3:
                    pending.append((columns[1], columns[2]))
            elif pending:
                records.append(to_record(pending))
                pending = []

    # The file may not end with a blank line: emit the last sentence too.
    if pending:
        records.append(to_record(pending))

    return pd.DataFrame(records)



def parse_csv_fileformat(filepath):
    """Load the CSV dataset at ``filepath`` into a DataFrame, unmodified."""
    return pd.read_csv(filepath)


def prepare_dataset_for_finetuning(dataset_filepath, tokenizer, max_seqlen=128, test_size=0.1, seed=None):
    """Tokenize a NER dataset file and return train/validation splits and tag maps.

    The tokenized dataset and both tag maps are cached in a sibling directory
    named ``<dataset file without extension>-preprocessed`` and reused on
    later calls.

    NOTE(review): the cache is keyed on the dataset path only, so it is reused
    even if ``tokenizer`` or ``max_seqlen`` differ from the run that built it.

    Args:
        dataset_filepath: Path to a ``.tsv`` or ``.csv`` dataset file.
        tokenizer: Fast tokenizer (``word_ids`` of its output is used).
        max_seqlen: Length every example is truncated/padded to.
        test_size: Fraction of examples put into the validation split.
        seed: Optional seed for the split; ``None`` keeps it random per call.

    Returns:
        Tuple ``(train_dataset, val_dataset, tag2id, id2tag)``; ``id2tag`` has
        int keys.

    Raises:
        ValueError: If the file extension is neither ``.tsv`` nor ``.csv``.
    """
    def split(dataset):
        parts = dataset.train_test_split(test_size=test_size, seed=seed)
        return parts["train"], parts["test"]

    # Preprocessed (cached) artefacts live next to the dataset file.
    preprocessed_dir = os.path.splitext(dataset_filepath)[0] + "-preprocessed"
    dataset_cache_file = os.path.join(preprocessed_dir, "dataset.ds")
    id2tag_file = os.path.join(preprocessed_dir, "id2tag.bin")  # JSON content despite the extension
    tag2id_file = os.path.join(preprocessed_dir, "tag2id.bin")  # JSON content despite the extension

    # FAST PATH: reuse the cache when all three artefacts exist.
    if all(os.path.exists(f) for f in (dataset_cache_file, id2tag_file, tag2id_file)):
        dataset = load_from_disk(dataset_cache_file)
        with open(id2tag_file, "r") as f:
            # JSON turned the int ids into strings; convert them back.
            id2tag = {int(k): v for k, v in json.load(f).items()}
        with open(tag2id_file, "r") as f:
            tag2id = json.load(f)
        train_split, val_split = split(dataset)
        return train_split, val_split, tag2id, id2tag

    # SLOW PATH: build the dataframe from the raw file.
    if dataset_filepath.endswith(".tsv"):
        df = parse_tsv_fileformat(dataset_filepath)
    elif dataset_filepath.endswith(".csv"):
        df = parse_csv_fileformat(dataset_filepath)
    else:
        raise ValueError(f"Unknown file format: {dataset_filepath}")

    df.columns = df.columns.str.lower()
    df["text"] = df["text"].astype(str)
    if dataset_filepath.endswith(".csv"):
        # CSV stores the entity list as a Python-literal string; blanks become [].
        df["entities"] = df["entities"].apply(lambda x: ast.literal_eval(x) if pd.notna(x) and isinstance(x, str) and x.strip() else [])

    all_word_lists = []
    all_word_labels = []

    tag_set = {"O"}

    # Word-level labelling: a word starting exactly at an entity start gets
    # "B-<type>", a word starting strictly inside the entity gets "I-<type>".
    #
    # NOTE(review): only the part after the last "-" of ent["type"] is kept, so
    # a per-token entity whose type is "O" labels its word "B-O", and every
    # single-token entity is labelled "B-<type>" regardless of its original
    # prefix. The tag list printed by this notebook contains "B-O"/"I-O".
    # Changing this alters the label space of already fine-tuned models and of
    # the on-disk cache, so it is left as is.
    for text, entities in zip(df["text"], df["entities"]):
        words = text.split()
        word_starts = []
        pos = 0
        for word in words:
            start = text.find(word, pos)
            word_starts.append(start)
            pos = start + len(word)

        word_tags = ["O"] * len(words)
        for ent in entities:
            ent_start = ent["start"]
            ent_end = ent["end"]
            ent_type = ent["type"].split("-")[-1]

            for i, start in enumerate(word_starts):
                if start == ent_start:
                    word_tags[i] = f"B-{ent_type}"
                elif ent_start < start < ent_end:
                    word_tags[i] = f"I-{ent_type}"

        all_word_lists.append(words)
        all_word_labels.append(word_tags)
        tag_set.update(word_tags)

    # Every "B-X" needs an "I-X" id: sub-word continuation tokens are labelled "I-X" below.
    for tag in list(tag_set):
        if tag.startswith("B-"):
            tag_set.add(tag.replace("B-", "I-"))

    tag_list = sorted(tag_set)
    tag2id = {tag: i for i, tag in enumerate(tag_list)}
    id2tag = {i: tag for tag, i in tag2id.items()}

    tokenized = tokenizer(
        all_word_lists,
        is_split_into_words=True,
        return_offsets_mapping=False,
        padding="max_length",
        truncation=True,
        max_length=max_seqlen
    )

    all_labels = []

    for row, word_tags in enumerate(all_word_labels):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized.word_ids(batch_index=row):
            if word_idx is None:
                # Special/padding token: ignored by the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-word token of a word carries the word's own tag.
                label_ids.append(tag2id[word_tags[word_idx]])
            else:
                # Same word as previous token -> propagate I-*
                label = word_tags[word_idx]
                if label.startswith("B-"):
                    label = label.replace("B-", "I-")
                label_ids.append(tag2id[label])
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    dataset = Dataset.from_dict({
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "ner_tags": all_labels
    })

    # SAVE in preprocessed cache for future use.
    os.makedirs(preprocessed_dir, exist_ok=True)
    dataset.save_to_disk(dataset_cache_file)
    with open(id2tag_file, "w") as f:
        json.dump({str(k): v for k, v in id2tag.items()}, f)
    with open(tag2id_file, "w") as f:
        json.dump(tag2id, f)

    train_split, val_split = split(dataset)
    return train_split, val_split, tag2id, id2tag



# Full path of the dataset file selected in the configuration cell.
DATASET_FILE_PATH = os.path.join(DATASET_DIR, selected_dataset)

print("Loading training dataset from: ", DATASET_FILE_PATH)

# Tokenizer matching the pretrained encoder, loaded via the Drive-backed cache.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=PRETRAINED_MODEL_DIR)

train_dataset, val_dataset, tags2id, id2tags = prepare_dataset_for_finetuning(dataset_filepath=DATASET_FILE_PATH, tokenizer=tokenizer, max_seqlen=max_seq_len)

print("number of messages for training: {}".format(len(train_dataset)))
print("number of messages for validation: {}".format(len(val_dataset)))
# The colon used to trail the tag map; it now separates the count from the map.
print("number of tags {}: {}".format(len(tags2id), tags2id))

Loading training dataset from:  /content/drive/MyDrive/cidl/datasets/wikipedia-ner-dataset.tsv
number of messages for training: 118152
number of messages for validation: 13128
number of tags 33 {'B-ANIM': 0, 'B-BIO': 1, 'B-CEL': 2, 'B-DIS': 3, 'B-EVE': 4, 'B-FOOD': 5, 'B-INST': 6, 'B-LOC': 7, 'B-MEDIA': 8, 'B-MYTH': 9, 'B-O': 10, 'B-ORG': 11, 'B-PER': 12, 'B-PLANT': 13, 'B-TIME': 14, 'B-VEHI': 15, 'I-ANIM': 16, 'I-BIO': 17, 'I-CEL': 18, 'I-DIS': 19, 'I-EVE': 20, 'I-FOOD': 21, 'I-INST': 22, 'I-LOC': 23, 'I-MEDIA': 24, 'I-MYTH': 25, 'I-O': 26, 'I-ORG': 27, 'I-PER': 28, 'I-PLANT': 29, 'I-TIME': 30, 'I-VEHI': 31, 'O': 32}: 


**Build the model and run the finetuning training run**

In [None]:
# Build the model with one output class per tag found in the dataset.
model = build_entity_extraction_model(model_dir=PRETRAINED_MODEL_DIR, model_name=MODEL_NAME, num_ner_tags=len(tags2id))

# Fine-tune for 3 epochs; checkpoints and logs are written under FINETUNED_MODEL_DIR.
finetune_entity_extraction_model(model=model, model_dir=FINETUNED_MODEL_DIR, train_dataset=train_dataset, val_dataset=val_dataset, epochs=3)

Epoch,Training Loss,Validation Loss
1,0.0405,0.064871
2,0.0438,0.057069
3,0.0345,0.060788


**Save our finetuned model for future use**

In [None]:
# Persist the fine-tuned model, the tokenizer and both tag maps in FINETUNED_MODEL_DIR.
save_finetuned_entity_extraction_model(tokenizer, model, FINETUNED_MODEL_DIR, tags2id, id2tags)

Saving finetuned model to dir:  /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-extract-entities/wikipedia-ner-dataset


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

{'B-ANIM': 0, 'B-BIO': 1, 'B-CEL': 2, 'B-DIS': 3, 'B-EVE': 4, 'B-FOOD': 5, 'B-INST': 6, 'B-LOC': 7, 'B-MEDIA': 8, 'B-MYTH': 9, 'B-O': 10, 'B-ORG': 11, 'B-PER': 12, 'B-PLANT': 13, 'B-TIME': 14, 'B-VEHI': 15, 'I-ANIM': 16, 'I-BIO': 17, 'I-CEL': 18, 'I-DIS': 19, 'I-EVE': 20, 'I-FOOD': 21, 'I-INST': 22, 'I-LOC': 23, 'I-MEDIA': 24, 'I-MYTH': 25, 'I-O': 26, 'I-ORG': 27, 'I-PER': 28, 'I-PLANT': 29, 'I-TIME': 30, 'I-VEHI': 31, 'O': 32}


# **STEP 3: Use our finetuned custom model to extract entities from text**

**Load our previously finetuned Entity Extraction Model**

In [None]:
# Reload tokenizer and model from disk, replacing the in-session objects.
tokenizer, model, config = load_finetuned_entity_extraction_model(FINETUNED_MODEL_DIR)
# id -> tag map stored alongside the model; needed to decode predictions.
id2tag = config["id2tag"]

Loading model from: /content/drive/MyDrive/cidl/models/finetuned/bert/s3e2-extract-entities/wikipedia-ner-dataset


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Pass text and extract entities**

In [None]:
import time

# Sample sentences used to demonstrate entity extraction.
sentences = [
    "Capital of India is New Delhi just like Paris is for France and Washington DC is for USA",
    "Tom Hanks acted in the movie Cast Away, in which he was lost at sea while on a Fedex plane",
    "Steph Curry plays for the Golden State Warriors, a basketball team in United States, which was earlier owned by Vivek Ranadive",
    "Virat Kholi, Roger Federer, Lebron James, are legends who represented Indian Cricket Team, Swiss National Tennis Team and LA Lakers",
    "Partha Seetala, who lives in Los Altos Hills, flew to meet  Sheik Abul-bin Sultan-el Hakimul Khan, who works for The Emirates Group",
]

# NOTE(review): total_ms is accumulated below but never read in this cell.
total_ms = 0
for text in sentences:
    # Wall-clock time of one extraction call (the call also moves the model to the device).
    start_time = time.time()

    entities = extract_entities_from_text(tokenizer, model, text, id2tag)

    elapsed_ms = round((time.time() - start_time) * 1000)
    total_ms += elapsed_ms

    print(f"TEXT: {text}\n → Time: {elapsed_ms} ms\n → Entities: {entities}\n")



TEXT: Capital of India is New Delhi just like Paris is for France and Washington DC is for USA
 → Time: 5045 ms
 → Entities: {'LOC': ['India', 'New Delhi', 'Paris', 'France', 'Washington DC', 'USA']}

TEXT: Tom Hanks acted in the movie Cast Away, in which he was lost at sea while on a Fedex plane
 → Time: 14 ms
 → Entities: {'PER': ['Tom Hanks'], 'ORG': ['Fedex']}

TEXT: Steph Curry plays for the Golden State Warriors, a basketball team in United States, which was earlier owned by Vivek Ranadive
 → Time: 14 ms
 → Entities: {'PER': ['Steph Curry', 'Vivek Ranadive'], 'ORG': ['Golden State Warriors'], 'LOC': ['United States']}

TEXT: Virat Kholi, Roger Federer, Lebron James, are legends who represented Indian Cricket Team, Swiss National Tennis Team and LA Lakers
 → Time: 13 ms
 → Entities: {'PER': ['Virat Kholi', 'Roger Federer', 'Lebron James'], 'ORG': ['Indian Cricket Team', 'Swiss National Tennis Team', 'LA Lakers']}

TEXT: Partha Seetala, who lives in Los Altos Hills, flew to meet  S

**Bonus example: download text from URL and extract entities from there**

In [None]:
def extract_entities_from_long_text_in_chunks(tokenizer, model, text, id2tag, max_seqlen=128, stride=64):
    """Extract entities from a text longer than the model's input window.

    The text is tokenized once without special tokens so that every offset
    refers to real text. Overlapping token windows are mapped back to character
    ranges and each range is passed to ``extract_entities_from_text``. Each
    window leaves room for the special tokens that are added again when the
    chunk is re-tokenized, so a chunk is normally not truncated.

    NOTE(review): an entity straddling a window boundary can still be returned
    as a fragment by one chunk and in full by the overlapping chunk.

    Args:
        tokenizer: Fast tokenizer (offset mapping required).
        model: Model passed through to ``extract_entities_from_text``.
        text: Input string of any length.
        id2tag: Mapping predicted id -> tag.
        max_seqlen: Token budget per chunk, special tokens included.
        stride: Distance in tokens between the starts of consecutive windows.

    Returns:
        dict mapping entity type -> list of unique phrases in first-seen order.

    Raises:
        ValueError: If ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    # Without special tokens the first/last offsets are real character spans;
    # a trailing [SEP] offset of (0, 0) would otherwise empty the final chunk.
    encoding = tokenizer(
        text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        return_token_type_ids=False,
        return_attention_mask=False,
        truncation=False,
        padding=False
    )

    offsets = encoding["offset_mapping"]
    total_len = len(offsets)

    # Reserve room for the special tokens added when a chunk is re-tokenized.
    window = max(1, max_seqlen - tokenizer.num_special_tokens_to_add())
    # A step larger than the window would leave tokens uncovered.
    step = min(stride, window)

    grouped_entities = {}
    seen = {}  # entity type -> set of phrases already collected

    for start in range(0, total_len, step):
        end = min(start + window, total_len)

        # Character span covered by this token window.
        chunk_text = text[offsets[start][0]:offsets[end - 1][1]]

        chunk_entities = extract_entities_from_text(tokenizer, model, chunk_text, id2tag, max_seqlen=max_seqlen)

        # Merge entities, skipping phrases already seen in an overlapping chunk.
        for ent_type, values in chunk_entities.items():
            bucket = grouped_entities.setdefault(ent_type, [])
            known = seen.setdefault(ent_type, set())
            for val in values:
                if val not in known:
                    known.add(val)
                    bucket.append(val)

        if end == total_len:
            break  # this window reached the end of the text

    return grouped_entities


In [None]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url):
    """Download a web page and return its visible text, one non-empty line per row.

    Script, style and noscript elements are removed before the text is
    extracted; every line is stripped and blank lines are dropped.

    Args:
        url: Address of the page to fetch (10 second timeout).

    Returns:
        The cleaned text, or a string starting with
        "Error fetching or parsing URL: " if anything goes wrong.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()  # treat HTTP error statuses as failures

        document = BeautifulSoup(page.text, "html.parser")

        # Drop elements whose content is not visible page text.
        for node in document(["script", "style", "noscript"]):
            node.decompose()

        raw_lines = document.get_text(separator="\n").splitlines()
        kept_lines = [stripped for stripped in (raw.strip() for raw in raw_lines) if stripped]
        return "\n".join(kept_lines)

    except Exception as e:
        return f"Error fetching or parsing URL: {e}"


url = "https://en.wikipedia.org/wiki/Sam_Altman"
text = extract_text_from_url(url)

# Use the id -> tag map loaded together with the fine-tuned model in STEP 3
# (id2tag), not the training-time map (id2tags), which only exists when the
# dataset-preparation cell was run in this session.
entities = extract_entities_from_long_text_in_chunks(tokenizer, model, text, id2tag)

for ent_type, phrases in entities.items():
    print(ent_type, phrases)

Token indices sequence length is longer than the specified maximum sequence length for this model (11305 > 512). Running this sequence through the model will result in indexing errors


PER ['Sam Altman', 'Samuel Harris Altman', 'Sam Altman Altman', 'Oliver Mulherin', 'Patrick Chung', 'Paul Graham', 'Jack Altman', 'Elon Musk', 'Jessica Livingston', 'Peter Thiel', 'Altman', 'Greg Brockman', 'Rishi Sunak', 'Emmanuel Macron', 'Pedro Sánchez', 'Olaf Scholz', 'Narendra Modi', 'Yoon Suk-yeol', 'Isaac Herzog', 'Ursula von', 'Ursula von der Leyen', 'Helen Toner', "Adam D'Angelo", 'Tasha', 'Tasha McCauley', 'Ilya Sutskever', 'Brock', 'Satya Nadella', 'Brockman', "D'Angelo", 'Demis Hassabis', 'Dario Amodei', 'Andrew Yang', 'Joe Biden', 'Dean Phillips', 'Daniel Lurie', 'Donald Trump', 'Mark Warner', 'Yishan Wong', 'Steve Huffman', 'Michael Klein', 'Bret Taylor', 'Balaji Srinivasan', 'Nancy Pelosi', 'Ric Weiland', 'Walter Isaacson', 'Nick Sivo', 'Ann Altman', 'Connie', 'Max', 'Jack', 'Weil', 'Elizabeth', 'Mickle', 'Trip', 'Metz', 'Cade', 'Isaac', 'Mike', 'Weise', 'Karen', 'Tripp', 'Ben Thompson', 'Chapman', 'Glenn', 'Michael', 'Jacob', 'Ben', 'Ricky', 'Schneider', 'Sharon', 'Jere