In [1]:
import pandas as pd

# Toy corpus: ten short sentences, each paired with a space-separated tag
# string ("O" for untagged words, "B-label" for tagged ones).
# NOTE(review): several tag strings contain more tags than the sentence has
# whitespace-separated words (e.g. "Another sample text" has 3 words but 4
# tags; the last row has 5 words but 7 tags). The conversion cell pairs words
# and tags with zip(), which silently ignores the surplus — confirm the
# intended alignment.
sample_texts = [
    "This is a sample text",
    "Another sample text",
    "A third sample text",
    "Text for the fourth example",
    "Fifth example text for testing",
    "Sixth sample text data",
    "Seventh example text",
    "Eighth sample data text",
    "Ninth text example for testing",
    "Tenth sample text data example",
]
sample_labels = [
    "O O O O B-label",
    "O O B-label O",
    "B-label O O O",
    "O O O O O O",
    "O O O O O O",
    "O O O O",
    "O O O O",
    "O O O O",
    "O O O O O O",
    "O O O O O O O",
]
data = {'text': sample_texts, 'labels': sample_labels}
df = pd.DataFrame(data)

# Pull both columns back out of the frame as plain Python lists.
texts = df['text'].to_list()
labels = df['labels'].to_list()

print("Texts:", texts)
print("Labels:", labels)


Texts: ['This is a sample text', 'Another sample text', 'A third sample text', 'Text for the fourth example', 'Fifth example text for testing', 'Sixth sample text data', 'Seventh example text', 'Eighth sample data text', 'Ninth text example for testing', 'Tenth sample text data example']
Labels: ['O O O O B-label', 'O O B-label O', 'B-label O O O', 'O O O O O O', 'O O O O O O', 'O O O O', 'O O O O', 'O O O O', 'O O O O O O', 'O O O O O O O']


In [2]:
def _entities_from_tags(text, tag_string):
    """Convert one sentence and its per-word tag string into character spans.

    Args:
        text: The raw sentence.
        tag_string: Whitespace-separated tags, one per word of ``text``;
            ``'O'`` marks a word that is not part of an entity.

    Returns:
        A list of ``(start, end, tag)`` tuples, one per non-``'O'`` word,
        where ``start``/``end`` are character offsets into ``text``.
    """
    spans = []
    cursor = 0
    # zip() stops at the shorter sequence, so surplus words or tags are ignored.
    for token, tag in zip(text.split(), tag_string.split()):
        # Locate the word in the original string instead of assuming exactly
        # one space between words: split() collapses runs of whitespace, so
        # "len(token) + 1" arithmetic drifts on double spaces, tabs or
        # leading whitespace.
        begin = text.find(token, cursor)
        finish = begin + len(token)
        cursor = finish
        if tag != 'O':
            # NOTE(review): the tag is stored verbatim, so the entity label is
            # the full tag string (e.g. "B-label") — confirm that is the label
            # name intended for training.
            spans.append((begin, finish, tag))
    return spans


# One (text, {"entities": [...]}) pair per sentence.
train_data = [
    (text, {"entities": _entities_from_tags(text, label)})
    for text, label in zip(texts, labels)
]

print("Training Data:", train_data)


Training Data: [('This is a sample text', {'entities': [(17, 21, 'B-label')]}), ('Another sample text', {'entities': [(15, 19, 'B-label')]}), ('A third sample text', {'entities': [(0, 1, 'B-label')]}), ('Text for the fourth example', {'entities': []}), ('Fifth example text for testing', {'entities': []}), ('Sixth sample text data', {'entities': []}), ('Seventh example text', {'entities': []}), ('Eighth sample data text', {'entities': []}), ('Ninth text example for testing', {'entities': []}), ('Tenth sample text data example', {'entities': []})]


In [None]:
!pip install spacy 
!python -m spacy download en_core_web_sm





In [None]:
# spaCy is imported here so this cell also runs on a fresh kernel; without it
# the load below raises NameError because the notebook's main import cell
# comes later.
import spacy

# Load the small English pipeline as a check that the download succeeded.
nlp = spacy.load("en_core_web_sm")


In [None]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from spacy.lookups import Lookups

# Load the pre-trained spaCy pipeline that will be fine-tuned.
nlp = spacy.load('en_core_web_sm')

# Make sure a "lexeme_norm" table exists, but only add the placeholder when
# the loaded vocab has none: assigning a fresh Lookups object would discard
# whatever tables the loaded vocab already holds.
lookups = nlp.vocab.lookups
if not lookups.has_table("lexeme_norm"):
    lookups.add_table("lexeme_norm", {"example": "example"})

# Register every label that occurs in the training annotations with the
# existing NER component.
ner = nlp.get_pipe("ner")
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Create the optimizer with every component except NER switched off.
# resume_training() keeps the weights of the loaded model; begin_training()
# is the spaCy v3 alias of initialize(), which re-initializes them.
# select_pipes() replaces the deprecated disable_pipes().
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.resume_training()

print("NER component and optimizer ready")




NER component and optimizer ready


In [None]:
import random

# Training loop: n_iter passes over the shuffled training data.
n_iter = 10

# Only the NER component is updated; the annotations carry nothing but
# entities, so the other components are switched off for the duration of
# training and restored when the block exits.
with nlp.select_pipes(enable=["ner"]):
    for itn in range(n_iter):
        # Shuffles train_data in place so each pass sees a different order.
        random.shuffle(train_data)
        losses = {}
        for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
            # Build all Example objects for the batch and update once per
            # batch; updating example-by-example made the batching a no-op.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {itn}, Losses: {losses}")

print("Training completed")


In [None]:
# Serialize the pipeline to a directory; the same path is used by the reload
# cell that follows.
output_dir = "ner_model"
nlp.to_disk(path=output_dir)
print(f"Model saved to {output_dir}")


Model saved to ner_model


In [None]:
# Reload the pipeline from disk to confirm the saved directory is usable.
nlp = spacy.load(output_dir)

# Run the reloaded pipeline over a sentence and list every entity it finds
# (text followed by label, one per line).
test_text = "This is a test sentence for the NER model"
doc = nlp(test_text)
print("Entities in '%s':" % test_text)
for span in doc.ents:
    print(span.text, span.label_)
