## Here we train our own custom NER model

This is exactly what we do in the main notebook, but this version is self-contained so that others can reproduce it.

In [1]:
import spacy
import random
from spacy.util import minibatch, compounding
from spacy.training import Example
import warnings
warnings.filterwarnings('ignore')

# Start from a blank English pipeline (tokenizer only, no trained components)
nlp = spacy.blank('en')

# Create a new named entity recognizer and add it to the pipeline
ner = nlp.add_pipe('ner')

# Entity categories the recognizer should learn
labels = ['PERSON', 'ORG', 'GPE']

# Annotated training data: (text, {'entities': [(start_char, end_char, label)]}).
# end_char is exclusive, and every span must start and end on token boundaries;
# spans that do not line up with the tokenization are not usable for training.
TRAIN_DATA = [
    ('John Smith is a person', {'entities': [(0, 10, 'PERSON')]}),
    # The span includes the trailing period: "Inc." is kept as a single token,
    # so (0, 8) would end in the middle of that token.
    ('Acme Inc. is an organization', {'entities': [(0, 9, 'ORG')]}),
    ('London is a city', {'entities': [(0, 6, 'GPE')]}),
    # more examples...
]

# Wrap each (text, annotations) pair in an Example built from an unannotated Doc
TRAIN_EXAMPLES = [
    Example.from_dict(nlp.make_doc(raw_text), {'entities': annots.get('entities')})
    for raw_text, annots in TRAIN_DATA
]

# Define the training function
def train_spacy_ner(nlp, train_data, labels, n_iter=20):
    """Train the 'ner' component of ``nlp`` and print the losses per iteration.

    Args:
        nlp: spaCy pipeline that already contains a component named 'ner'.
        train_data: iterable of ``spacy.training.Example`` objects. It is
            copied before shuffling, so the caller's sequence is left as is.
        labels: entity label strings to register on the 'ner' component.
        n_iter: number of full passes over the training data.

    Returns:
        None. The pipeline is updated in place.
    """
    # Register the labels before initialization so the model is sized for them
    ner = nlp.get_pipe('ner')
    for label in labels:
        ner.add_label(label)

    # Work on a private copy: random.shuffle reorders its argument in place
    examples = list(train_data)

    # Only 'ner' is trained; every other component is disabled for the
    # duration of the block and restored afterwards
    with nlp.select_pipes(enable='ner'):
        # initialize() replaces the deprecated begin_training() and returns
        # the optimizer used for the updates below
        optimizer = nlp.initialize()

        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # Batch size grows from 4 towards 32 by a factor of 1.001 per batch
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)

            # Print the losses accumulated over this iteration
            print('Iteration %d: Losses %s' % (itn, losses))

# Run the training loop on the converted examples
train_spacy_ner(nlp=nlp, train_data=TRAIN_EXAMPLES, labels=labels)

# Persist the trained pipeline to the 'custom_ner_model' directory
nlp.to_disk(path='custom_ner_model')


Iteration 0: Losses {'ner': 11.28571343421936}
Iteration 1: Losses {'ner': 11.028032541275024}
Iteration 2: Losses {'ner': 10.733757138252258}
Iteration 3: Losses {'ner': 10.449460625648499}
Iteration 4: Losses {'ner': 10.11997103691101}
Iteration 5: Losses {'ner': 9.6016104221344}
Iteration 6: Losses {'ner': 9.049630999565125}
Iteration 7: Losses {'ner': 8.3404620885849}
Iteration 8: Losses {'ner': 7.712250351905823}
Iteration 9: Losses {'ner': 6.88342410326004}
Iteration 10: Losses {'ner': 5.032250463962555}
Iteration 11: Losses {'ner': 4.616854697465897}
Iteration 12: Losses {'ner': 3.442632034420967}
Iteration 13: Losses {'ner': 2.650576204061508}
Iteration 14: Losses {'ner': 2.4980225265026093}
Iteration 15: Losses {'ner': 2.1496297419071198}
Iteration 16: Losses {'ner': 2.26289115101099}
Iteration 17: Losses {'ner': 2.5032800608314574}
Iteration 18: Losses {'ner': 2.8626511560869403}
Iteration 19: Losses {'ner': 2.3896844245682587}
