In [1]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import json
from tqdm import tqdm
from spacy.util import filter_spans

In [2]:
with open('Data/train/train_data.json','rb') as f:
    train_data=json.load(f)

In [None]:
# create a blank English NLP model
nlp = spacy.blank('en')

# Create the NER component and add it to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER component
for item in train_data:
    for _, _, label in item['entities']:
        ner.add_label(label)

# Prepare training data in the format required by spaCy 3.x
train_examples = []
count=0
for item in train_data:
    doc = nlp.make_doc(item["text"])
    ents = []
    for start, end, label in item['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    example = Example.from_dict(doc, {"entities": item['entities']})
    train_examples.append(example)



In [4]:
# Initialize the optimizer
optimizer = nlp.begin_training()

# Training loop
n_iter = 300
for itn in range(n_iter):
    random.shuffle(train_examples)
    losses = {}
    # Batch up the examples using spaCy's minibatch
    batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(
            batch,  # batch of Example objects
            drop=0.2,  # dropout - make it harder to memorise data
            sgd=optimizer,  # callable to update weights
            losses=losses
        )
    scores = nlp.evaluate(train_examples)
    ents_p = scores["ents_p"]
    ents_r = scores["ents_r"]
    ents_f = scores["ents_f"]

    print(f"Iteration {itn+1}: Losses: {losses['ner']:.3f}, Precision: {ents_p:.3f}, Recall: {ents_r:.3f}, F1-score: {ents_f:.3f}")

# Save the model
nlp.to_disk("ner_model")

Iteration 1: Losses: 30068.822, Precision: 1.000, Recall: 1.000, F1-score: 1.000
Iteration 2: Losses: 5854.438, Precision: 0.973, Recall: 1.000, F1-score: 0.986
Iteration 3: Losses: 3980.045, Precision: 0.942, Recall: 1.000, F1-score: 0.970
Iteration 4: Losses: 3219.849, Precision: 0.909, Recall: 1.000, F1-score: 0.952
Iteration 5: Losses: 2963.791, Precision: 0.945, Recall: 1.000, F1-score: 0.972
Iteration 6: Losses: 2593.452, Precision: 0.954, Recall: 1.000, F1-score: 0.977
Iteration 7: Losses: 2241.810, Precision: 0.912, Recall: 1.000, F1-score: 0.954
Iteration 8: Losses: 2080.693, Precision: 0.937, Recall: 1.000, F1-score: 0.968
Iteration 9: Losses: 1917.054, Precision: 0.914, Recall: 1.000, F1-score: 0.955
Iteration 10: Losses: 1789.135, Precision: 0.850, Recall: 1.000, F1-score: 0.919
Iteration 11: Losses: 1719.466, Precision: 0.931, Recall: 1.000, F1-score: 0.964
Iteration 12: Losses: 1636.636, Precision: 0.853, Recall: 1.000, F1-score: 0.921
Iteration 13: Losses: 1609.569, Prec