In [42]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import json
from tqdm import tqdm
from spacy.util import filter_spans
from spacy.training.iob_utils import offsets_to_biluo_tags

In [43]:
with open('Data/train/train_data.json','rb') as f:
    train_data=json.load(f)

In [44]:
# create a blank English NLP model
nlp = spacy.blank('en')

# Create the NER component and add it to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels to the NER component
for item in train_data:
    for _, _, label in item['entities']:
        ner.add_label(label)

# Prepare training data in the format required by spaCy 3.x
train_examples = []
count=0
for item in train_data:
    doc = nlp.make_doc(item["text"])
    ents = []
    for start, end, label in item['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    example = Example.from_dict(doc, {"entities": item['entities']})
    train_examples.append(example)



In [45]:
# Initialize the optimizer
optimizer = nlp.begin_training()

# Training loop
n_iter = 20
for itn in range(n_iter):
    random.shuffle(train_examples)
    losses = {}
    # Batch up the examples using spaCy's minibatch
    batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(
            batch,  # batch of Example objects
            drop=0.2,  # dropout - make it harder to memorise data
            sgd=optimizer,  # callable to update weights
            losses=losses
        )
    scores = nlp.evaluate(train_examples)
    ents_p = scores["ents_p"]
    ents_r = scores["ents_r"]
    ents_f = scores["ents_f"]

    print(f"Iteration {itn}: Losses: {losses['ner']}, Precision: {ents_p:.3f}, Recall: {ents_r:.3f}, F1-score: {ents_f:.3f}")

# Save the model
# nlp.to_disk("ner_model")

Iteration 0: Losses: 33538.01212059083, Precision: 0.999, Recall: 1.000, F1-score: 0.999
Iteration 1: Losses: 6338.971474058709, Precision: 0.964, Recall: 1.000, F1-score: 0.982
Iteration 2: Losses: 3992.416268166616, Precision: 0.939, Recall: 1.000, F1-score: 0.968
Iteration 3: Losses: 3251.762276514144, Precision: 0.904, Recall: 1.000, F1-score: 0.949
Iteration 4: Losses: 2788.5484109306753, Precision: 0.897, Recall: 1.000, F1-score: 0.946
Iteration 5: Losses: 2528.9015789304076, Precision: 0.855, Recall: 1.000, F1-score: 0.922
Iteration 6: Losses: 2260.192934722879, Precision: 0.926, Recall: 1.000, F1-score: 0.962
Iteration 7: Losses: 2143.315215059381, Precision: 0.953, Recall: 1.000, F1-score: 0.976
Iteration 8: Losses: 1950.2999223150791, Precision: 0.881, Recall: 1.000, F1-score: 0.937
Iteration 9: Losses: 1822.8046323318463, Precision: 0.951, Recall: 1.000, F1-score: 0.975
