In [51]:
import json
import random
import spacy
from spacy.util import minibatch
from spacy.training.example import Example

In [52]:
# Load the annotated records. Decode explicitly as UTF-8 (the standard JSON
# encoding) instead of relying on the platform's default text encoding, which
# can differ between machines and mis-decode non-ASCII characters.
with open('data/labeled-data-hybrid.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

In [53]:
# Show the first record to inspect the fields available on each annotation.
raw_data[0]

{'Unnamed: 0': 0,
 'additional_info': 'In our school we have an Outdoors Instructor who would love the chance to develop a veg patch in her outdoor space to support our SEND students in growing and cooking their own food. The equipment and seeds provided in the giveaway would develop our students understanding of how to grow your own food but also the process of growing your own food.',
 'id': 2015,
 'label': [{'start': 74,
   'end': 93,
   'text': 'develop a veg patch',
   'labels': ['Usage']},
  {'start': 115,
   'end': 182,
   'text': 'to support our SEND students in growing and cooking their own food.',
   'labels': ['Benefit']},
  {'start': 259,
   'end': 348,
   'text': 'understanding of how to grow your own food but also the process of growing your own food.',
   'labels': ['Benefit']}],
 'annotator': 1,
 'annotation_id': 7089,
 'created_at': '2025-04-12T21:16:24.879659Z',
 'updated_at': '2025-04-12T21:16:24.879683Z',
 'lead_time': 42.768}

In [54]:
# Reshape every annotated record into a (text, {"entities": [...]}) tuple.
# Each entity is (start_char, end_char, LABEL); only the first label of an
# annotation is used, upper-cased.
TRAIN_DATA = [
    (
        rec["additional_info"],
        {
            "entities": [
                (span["start"], span["end"], span["labels"][0].upper())
                for span in rec["label"]
            ]
        },
    )
    for rec in raw_data
]

In [55]:
# Spot-check the first three converted training tuples.
TRAIN_DATA[:3]

[('In our school we have an Outdoors Instructor who would love the chance to develop a veg patch in her outdoor space to support our SEND students in growing and cooking their own food. The equipment and seeds provided in the giveaway would develop our students understanding of how to grow your own food but also the process of growing your own food.',
  {'entities': [(74, 93, 'USAGE'),
    (115, 182, 'BENEFIT'),
    (259, 348, 'BENEFIT')]}),
 ('We are currently working on our outdoor provision. We have a large area and would love to be able to give it a full makeover! As a setting our ethos is to be nature inspired whilst using the curiosity approach with the added extras of colour etc. The children love spending time in the garden and enjoy planting and tending for the plants whilst also looking at the nature around. We have a wildlife garden which is in great need for a massive do over due to it being overgrown also - in here we have a pond, which was once home to many frogs!',
  {'e

In [56]:
# Entity label names that get registered on the NER component.
labels = ["CONTEXT", "USAGE", "BENEFIT"]

In [57]:
# Load the installed 'en_core_web_md' spaCy pipeline; its 'ner' component is the one trained below.
nlp = spacy.load('en_core_web_md')

In [58]:
# Fetch the NER component, creating it first if the loaded pipeline lacks one.
# spaCy v3 (required by the `spacy.training` import at the top of the notebook)
# registers components by string name: `nlp.add_pipe('ner')` builds the
# component, appends it to the pipeline and returns it. The v2 pattern of
# `create_pipe` followed by `add_pipe(component_object)` raises on v3.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

In [59]:
# Register every name in `labels` as an entity label on the NER component.
for label in labels:
    ner.add_label(label)

In [72]:
# Make sure every label that occurs in the training annotations is registered
# on the NER component before training starts.
for _, annotations in TRAIN_DATA:
    for ent in annotations['entities']:
        if ent[2] not in ner.labels:
            ner.add_label(ent[2])

# Tunable training settings, kept together so they are easy to find and change.
epochs = 100
BATCH_SIZE = 15
DROPOUT = 0.5
SEED = 0

# Seed Python's `random`, NumPy and the ML backend so that shuffling, weight
# initialisation and dropout are repeatable between runs.
spacy.util.fix_random_seed(SEED)

# Tokenise the texts and align the character-offset annotations once, up front.
# The Example objects do not change between epochs, so rebuilding them for
# every batch of every epoch is wasted work.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Only the NER component is enabled (and therefore updated) inside this block.
with nlp.disable_pipes(*other_pipes):
    # NOTE(review): initialize() (re)initialises the enabled component, so the
    # pretrained NER weights are presumably discarded here. If the intent is to
    # fine-tune the existing NER, nlp.resume_training() is the documented
    # alternative -- confirm which behaviour is wanted.
    optimizer = nlp.initialize()

    for epoch in range(epochs):
        # Shuffle the prepared examples rather than TRAIN_DATA, so the list
        # displayed by an earlier cell is not silently reordered.
        random.shuffle(train_examples)
        losses = {}  # reset per epoch; nlp.update accumulates into it
        for batch in minibatch(train_examples, size=BATCH_SIZE):
            # Pass the optimizer explicitly so it is clear which one is stepping.
            nlp.update(batch, sgd=optimizer, drop=DROPOUT, losses=losses)

        print(f"Epoch: {epoch +1}, Losses: {losses}")



Epoch: 1, Losses: {'ner': 9533.999769367278}
Epoch: 2, Losses: {'ner': 3373.046681917019}
Epoch: 3, Losses: {'ner': 2111.8298322203573}
Epoch: 4, Losses: {'ner': 1268.7511834601187}
Epoch: 5, Losses: {'ner': 1260.472505828361}
Epoch: 6, Losses: {'ner': 2081.233363646942}
Epoch: 7, Losses: {'ner': 1870.1290942860255}
Epoch: 8, Losses: {'ner': 2760.632185367636}
Epoch: 9, Losses: {'ner': 4679.581114066543}
Epoch: 10, Losses: {'ner': 1795.3569531670457}
Epoch: 11, Losses: {'ner': 1373.5194162921005}
Epoch: 12, Losses: {'ner': 1627.4724870795094}
Epoch: 13, Losses: {'ner': 2155.7555929912505}
Epoch: 14, Losses: {'ner': 2706.850242773626}
Epoch: 15, Losses: {'ner': 1775.3004598389962}
Epoch: 16, Losses: {'ner': 2494.793117767037}
Epoch: 17, Losses: {'ner': 2254.270585262983}
Epoch: 18, Losses: {'ner': 2032.8792585826413}
Epoch: 19, Losses: {'ner': 1623.8611707914843}
Epoch: 20, Losses: {'ner': 2971.092258894847}
Epoch: 21, Losses: {'ner': 1791.2796182300083}
Epoch: 22, Losses: {'ner': 1042.

In [73]:
# Save the trained pipeline to the 'custom-ner-model' path so it can be reloaded later.
nlp.to_disk('custom-ner-model')

In [74]:
# Reload the saved pipeline from disk; predictions below use this reloaded copy.
trained_nlp = spacy.load('custom-ner-model')

In [75]:
import pandas as pd

# Read data/april-data.csv into a DataFrame; its 'Additional_info' column supplies the texts sampled for the qualitative check.
df = pd.read_csv('data/april-data.csv')

In [76]:
# Draw a reproducible sample of free-text answers to eyeball the model on.
# - dropna(): a missing value would reach the pipeline as a float NaN, which
#   spaCy cannot process as text.
# - min(...): Series.sample raises if asked for more rows than exist.
# - random_state: the same texts are picked on every run.
SAMPLE_SIZE = 20
additional_info = df['Additional_info'].dropna()
test_texts = additional_info.sample(
    n=min(SAMPLE_SIZE, len(additional_info)),
    random_state=0,
).to_list()

In [77]:
# Run the reloaded pipeline over each sampled text and print the text followed
# by the (span text, label) pairs it predicted, separated by a dashed rule.
separator = '-' * 60
for text in test_texts:
    doc = trained_nlp(text)
    predicted = [(ent.text, ent.label_) for ent in doc.ents]
    # end="\n\n" emits the same blank line a bare print() would.
    print(f"TEXT: {text}", end="\n\n")
    print("ENTITIES:", predicted)
    print(separator)
    

TEXT: we have 5 different support programmes for those with additional needs form ages 14 upwards.  We have a garden centre and this bundle would be great to be able to set up classes to get involved in nature, horticulture and offer this type of learning and connecting with nature that may not be available at home or school. We rely on funding streams which are being cut or stopped which means the variety of what we can offer becomes very limited and we appreciate all the donations, grants etc we can avail of to keep providing a meaningful experience

ENTITIES: [('we have 5 different support programmes for those with additional needs form ages 14 upwards', 'CONTEXT')]
------------------------------------------------------------
TEXT: We have a wonderful outdoor area, with trees and planters and beautiful spaces for the children to explore in early years. However, due to budget constraints a lot of our resources for outside have become quite aged and worn out. It would be lovely to inj