In [1]:
import spacy

In [2]:
# Prerequisite: download the model once from a terminal with
#   python -m spacy download en_core_web_trf

nlp = spacy.load("en_core_web_trf")

In [3]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
# Run the pretrained pipeline on a sample sentence.
doc = nlp("Red blouse are good")

# One line per detected entity: text | label | explanation of the label.
for entity in doc.ents:
    entity_label = entity.label_
    print(entity.text, "|", entity_label, "|", spacy.explain(entity_label))

In [5]:
from spacy import displacy
# Visualise the entities of `doc` with displaCy's "ent" style; Jupyter mode is
# requested explicitly via jupyter=True.
displacy.render(doc, style="ent", jupyter=True)



## Fine-tuning spaCy on a Custom Dataset

#### Import synthetic data

In [9]:
import json

In [10]:
# Load the annotated examples. The encoding is stated explicitly because the
# texts contain non-ASCII characters (e.g. the typographic apostrophe in
# "Nkechi’s") and the annotations are character offsets into those texts, so
# decoding with a platform-default codec could corrupt both.
with open("synthetic_ner_training_data_fixed.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [11]:
# Inspect the loaded examples: each has an id, the text ("content") and a list
# of character-offset annotations (start, end, tag_name).
data["examples"]

[{'id': 'ca-1',
  'content': "list all products at Yemi's Fashion House",
  'annotations': [{'start': 21, 'end': 41, 'tag_name': 'store'}]},
 {'id': 'ca-2',
  'content': 'find for Fufu across all stores',
  'annotations': [{'start': 9, 'end': 13, 'tag_name': 'product'}]},
 {'id': 'ca-3',
  'content': 'list all products at Oga Market',
  'annotations': [{'start': 21, 'end': 31, 'tag_name': 'store'}]},
 {'id': 'ca-5',
  'content': 'check my order with id 01314',
  'annotations': [{'start': 23, 'end': 28, 'tag_name': 'orderId'}]},
 {'id': 'ca-7',
  'content': 'list all products at Mama Nkechi’s Groceries',
  'annotations': [{'start': 21, 'end': 44, 'tag_name': 'store'}]},
 {'id': 'ca-11',
  'content': 'make a reservation at Mama Nkechi’s Groceries for 2024-12-15 at 12:45',
  'annotations': [{'start': 22, 'end': 45, 'tag_name': 'store'},
   {'start': 50, 'end': 60, 'tag_name': 'date'},
   {'start': 64, 'end': 69, 'tag_name': 'time'}]},
 {'id': 'ca-12',
  'content': 'make a reservation at M

#### Preprocessing data for spaCy

In [12]:
# Reshape every raw example into {"text": ..., "entities": [(start, end, label), ...]},
# keeping the annotations in their original order.
training_data = [
    {
        "text": record["content"],
        "entities": [
            (tag["start"], tag["end"], tag["tag_name"])
            for tag in record["annotations"]
        ],
    }
    for record in data["examples"]
]

# Show the first converted example as a sanity check.
print(training_data[0])

{'text': "list all products at Yemi's Fashion House", 'entities': [(21, 41, 'store')]}


#### Converting training data to spaCy DocBin format

In [14]:
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans


# Convert `training_data` into annotated Doc objects, collect them in a DocBin
# and save it as train.spacy for `spacy train`.
# NOTE: this rebinds `nlp` to a blank English pipeline, replacing the
# transformer pipeline loaded at the top of the notebook.
nlp = spacy.blank("en")
doc_bin = DocBin()

for training_example in tqdm(training_data):
    text = training_example["text"]
    labels = training_example["entities"]
    doc = nlp.make_doc(text)  # tokenise only; no pipeline components are run
    ents = []
    for start, end, label in labels:
        # Character offsets are mapped onto token boundaries; char_span gives
        # None when no valid span can be built from the offsets.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report exactly which annotation is dropped so it can be corrected
            # in the source data. tqdm.write keeps the message from garbling the
            # progress bar the way a bare print() would.
            tqdm.write(
                f"Skipping entity {label!r} at [{start}:{end}] "
                f"({text[start:end]!r}) in {text!r}"
            )
        else:
            ents.append(span)
    # Remove overlapping/duplicate spans before assigning them as entities.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

100%|████████████████████████████████████████████████████████████████████████| 62/62 [00:00<00:00, 4375.76it/s]


#### Initialise spaCy

In [15]:
# Auto-fill the partial Base_config.cfg with all remaining settings and save the
# complete training config as config.cfg.
!python -m spacy init fill-config Base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


#### Training spaCy

In [None]:
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# scores reported during training are measured on the training examples
# themselves. Use a held-out dev.spacy for a meaningful estimate.
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./train.spacy