# Training a custom NER tagger from a blank spaCy model

Using a small training dataset which I have manually annotated.

In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline (no trained components); used below only to tokenize the training texts
nlp = spacy.blank("en")
# Container that collects the annotated Doc objects before they are written to disk
db = DocBin()

In [22]:
import json

# Load the manually annotated examples. The context manager closes the file
# handle as soon as parsing finishes, and the explicit UTF-8 encoding keeps
# non-ASCII characters in the recipes (e.g. "½") intact on every platform.
with open('ingredients_train.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [10]:
# Start from an empty DocBin so that re-running this cell does not append the
# same documents again to a container left over from an earlier execution.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets that fall inside a token inward to
        # token boundaries; char_span returns None when no token fits the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Serialize the annotated docs to the file passed to the training command
db.to_disk("./ingredients_train.spacy")

100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 578.00it/s]

Skipping entity





In [4]:
# Generate config.cfg for an English NER pipeline, optimized for efficiency
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# Train from config.cfg and write the resulting pipelines to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_* scores reported during training are measured on the training data itself.
! python -m spacy train config.cfg --output ./ --paths.train ./ingredients_train.spacy --paths.dev ./ingredients_train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-07-22 13:23:02,730] [INFO] Set up nlp object from config
[2022-07-22 13:23:02,746] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-07-22 13:23:02,752] [INFO] Created vocabulary
[2022-07-22 13:23:02,753] [INFO] Finished initializing nlp object
[2022-07-22 13:23:03,060] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     32.14    0.00    0.00    0.00    0.00
 13     200         50.61   1716.49  100.00  100.00  100.00    1.00
 30     400         38.47     27.81   99.70   99.40  100.00    1.00
 50     600         13.46      6.57  100.00  100.00  100.00    1.00
 78     800          0.19      0.11  100.00  100.00  100.00    

In [2]:
# Load the trained pipeline from ./model-best (the training command above writes to the current directory)
nlp_ner = spacy.load("./model-best")

In [3]:
# Free-form recipe text used as a qualitative test input for the trained pipeline
sample_text = '''
1/2 a ripe avocado 
1 piece of toast 
Olive oil
Chilli flakes 
Spicy peanut butter 

Either make your own - recipe below - or add a sprinkling of chilli flakes, paprika and maple to a jar of peanut butter, it’s 👌👌👌

Homemade spicy peanut butter 
250g redskin peanuts
½ - 1 teaspoon chili flakes (depending on how much spice you like)
1 teaspoon smoked paprika
1 tablespoon maple syrup
Pinch of flaky sea salt
Makes enough sauce for 4 sundaes
100g coconut sugar 
50ml water 
200g chopped pineapple 
2 pieces star anise 
100ml plant based milk 

To serve: 
Vanilla ice cream 
A handful of crushed, toasted peanuts 
'''

# Run the trained pipeline; the predicted entities are read from doc.ents in the cells below
doc = nlp_ner(sample_text)

In [4]:
# Render the predicted entities with displaCy's "ent" style inside the notebook
spacy.displacy.render(doc, style="ent", jupyter=True)

In [5]:
# One output row per predicted entity: text, character start/end offsets, label
for entity in doc.ents:
    row = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*row)

1/2 1 4 QUANTITY
avocado 12 19 INGREDIENT
1 21 22 QUANTITY
toast 32 37 INGREDIENT
Olive oil 39 48 INGREDIENT
Chilli flakes 49 62 INGREDIENT
peanut butter 70 83 INGREDIENT
sprinkling 133 143 MEASUREMENT
chilli flakes 147 160 INGREDIENT
peanut butter 192 205 INGREDIENT
👌👌👌 212 215 INGREDIENT
250g 247 251 QUANTITY
redskin peanuts 252 267 INGREDIENT
½ 268 269 QUANTITY
1 272 273 QUANTITY
teaspoon 274 282 MEASUREMENT
chili flakes 283 295 INGREDIENT
1 335 336 QUANTITY
teaspoon 337 345 MEASUREMENT
paprika 353 360 INGREDIENT
1 361 362 QUANTITY
tablespoon 363 373 MEASUREMENT
syrup 380 385 INGREDIENT
Pinch 386 391 MEASUREMENT
salt 405 409 INGREDIENT
Makes enough 410 422 INGREDIENT
4 433 434 QUANTITY
sundaes 435 442 INGREDIENT
100g 443 447 QUANTITY
coconut sugar 448 461 INGREDIENT
50ml 463 467 QUANTITY
water 468 473 INGREDIENT
200g 475 479 QUANTITY
2 499 500 QUANTITY
100ml 520 525 QUANTITY
based milk 532 542 INGREDIENT
handful 577 584 MEASUREMENT
crushed 588 595 INGREDIENT


In [21]:
# Token-level view of the first 30 tokens: text, IOB tag and entity type.
# Slicing the Doc (instead of indexing with range(30)) yields the same tokens
# but cannot raise IndexError when the document has fewer than 30 tokens.
for token in doc[:30]:
    print(token.text, token.ent_iob_, token.ent_type_)


 O 
1/2 B QUANTITY
a O 
ripe O 
avocado B INGREDIENT

 O 
1 B QUANTITY
piece O 
of O 
toast B INGREDIENT

 O 
Olive B INGREDIENT
oil I INGREDIENT

 O 
Chilli B INGREDIENT
flakes I INGREDIENT

 O 
Spicy O 
peanut B INGREDIENT
butter I INGREDIENT


 O 
Either O 
make O 
your O 
own O 
- O 
recipe O 
below O 
- O 
or O 


In [23]:
# Preview the label set and the first two annotated examples rather than
# dumping the entire dataset into the cell output.
{
    'classes': TRAIN_DATA['classes'],
    'annotations': TRAIN_DATA['annotations'][:2],
}

{'classes': ['INGREDIENT', 'QUANTITY', 'MEASUREMENT'],
 'annotations': [['Sweetcorn fritters:', {'entities': []}],
  ['ingredients\n- 2x 200g cans of sweetcorn, drained\n- 4 sliced spring onions\n- 4 tablespoons of gram flour\n- 1 teaspoon of paprika\n- Handful of chopped fresh coriander\n- 1 ½ tablespoons of water\n- juice of ½ a lime\n- 1 tablespoon of olive oil\n- a pinch of salt\n- Pinch of salt & pepper',
   {'entities': [[14, 21, 'QUANTITY'],
     [30, 39, 'INGREDIENT'],
     [51, 52, 'QUANTITY'],
     [60, 73, 'INGREDIENT'],
     [76, 77, 'QUANTITY'],
     [78, 89, 'MEASUREMENT'],
     [93, 103, 'INGREDIENT'],
     [106, 107, 'QUANTITY'],
     [108, 116, 'MEASUREMENT'],
     [120, 127, 'INGREDIENT'],
     [130, 137, 'MEASUREMENT'],
     [155, 164, 'INGREDIENT'],
     [167, 170, 'QUANTITY'],
     [171, 182, 'MEASUREMENT'],
     [186, 191, 'INGREDIENT'],
     [203, 204, 'QUANTITY'],
     [207, 211, 'INGREDIENT'],
     [214, 215, 'QUANTITY'],
     [216, 226, 'MEASUREMENT'],
     [2

In [30]:
# Load the pretrained en_core_web_sm pipeline (presumably as a baseline for the custom model)
nlp2 = spacy.load('en_core_web_sm')
# Display the names of its pipeline components
nlp2.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [32]:
# Fetch the pretrained pipeline's NER component (not referenced again in the cells visible here)
ner = nlp2.get_pipe('ner')

In [35]:
# Entities the pretrained pipeline finds in the same sample text
nlp2(sample_text).ents

(1/2, Chilli, 250, ½ - 1, 1 tablespoon, 4, 100, 50ml, 200, 2, 100ml)

In [36]:
# Render the pretrained pipeline's entities in the notebook (runs nlp2 on sample_text again)
spacy.displacy.render(nlp2(sample_text), style="ent", jupyter=True)