# Training a custom NER tagger from a blank spaCy model

Using a small training dataset which I have manually annotated.

In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Shared objects for the data-preparation step: a blank English pipeline
# (used only to tokenize the raw texts via nlp.make_doc) and a DocBin that
# accumulates the annotated docs before they are written to disk.
nlp = spacy.blank("en") # blank English pipeline, no trained components
db = DocBin() # container for the annotated Doc objects

In [7]:
import json

# Load the manually annotated examples. The file is opened in a context
# manager so the handle is closed as soon as parsing finishes (the previous
# bare open() left it open for the lifetime of the kernel). The encoding is
# pinned to UTF-8 so the result does not depend on the platform's default
# text encoding.
# The rest of the notebook reads TRAIN_DATA['annotations'] as a sequence of
# (text, {"entities": [(start, end, label), ...]}) pairs.
with open('ingredients_train.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [10]:
# Convert the annotated examples into spaCy's binary training format.
#
# A fresh DocBin is created here rather than reusing one from an earlier
# cell, so that re-running this cell does not append the same documents a
# second time and write duplicated training data to disk.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # The annotations are character offsets. char_span maps them onto
        # token boundaries and yields None when no valid token span can be
        # produced for the given offsets; such annotations are dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # attach the gold entity spans to the doc
    db.add(doc)

db.to_disk("./ingredients_train.spacy") # save the docbin object

100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 578.00it/s]

Skipping entity





In [4]:
# Generate a training config (config.cfg) for an English pipeline whose only
# requested component is `ner`, using the "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# Train the pipeline described by config.cfg and write the results to the
# current directory (--output ./).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R values printed during training measure fit to the
# training data, not performance on held-out examples. Use a separate
# annotated dev set before drawing conclusions from these scores.
! python -m spacy train config.cfg --output ./ --paths.train ./ingredients_train.spacy --paths.dev ./ingredients_train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-07-22 13:23:02,730] [INFO] Set up nlp object from config
[2022-07-22 13:23:02,746] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-07-22 13:23:02,752] [INFO] Created vocabulary
[2022-07-22 13:23:02,753] [INFO] Finished initializing nlp object
[2022-07-22 13:23:03,060] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     32.14    0.00    0.00    0.00    0.00
 13     200         50.61   1716.49  100.00  100.00  100.00    1.00
 30     400         38.47     27.81   99.70   99.40  100.00    1.00
 50     600         13.46      6.57  100.00  100.00  100.00    1.00
 78     800          0.19      0.11  100.00  100.00  100.00    

In [12]:
# Load the trained pipeline from ./model-best, which sits inside the
# directory passed as --output to `spacy train` (presumably the
# best-scoring checkpoint of that run — confirm against the training output).
nlp_ner = spacy.load("./model-best")

In [13]:
# Free-form recipe text used as a qualitative smoke test of the trained
# tagger: ingredient lists mixed with narrative sentences and an emoji.
sample_text = '''
1/2 a ripe avocado 
1 piece of toast 
Olive oil
Chilli flakes 
Spicy peanut butter 

Either make your own - recipe below - or add a sprinkling of chilli flakes, paprika and maple to a jar of peanut butter, it’s 👌👌👌

Homemade spicy peanut butter 
250g redskin peanuts
½ - 1 teaspoon chili flakes (depending on how much spice you like)
1 teaspoon smoked paprika
1 tablespoon maple syrup
Pinch of flaky sea salt
Makes enough sauce for 4 sundaes
100g coconut sugar 
50ml water 
200g chopped pineapple 
2 pieces star anise 
100ml plant based milk 

To serve: 
Vanilla ice cream 
A handful of crushed, toasted peanuts 
'''

# Run the loaded pipeline over the sample; the result is rendered in a later cell.
doc = nlp_ner(sample_text)

In [14]:
# Visualize the entities predicted for the sample text using displaCy's
# entity style.
spacy.displacy.render(doc, style="ent", jupyter=True) # render inline in the notebook