# Format for the training data:
## TRAIN_DATA = [ (TEXT AS A STRING, {"entities": [(START, END, LABEL)]}) ]

In [1]:
# Creating the training data in specified format
#Import the requisite library
import spacy

#Build upon the spaCy Small Model
nlp = spacy.load("en_core_web_sm")

#Sample text
text = "Treblinka is a small village in Poland. Wikipedia notes that Treblinka is not large."

corpus = []

doc = nlp(text)
for sent in doc.sents:
    corpus.append(sent.text)

#Build upon the spaCy Small Model
nlp = spacy.blank("en")

#Create the EntityRuler
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns
patterns = [
                {"label": "GPE", "pattern": "Treblinka"}
            ]

ruler.add_patterns(patterns)

In [2]:

TRAIN_DATA = []

#iterate over the corpus again
for sentence in corpus:
    doc = nlp(sentence)

    #remember, entities needs to be a dictionary in index 1 of the list, so it needs to be an empty list
    entities = []

    #extract entities
    for ent in doc.ents:

        #appending to entities in the correct format
        entities.append([ent.start_char, ent.end_char, ent.label_])

    TRAIN_DATA.append([sentence, {"entities": entities}])

print (TRAIN_DATA)

[['Treblinka is a small village in Poland.', {'entities': [[0, 9, 'GPE']]}], ['Wikipedia notes that Treblinka is not large.', {'entities': [[21, 30, 'GPE']]}]]


Converting the training data to a binary file

In [19]:
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    #print(output_path.split("/")[:-1][0])
    Path(output_path.split("/")[:-1][0]).mkdir(parents=True, exist_ok=True)
    db.to_disk(output_path)

In [20]:
# Converting TRAIN_DATA to binary files
# Note: For example taken the same TRAIN_DATA as train and valid data.
convert("en", TRAIN_DATA, "data/train.spacy")
convert("en", TRAIN_DATA, "data/valid.spacy")

data
data


Note: The base_config.cfg file was created from https://spacy.io/usage/training, with the ner component and CPU hardware selected.

In [23]:
# Auto-fill the remaining settings of data/base_config.cfg and save the completed config as data/config.cfg
!python -m spacy init fill-config data/base_config.cfg data/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
data\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
