In [23]:
# Install the notebook's dependencies into the *kernel's* environment
# (%pip targets the running kernel; `!pip` may hit a different interpreter).
# NOTE(review): `datasets` is pinned below 4.x because the load_dataset call in
# this notebook passes trust_remote_code=True, i.e. relies on script-based
# dataset loading, which datasets 4.x is understood to have removed — confirm.
%pip install spacy "datasets<4"




In [24]:
import spacy
from datasets import load_dataset
from spacy.tokens import Doc, DocBin, Span


# Fetch every split of CoNLL-2003 from the Hugging Face hub.
dataset = load_dataset("conll2003", trust_remote_code=True)

# The integer ids stored in "ner_tags" index into this list of tag strings.
ner_tag_feature = dataset["train"].features["ner_tags"]
label_list = ner_tag_feature.feature.names

def create_spacy_docs(dataset_split, nlp, label_names=None):
    """Convert a token-level, IOB-tagged dataset split into a spaCy ``DocBin``.

    Each example's tokens are used as-is for the ``Doc`` (no re-tokenization),
    and consecutive ``B-X`` / ``I-X`` tags are merged into one entity span
    labelled ``X``.

    Args:
        dataset_split: Iterable of examples. Each example is indexed with
            ``"tokens"`` (token strings) and ``"ner_tags"`` (integer tag ids,
            one per token).
        nlp: spaCy pipeline; only its ``vocab`` is used, to build each ``Doc``.
        label_names: Optional sequence mapping a tag id to its tag string
            (e.g. ``"B-PER"``). Defaults to the module-level ``label_list``.

    Returns:
        A ``DocBin`` containing one ``Doc`` per example, with ``doc.ents`` set.

    Notes:
        An ``I-X`` tag that does not continue an open ``X`` entity starts a
        new entity (IOB1-style input) instead of being silently dropped. For
        well-formed IOB2 input the result is unchanged.
    """
    tag_names = label_list if label_names is None else label_names
    doc_bin = DocBin()
    for example in dataset_split:
        tokens = example["tokens"]
        tags = [tag_names[tag_id] for tag_id in example["ner_tags"]]

        # Build the Doc straight from the dataset's own tokenization.
        doc = Doc(nlp.vocab, words=tokens)
        ents = []
        i = 0
        while i < len(tags):
            tag = tags[i]
            if not tag.startswith(("B-", "I-")):
                # "O" (or any other non-entity tag): nothing to record.
                i += 1
                continue

            start = i
            ent_label = tag[2:]
            inside_tag = f"I-{ent_label}"
            i += 1
            # Extend the entity over every following I- tag of the same type.
            while i < len(tags) and tags[i] == inside_tag:
                i += 1

            # Token indices are already known, so build the span from them
            # directly rather than round-tripping through character offsets.
            ents.append(Span(doc, start, i, label=ent_label))
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin


# A blank English pipeline is enough here: only its vocab is needed to build Docs.
nlp = spacy.blank("en")

# Serialize the training and validation splits to the files `spacy train` reads.
for split_name, spacy_path in (("train", "train.spacy"), ("validation", "dev.spacy")):
    create_spacy_docs(dataset[split_name], nlp).to_disk(spacy_path)


In [None]:
# Train the pipeline described by config.cfg on train.spacy, scoring against
# dev.spacy; output is saved under ./output.
# NOTE(review): this cell was never executed (In [None]) and repeats the
# training command of a later cell. No visible cell creates config.cfg —
# presumably it was generated beforehand (e.g. via `python -m spacy init config`); confirm.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Run spaCy's config debugger on config.cfg to surface problems before training.
! python -m spacy debug config config.cfg

In [None]:
# Auto-fill config.cfg; input and output path are the same file, so the
# filled config is written back over the original.
! python -m spacy init fill-config config.cfg config.cfg


In [25]:
# Train the pipeline described by config.cfg on train.spacy, scoring against
# dev.spacy; output is saved under ./output (the later cells load
# output/model-best from there).
# NOTE(review): no visible cell creates config.cfg — presumably it was
# generated beforehand (e.g. via `python -m spacy init config`); confirm.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy


[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     37.89    0.00    0.00    0.00    0.00
  0     200        143.81   3498.24   49.71   52.37   47.31    0.50
  0     400        188.17   2175.37   70.04   69.98   70.09    0.70
  0     600        270.97   1952.99   73.73   74.21   73.26    0.74
  0     800        321.76   2081.55   77.45   77.99   76.93    0.77
  0    1000        399.24   2132.90   81.20   82.26   80.16    0.81
  1    1200        401.13   1991.80   83.33   83.36   83.31    0.83
  1    1400        487.94   1749.08   84.16   84.05   84.26    0.84
  1    1600        583.61   2062.40   85.27   86.06   84.48    0.85
  2    1800        680.09   1790.29   86.57   

In [26]:
import spacy

# Smoke-test the best checkpoint on a single hand-written sentence.
nlp = spacy.load("output/model-best")
doc = nlp("Barack Obama visited Berlin in 2008.")

# Show each predicted entity as a (text, label) pair.
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
print(entity_pairs)


[('Barack Obama', 'PER'), ('Berlin', 'LOC')]


In [27]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

# Load the best checkpoint and the gold-annotated validation docs.
nlp = spacy.load("output/model-best")
doc_bin = DocBin().from_disk("dev.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Pair each gold doc with an *unannotated* doc over the same text.
# nlp.evaluate() runs the pipeline on the predicted side itself, so calling
# nlp(doc.text) here would process every document twice; make_doc only tokenizes.
examples = [Example(predicted=nlp.make_doc(doc.text), reference=doc) for doc in docs]

# Despite its name, `scorer` holds the dict of scores returned by evaluate()
# (the name is kept because a later cell displays it).
scorer = nlp.evaluate(examples)


In [28]:
# Display the evaluation scores (a dict: overall and per-entity-type P/R/F, plus speed).
scorer

{'token_acc': None,
 'token_p': None,
 'token_r': None,
 'token_f': None,
 'ents_p': 0.870531896699162,
 'ents_r': 0.8566139347021205,
 'ents_f': 0.8635168377300874,
 'ents_per_type': {'ORG': {'p': 0.8163418290854573,
   'r': 0.8120805369127517,
   'f': 0.8142056074766354},
  'LOC': {'p': 0.9063205417607223,
   'r': 0.874251497005988,
   'f': 0.8899972291493489},
  'MISC': {'p': 0.8927294398092968,
   'r': 0.8123644251626898,
   'f': 0.8506530380465643},
  'PER': {'p': 0.8654048370136698,
   'r': 0.8935939196525515,
   'f': 0.8792735042735043}},
 'speed': 21992.495780026773}