In [10]:
!python -m spacy init config base_config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base_config.cfg
You can now add your data and train your pipeline:
python -m spacy train base_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [13]:
import spacy
from datasets import load_dataset
from spacy.tokens import DocBin
from spacy.training import Example

# Load dataset
dataset = load_dataset("conll2003")

# Load blank English pipeline
nlp = spacy.blank("en")

def convert_to_spacy(data, output_path):
    db = DocBin()
    for item in data:
        tokens = item["tokens"]
        ner_tags = item["ner_tags"]
        tags = [dataset["train"].features["ner_tags"].feature.int2str(tag) for tag in ner_tags]

        doc = nlp.make_doc(" ".join(tokens))
        ents = []
        start = 0

        for token, tag in zip(tokens, tags):
            token_start = doc.text.find(token, start)
            token_end = token_start + len(token)
            if tag.startswith("B-"):
                ent_start = token_start
                ent_end = token_end
                ent_label = tag[2:]
            elif tag.startswith("I-") and 'ent_start' in locals():
                ent_end = token_end
            else:
                if tag != "O" and 'ent_start' in locals():
                    span = doc.char_span(ent_start, ent_end, label=ent_label)
                    if span:
                        ents.append(span)
                    del ent_start, ent_end, ent_label
            start = token_end

        # Final check to catch last entity
        if 'ent_start' in locals():
            span = doc.char_span(ent_start, ent_end, label=ent_label)
            if span:
                ents.append(span)
            del ent_start, ent_end, ent_label

        doc.ents = ents
        db.add(doc)

    db.to_disk(output_path)

# Convert train/dev/test
convert_to_spacy(dataset["train"], "train.spacy")
convert_to_spacy(dataset["validation"], "dev.spacy")
convert_to_spacy(dataset["test"], "test.spacy")


In [14]:
!python -m spacy train config.cfg \
  --output ./output \
  --paths.train ./train.spacy \
  --paths.dev ./dev.spacy


[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.28    0.00    0.00    0.00    0.00
  0     200         77.78   2173.66   27.33   62.30   17.50    0.27
  0     400        106.83   1526.14   52.30   68.17   42.42    0.52
  0     600        153.42   1661.20   58.55   68.14   51.32    0.59
  0     800        238.54   1813.44   60.53   76.37   50.13    0.61
  0    1000        287.46   2125.64   64.73   68.36   61.46    0.65
  1    1200        349.57   2309.44   67.29   74.91   61.07    0.67
  1    1400        502.90   2330.11   66.77   66.74   66.79    0.67
  1    1600        510.11   2685.65   69.99   77.67   63.69    0.7

In [15]:
nlp = spacy.load('./output/model-best')

In [16]:
!python -m spacy evaluate ./output/model-best ./test.spacy

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   75.99 
NER R   64.55 
NER F   69.81 
SPEED   17171 

[1m

           P       R       F
PER    61.80   53.03   57.08
ORG    76.05   66.70   71.07
LOC    84.33   75.45   79.64
MISC   68.30   43.47   53.12



In [20]:
text = "trump usa Germany hi John ."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

John PER
