In [3]:
import re
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [52]:
data_file = "../data/bio/fr.bio/fr.sentences.bio"
# BIO tag -> integer id. Not referenced by the parsing below; kept because
# other cells may rely on it.
tag_mapping = {
    "O": 0,
    "B-LOC-DEP": 1,
    "B-LOC-ARR": 2,
    "I-LOC-DEP": 3,
    "I-LOC-ARR": 4
}


def _parse_bio_sentence(sentence):
    """Parse the ``word tag`` lines of a single sentence.

    Args:
        sentence: Newline-separated ``word tag`` lines for one sentence.
            Blank / whitespace-only lines are ignored.

    Returns:
        A ``(words, tags, sentence_spans)`` tuple where ``sentence_spans`` is
        a list of ``(start_char, end_char, tag)`` for every token whose tag
        is not ``"O"``. Offsets index into ``" ".join(words)``.

    Raises:
        ValueError: If a non-blank line does not contain exactly one word
            and one tag.
    """
    words = []
    tags = []
    sentence_spans = []
    # Character position where the next word starts in " ".join(words).
    # Tracking it incrementally avoids re-joining the word list per token.
    cursor = 0

    for pair in sentence.split("\n"):
        parts = pair.split()
        if not parts:
            continue
        if len(parts) != 2:
            raise ValueError(
                f"Malformed BIO line (expected 'word tag'): {pair!r}"
            )
        word, tag = parts

        # NOTE(review): every tagged token becomes its own span labelled with
        # the raw BIO tag (B-... / I-...), so a multi-token place name yields
        # several entities. Merging B-/I- runs would change the label set the
        # model is trained on, so it is intentionally left unchanged here.
        if tag != "O":
            sentence_spans.append((cursor, cursor + len(word), tag))

        words.append(word)
        tags.append(tag)
        cursor += len(word) + 1  # +1 for the space inserted by " ".join

    return words, tags, sentence_spans


# Explicit encoding so accented characters decode the same on every platform
# (assumes the corpus is stored as UTF-8 -- TODO confirm).
with open(data_file, encoding="utf-8") as f:
    data = f.read()

tokens = []
ner_tags = []
spans = []
text = []
# A sentence ends on the newline following a "<punct> O" line.
sentences = re.split(r'(?<=[.!?] O)\n', data)

for sentence in sentences:
    words, tags, this_sentence_spans = _parse_bio_sentence(sentence)

    # The split leaves an empty chunk after the final sentence (and for any
    # blank region); skip it so no empty sentence reaches the later cells.
    if not words:
        continue

    text.append(" ".join(words))
    tokens.append(words)
    ner_tags.append(tags)
    spans.append(this_sentence_spans)

In [20]:
import spacy

# Pretrained French pipeline, loaded by package name and bound to `nlp`.
BASE_PIPELINE_NAME = "fr_core_news_sm"
nlp = spacy.load(BASE_PIPELINE_NAME)

In [54]:
# Build one spaCy Doc per sentence, attach the gold entity spans, then split
# the Docs into train / test sets.
data = []
unaligned_span_count = 0

# nlp.pipe processes the texts in batches and yields Docs in input order, so
# each Doc can be paired with its span list via zip. `text` and `spans` are
# expected to be parallel lists.
for doc, sentence_spans in zip(nlp.pipe(text), spans):
    ents = []
    for start, end, label in sentence_spans:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        # char_span returns None when the character range cannot be mapped to
        # whole tokens; assigning None into doc.ents would raise, so skip it.
        if span is None:
            unaligned_span_count += 1
            continue
        ents.append(span)

    # Replaces whatever entities the loaded pipeline predicted for this Doc.
    doc.ents = ents
    data.append(doc)

if unaligned_span_count:
    print(f"Skipped {unaligned_span_count} span(s) that did not align to token boundaries")

# Fixed random_state keeps the 80/20 split reproducible across runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [55]:
# Pack each split into a DocBin (Docs are added in list order) and write
# both files to the current working directory.
train_db = DocBin(docs=train_data)
test_db = DocBin(docs=test_data)

train_db.to_disk("tor_ner_train.spacy")
test_db.to_disk("tor_ner_test.spacy")

In [62]:
# Run spaCy training from the config file; results are written under the
# `spacy` output directory.
train(
    "spacy_config_ner.cfg",
    output_path="spacy",
)

✔ Created output directory: spacy
ℹ Saving to output directory: spacy
ℹ Using CPU

✔ Initialized pipeline

ℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------  ------  ------  ------
  0       0          0.00        78.14       269.21     56.22    59.13    12.69    10.28     0.49    0.00    0.00    0.00    0.23
  0     200        491.26      4051.27      7550.68   2599.34    91.04    88.04    82.20    90.48   90.41   90.47   90.35    0.89
  0     400        547.35      3478.36      5221.82    635.04    92.57    87.33    82.54    94.13   86.93   87.12   86.74    0.88
  0     600        860.68      3656.24      5376.61    993.76    93.57    88.3

In [73]:
# Score the trained pipeline on the held-out DocBin; the returned metrics
# are the cell's displayed value.
# NOTE(review): this evaluates the `model-last` checkpoint -- confirm that
# this, rather than a best-scoring checkpoint, is the one intended.
evaluate(
    "spacy/model-last",
    "tor_ner_test.spacy",
)

{'token_acc': 0.9850639263950293,
 'token_p': 0.7914027960329788,
 'token_r': 0.9159838191059019,
 'token_f': 0.8491482235292231,
 'tag_acc': 0.9593681439227731,
 'sents_p': 0.9381294964028777,
 'sents_r': 0.9213377296278851,
 'sents_f': 0.9296577946768061,
 'dep_uas': 0.8940712205205299,
 'dep_las': 0.8661730246508577,
 'dep_las_per_type': {'nsubj': {'p': 0.9899598393574297,
   'r': 0.8257956448911222,
   'f': 0.9004566210045661},
  'root': {'p': 0.9549597855227882,
   'r': 0.8389072067828545,
   'f': 0.8931795386158475},
  'xcomp': {'p': 0.9415730337078652,
   'r': 0.8275181040157998,
   'f': 0.8808689558514367},
  'det': {'p': 0.9935248887090247,
   'r': 0.9250188394875659,
   'f': 0.9580487804878048},
  'obj': {'p': 0.9031890660592256,
   'r': 0.8786703601108034,
   'f': 0.8907610221847796},
  'case': {'p': 0.9796151647932939,
   'r': 0.9180503481521157,
   'f': 0.9478341013824885},
  'nmod': {'p': 0.8415692191625802,
   'r': 0.7925399644760213,
   'f': 0.8163190633004025},
  'obl:

In [74]:
# Quick check: load the fine-tuned pipeline and list the entities it finds
# in one sample request.
sample_request = "Je veux aller de Montpellier à Paris"

ft_nlp = spacy.load("spacy/model-last")
doc = ft_nlp(sample_request)

for ent in doc.ents:
    # One line per entity: surface text, label, start and end token index.
    fields = (ent.text, ent.label_, ent.start, ent.end)
    print(*fields)

Montpellier B-LOC-DEP 4 5
Paris B-LOC-ARR 6 7
