# Named-entity recognition with spaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle français de spaCy : `python -m spacy download fr_core_news_sm`

In [4]:
# Load the French spaCy pipeline once at module level; test() and search()
# below both call this shared `nlp` object.
nlp = spacy.load('fr_core_news_sm')

## Fonctions

In [7]:
def test():
    """Run NER over the imported example sentences and print the results.

    One line is printed per sentence: either the list of detected entities,
    each rendered as ``text (LABEL)``, or a note that none were found.
    """
    for sample in sentences:
        parsed = nlp(sample)
        found = [f"{span.text} ({span.label_})" for span in parsed.ents]
        if not found:
            print(f"'{parsed.text}' contains no entities")
            continue
        listing = ', '.join(found)
        print(f"'{parsed.text}' contains the following entities: {listing}")

In [18]:
def search(n=1000000, path="../module1/data/all.txt"):
    """Print the ten most frequent person names found at the start of the corpus.

    The first ``n`` characters of the corpus file are run through the shared
    ``nlp`` pipeline; entities labelled ``PER`` whose text is longer than three
    characters are counted, and the ten most frequent are printed.

    Args:
        n: Maximum number of characters to read from the beginning of the
            file. ``None`` reads the whole file.
        path: Path of the latin-1 encoded corpus file. Defaults to the
            location this function previously hard-coded.
    """
    # The context manager guarantees the file handle is closed, and read(n)
    # avoids loading the entire corpus into memory just to slice off a prefix.
    with open(path, encoding='latin-1') as corpus:
        text = corpus.read(n)
    doc = nlp(text)
    people = defaultdict(int)
    for ent in doc.ents:
        # Only person entities with more than 3 characters are counted.
        if ent.label_ == "PER" and len(ent.text) > 3:
            people[ent.text] += 1
    # sorted() is stable, so names with equal counts keep first-seen order.
    sorted_people = sorted(people.items(), key=lambda kv: kv[1], reverse=True)
    for person, freq in sorted_people[:10]:
        print(f"{person} appears {freq} times in the corpus")

## NER sur des données de test

In [9]:
test()

'Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars' contains the following entities: Apple (ORG)
'Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs' contains no entities
'San Francisco envisage d'interdire les robots coursiers sur les trottoirs' contains the following entities: San Francisco (LOC)
'Londres est une grande ville du Royaume-Uni' contains the following entities: Londres (LOC), Royaume-Uni (LOC)
'L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe' contains the following entities: ArcelorMittal (MISC), Europe (LOC)
'Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon' contains the following entities: Apple (ORG), HomePod (MISC), Echo (MISC)
'La France ne devrait pas manquer d'électricité cet été, même en cas de canicule' contains the following entities: La France (LOC)
'Nouvelles attaques de Trump contre le maire de Londres' contains the following entities: Trump (LOC), Lo

## NER sur le corpus des bulletins communaux

In [19]:
search(n=10000)

M. Bortier appears 2 times in the corpus
H. B appears 1 times in the corpus
Discussion appears 1 times in the corpus
Adolphe Bartels appears 1 times in the corpus
Dépôt appears 1 times in the corpus
Hospices appears 1 times in the corpus
M. Partoes appears 1 times in the corpus
É V A L U A T I O N DES T appears 1 times in the corpus
Total Solde appears 1 times in the corpus
Bortier appears 1 times in the corpus
