# Named-entity recognition with spaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle français de spaCy utilisé dans ce notebook : `python -m spacy download fr_core_news_sm`

In [2]:
# Load the `fr_core_news_sm` French pipeline once; the functions defined
# below read this module-level `nlp` object to process text.
nlp = spacy.load('fr_core_news_sm')

## Fonctions

In [3]:
def test():
    """Run NER on the French example sentences shipped with spaCy.

    For each sentence imported from `spacy.lang.fr.examples`, prints the
    sentence text followed by either the entities detected in it (text and
    label) or a note that none were found.
    """
    for sentence in sentences:
        parsed = nlp(sentence)
        # Render each detected entity as "text (LABEL)".
        labelled = [f"{ent.text} ({ent.label_})" for ent in parsed.ents]
        if not labelled:
            print(f"'{parsed.text}' contains no entities")
            continue
        print(f"'{parsed.text}' contains the following entities: {', '.join(labelled)}")

In [4]:
def searchPeople(n=1000000):
    """Print the 20 most frequent person entities found in the corpus.

    Reads at most the first `n` characters of "1909.txt" (UTF-8), runs the
    module-level spaCy pipeline `nlp` over them, and counts every entity
    labelled "PER" whose text is longer than 3 characters.

    Args:
        n: Maximum number of characters of the corpus file to analyse
            (expected to be non-negative).

    Prints:
        One line per entity, most frequent first, at most 20 lines.
    """
    # The context manager guarantees the file handle is closed; read(n)
    # avoids loading the whole file just to discard everything past n.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read(n)
    doc = nlp(text)
    people = defaultdict(int)
    for ent in doc.ents:
        # Very short matches (3 characters or fewer) are ignored.
        if ent.label_ == "PER" and len(ent.text) > 3:
            people[ent.text] += 1
    sorted_people = sorted(people.items(), key=lambda kv: kv[1], reverse=True)
    for person, freq in sorted_people[:20]:
        print(f"{person} appears {freq} times in the corpus")

In [5]:
def searchLocation(n=1000000):
    """Print the 20 most frequent location entities found in the corpus.

    Reads at most the first `n` characters of "1909.txt" (UTF-8), runs the
    module-level spaCy pipeline `nlp` over them, and counts every entity
    labelled "LOC" whose text is longer than 3 characters.

    Args:
        n: Maximum number of characters of the corpus file to analyse
            (expected to be non-negative).

    Prints:
        One line per entity, most frequent first, at most 20 lines.
    """
    # The context manager guarantees the file handle is closed; read(n)
    # avoids loading the whole file just to discard everything past n.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read(n)
    doc = nlp(text)
    locations = defaultdict(int)
    for ent in doc.ents:
        # Very short matches (3 characters or fewer) are ignored.
        if ent.label_ == "LOC" and len(ent.text) > 3:
            locations[ent.text] += 1
    sorted_locations = sorted(locations.items(), key=lambda kv: kv[1], reverse=True)
    # Distinct loop name so the counting dict is not rebound while printing.
    for place, freq in sorted_locations[:20]:
        print(f"{place}, a location, appears {freq} times in the corpus")

In [6]:
def searchOrganisation(n=1000000):
    """Print the 20 most frequent organisation entities found in the corpus.

    Reads at most the first `n` characters of "1909.txt" (UTF-8), runs the
    module-level spaCy pipeline `nlp` over them, and counts every entity
    labelled "ORG" whose text is longer than 3 characters.

    Args:
        n: Maximum number of characters of the corpus file to analyse
            (expected to be non-negative).

    Prints:
        One line per entity, most frequent first, at most 20 lines.
    """
    # The context manager guarantees the file handle is closed; read(n)
    # avoids loading the whole file just to discard everything past n.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read(n)
    doc = nlp(text)
    organisations = defaultdict(int)
    for ent in doc.ents:
        # Very short matches (3 characters or fewer) are ignored.
        if ent.label_ == "ORG" and len(ent.text) > 3:
            organisations[ent.text] += 1
    sorted_organisations = sorted(organisations.items(), key=lambda kv: kv[1], reverse=True)
    for org, freq in sorted_organisations[:20]:
        print(f"{org}, an organisation, appears {freq} times in the corpus")

## NER sur des données de test

In [7]:
# Run the pipeline on the bundled example sentences and print the entities found.
test()

'Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars' contains the following entities: Apple (ORG)
'Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs' contains no entities
'San Francisco envisage d'interdire les robots coursiers sur les trottoirs' contains the following entities: San Francisco (LOC)
'Londres est une grande ville du Royaume-Uni' contains the following entities: Londres (LOC), Royaume-Uni (LOC)
'L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe' contains the following entities: ArcelorMittal (MISC), Europe (LOC)
'Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon' contains the following entities: Apple (ORG), HomePod (MISC), Echo (MISC)
'La France ne devrait pas manquer d'électricité cet été, même en cas de canicule' contains the following entities: La France (LOC)
'Nouvelles attaques de Trump contre le maire de Londres' contains the following entities: Trump (LOC), Lo

## NER sur le corpus des bulletins communaux

In [8]:
# Top 20 "PER" entities in the first 1,000,000 characters of the corpus file
# (n is passed explicitly but equals the function's default).
searchPeople(n=1000000)

Messieurs appears 112 times in the corpus
Subside appears 69 times in the corpus
Camille H appears 56 times in the corpus
Totaux appears 37 times in the corpus
Bauwens appears 35 times in the corpus
Hospices appears 25 times in the corpus
Camille Huysmans appears 17 times in the corpus
M. le Bourgmestre appears 17 times in the corpus
Anspach appears 15 times in the corpus
Puissant appears 13 times in the corpus
Renvoi appears 13 times in the corpus
Recettes appears 13 times in the corpus
Maes appears 11 times in the corpus
Echevin Lemonnier appears 10 times in the corpus
Dassonville appears 10 times in the corpus
Fabrique appears 10 times in the corpus
Theodor appears 9 times in the corpus
Van den Nest appears 9 times in the corpus
Tuberculose appears 9 times in the corpus
Fontaine appears 8 times in the corpus


In [9]:
# Top 20 "LOC" entities in the first 1,000,000 characters of the corpus file
# (n is passed explicitly but equals the function's default).
searchLocation(n=1000000)

Bruxelles, a location, appears 159 times in the corpus
Collège, a location, appears 118 times in the corpus
Bourgmestre, a location, appears 69 times in the corpus
Ville, a location, appears 68 times in the corpus
Etat, a location, appears 57 times in the corpus
E c h, a location, appears 41 times in the corpus
Frais, a location, appears 30 times in the corpus
la Ville, a location, appears 29 times in the corpus
Wauwermans, a location, appears 24 times in the corpus
Bruxellois, a location, appears 23 times in the corpus
Subsides, a location, appears 23 times in the corpus
Gouvernement, a location, appears 21 times in the corpus
Bourse, a location, appears 21 times in the corpus
Province, a location, appears 21 times in the corpus
Bauwens, a location, appears 20 times in the corpus
Total, a location, appears 19 times in the corpus
Rires, a location, appears 18 times in the corpus
Masculins, a location, appears 18 times in the corpus
Conrardy, a location, appears 17 times in the corpus
E

In [10]:
# Top 20 "ORG" entities in the first 1,000,000 characters of the corpus file
# (n is passed explicitly but equals the function's default).
searchOrganisation(n=1000000)

Conseil, an organisation, appears 163 times in the corpus
Section, an organisation, appears 43 times in the corpus
Conseil général des hospices, an organisation, appears 22 times in the corpus
Collège, an organisation, appears 19 times in the corpus
MOINS, an organisation, appears 11 times in the corpus
Conseil c, an organisation, appears 10 times in the corpus
Conseil communal, an organisation, appears 8 times in the corpus
Collège vous, an organisation, appears 8 times in the corpus
Caisse communale, an organisation, appears 7 times in the corpus
NATURE, an organisation, appears 7 times in the corpus
Echevin M, an organisation, appears 7 times in the corpus
ACTE, an organisation, appears 7 times in the corpus
PLUS, an organisation, appears 7 times in the corpus
Société, an organisation, appears 6 times in the corpus
Administration, an organisation, appears 6 times in the corpus
Comité, an organisation, appears 6 times in the corpus
MESSIEURS, an organisation, appears 5 times in the c