# Named-entity recognition with spaCy

In [1]:
from collections import defaultdict
import sys

import spacy
from spacy.lang.fr.examples import sentences

Pour installer le modèle spaCy pour le français : `python -m spacy download fr_core_news_sm`

In [2]:
# Load the spaCy pipeline named 'fr_core_news_sm'; the functions below call it as `nlp`.
nlp = spacy.load('fr_core_news_sm')

## Fonctions

In [3]:
def test():
    """Run the pipeline on spaCy's French example sentences and print what it finds.

    For every sentence, prints either the list of detected entities
    (formatted as ``text (LABEL)``) or a message saying none were found.
    """
    for sample in sentences:
        parsed = nlp(sample)
        labelled = [f"{found.text} ({found.label_})" for found in parsed.ents]
        # Guard clause: report the empty case first, then move on.
        if not labelled:
            print(f"'{parsed.text}' contains no entities")
            continue
        print(f"'{parsed.text}' contains the following entities: {', '.join(labelled)}")

In [4]:
def searchPeople(n=1000000):
    """Print the 20 most frequent person entities found in the corpus.

    Reads "1909.txt" (UTF-8), keeps its first ``n`` characters, runs the
    module-level ``nlp`` pipeline on them and counts every entity labelled
    "PER" whose text is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.

    Returns:
        None. One line per entity is printed, most frequent first.

    Raises:
        OSError: If "1909.txt" cannot be opened.
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    people = defaultdict(int)
    for ent in doc.ents:
        # Only spans longer than 3 characters are counted
        # (presumably to drop noisy fragments -- confirm).
        if ent.label_ == "PER" and len(ent.text) > 3:
            people[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_people = sorted(people.items(), key=lambda kv: kv[1], reverse=True)
    for person, freq in sorted_people[:20]:
        print(f"{person} appears {freq} times in the corpus")

In [8]:
def searchLocation(n=1000000):
    """Print the 20 most frequent location entities found in the corpus.

    Reads "1909.txt" (UTF-8), keeps its first ``n`` characters, runs the
    module-level ``nlp`` pipeline on them and counts every entity labelled
    "LOC" whose text is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.

    Returns:
        None. One line per entity is printed, most frequent first.

    Raises:
        OSError: If "1909.txt" cannot be opened.
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    location = defaultdict(int)
    for ent in doc.ents:
        # Only spans longer than 3 characters are counted
        # (presumably to drop noisy fragments -- confirm).
        if ent.label_ == "LOC" and len(ent.text) > 3:
            location[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_location = sorted(location.items(), key=lambda kv: kv[1], reverse=True)
    # Use a distinct loop name so the `location` counter is not rebound to a string.
    for place, freq in sorted_location[:20]:
        print(f"{place}, a location, appears {freq} times in the corpus")

In [10]:
def searchOrganisation(n=1000000):
    """Print the 20 most frequent organisation entities found in the corpus.

    Reads "1909.txt" (UTF-8), keeps its first ``n`` characters, runs the
    module-level ``nlp`` pipeline on them and counts every entity labelled
    "ORG" whose text is longer than 3 characters.

    Args:
        n: Number of leading characters of the file to analyse.

    Returns:
        None. One line per entity is printed, most frequent first.

    Raises:
        OSError: If "1909.txt" cannot be opened.
    """
    # Context manager guarantees the file handle is closed, even on error.
    with open("1909.txt", encoding='utf-8') as corpus:
        text = corpus.read()[:n]
    doc = nlp(text)
    organisation = defaultdict(int)
    for ent in doc.ents:
        # Only spans longer than 3 characters are counted
        # (presumably to drop noisy fragments -- confirm).
        if ent.label_ == "ORG" and len(ent.text) > 3:
            organisation[ent.text] += 1
    # sorted() is stable, so entities with equal counts keep first-seen order.
    sorted_organisation = sorted(organisation.items(), key=lambda kv: kv[1], reverse=True)
    for org, freq in sorted_organisation[:20]:
        print(f"{org}, an organisation, appears {freq} times in the corpus")

## NER sur des données de test

In [17]:
# Print the entities detected in each of the imported example sentences.
test()

'Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars' contains the following entities: Apple (ORG)
'Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs' contains no entities
'San Francisco envisage d'interdire les robots coursiers sur les trottoirs' contains the following entities: San Francisco (LOC)
'Londres est une grande ville du Royaume-Uni' contains the following entities: Londres (LOC), Royaume-Uni (LOC)
'L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe' contains the following entities: ArcelorMittal (MISC), Europe (LOC)
'Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon' contains the following entities: Apple (ORG), HomePod (MISC), Echo (MISC)
'La France ne devrait pas manquer d'électricité cet été, même en cas de canicule' contains the following entities: La France (LOC)
'Nouvelles attaques de Trump contre le maire de Londres' contains the following entities: Trump (LOC), Lo

## NER sur le corpus des bulletins communaux

In [6]:
# Restrict the analysis to the first 100000 characters of "1909.txt".
searchPeople(n=100000)

Bauwens appears 12 times in the corpus
Messieurs appears 10 times in the corpus
Camille H appears 10 times in the corpus
Echevin Lemonnier appears 9 times in the corpus
Anspach appears 3 times in the corpus
Camille Huysmans appears 3 times in the corpus
Renvoi appears 3 times in the corpus
Madou appears 3 times in the corpus
Jamaer appears 2 times in the corpus
C ' é t appears 2 times in the corpus


In [9]:
# Restrict the analysis to the first 100000 characters of "1909.txt".
searchLocation(n=100000)

Collège, a location, appears 20 times in the corpus
Bourse, a location, appears 14 times in the corpus
E c h, a location, appears 12 times in the corpus
Bourgmestre, a location, appears 12 times in the corpus
place M, a location, appears 12 times in the corpus
Bruxellois, a location, appears 9 times in the corpus
Dillens, a location, appears 7 times in the corpus
Conrardy, a location, appears 7 times in the corpus
gare M, a location, appears 7 times in the corpus
Bruxelles, a location, appears 6 times in the corpus
Echevin M, a location, appears 6 times in the corpus
Bauwens, a location, appears 5 times in the corpus
Echevin L, a location, appears 5 times in the corpus
Gouvernement, a location, appears 4 times in the corpus
Hanssens, a location, appears 4 times in the corpus
place R, a location, appears 4 times in the corpus
Senne, a location, appears 4 times in the corpus
place S, a location, appears 3 times in the corpus
la Ville, a location, appears 3 times in the corpus
rue H, a lo

In [None]:
# Analyse the first 1000000 characters of "1909.txt" (same value as the function's default n).
searchOrganisation(n=1000000)