# Geoparsing


The goal of this notebook is to apply geoparsing to the corpus : 
- build a referential for continents and countries in French
- retrive LOC entites from a text
- implement functions to identify continents, countries and cities from a list of LOC entities
- structure the set of entities as follows : continents/countries/regions/cities /miscellaneous.

__Imports__

In [1]:
import geonamescache
import geopy
import pandas as pd

import spacy
try: 
    print("fr_core_news_sm loaded")
    nlp = spacy.load("fr_core_news_sm") # load pre-trained models for French
except:
    print("fr loaded")
    nlp=spacy.load('fr') # fr calls fr_core_news_sm 
    


fr_core_news_sm loaded


In [2]:
# data
news_df=pd.read_csv("labeled_articles_clean.csv")

news_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,year,title,text,url,geo,topic,clean_text,clean_title,pre_title,pre_text,geo_code,topic_code
0,0,0,1988,Tintin dans l'espace,Trois semaines à bord de la station soviétique...,https://www.lexpress.fr/informations/tintin-da...,fr,sc,semain bord station soviet jean-loup chrétien ...,tintin espac,tintin espac,semain bord station soviet jean-loup chrétien ...,3,5
1,1,1,1988,Le faux suicide de Robert Boulin,1979 : son corps est découvert en forêt de Ram...,https://www.lexpress.fr/actualite/politique/le...,fr,ju,corp découvert forêt rambouillet fauss pist né...,faux suicid robert boulin,faux suicid robert boulin,corp découvert forêt rambouillet fauss pist né...,3,2
2,2,2,1988,Des pierres contre les certitudes,"Rideaux de fer baissés, silhouettes furtives, ...",https://www.lexpress.fr/actualite/monde/proche...,me,po,rideau baiss silhouet furtiv jérusalem arab to...,pierr contr certitud,pierr contr certitud,rideau baiss silhouet furtiv jérusalem arab to...,5,4
3,3,3,1988,"Otages: soudain, mercredi soir...",""" Je lui ai dit: ""Ça suffit"", et j'ai raccroch...",https://www.lexpress.fr/informations/otages-so...,me,ju,suff raccroch trop souvent échaud trop souvent...,otag soudain mercred soir,otag soudain mercred soir,suff raccroch trop souvent échaud trop souvent...,5,2
4,4,4,1988,Les secrets de la planète rouge,"S'il existe, dans le système solaire, un seul ...",https://www.lexpress.fr/actualite/sciences/les...,spa,sc,exist system solair seul endroit exobiolog dés...,secret planet roug,secret planet roug,exist system solair seul endroit exobiolog dés...,6,5


In [3]:
# data from opendata.gouv

country_df = pd.read_csv("./liste_197_etats_2020.csv", encoding = "ISO-8859-1", delimiter=';')
country_df['ARTICLE'].fillna("", inplace = True) 
country_df.NOM = country_df.NOM.apply(lambda x : x.lower()) 
country_df.NOM_ALPHA = country_df.NOM.apply(lambda x : x.lower()) 
country_df.NOM_LONG = country_df.NOM.apply(lambda x : x.lower()) 
country_df.head(10)

Unnamed: 0,NOM,NOM_ALPHA,CODE,ARTICLE,NOM_LONG,CAPITALE
0,afghanistan,afghanistan,AFG,l',afghanistan,Kaboul
1,afrique du sud,afrique du sud,ZAF,l',afrique du sud,Prétoria
2,albanie,albanie,ALB,l',albanie,Tirana
3,algérie,algérie,DZA,l',algérie,Alger
4,allemagne,allemagne,DEU,l',allemagne,Berlin
5,andorre,andorre,AND,l',andorre,Andorre-la-Vieille
6,angola,angola,AGO,l',angola,Luanda
7,antigua-et-barbuda,antigua-et-barbuda,ATG,,antigua-et-barbuda,Saint John's
8,arabie saoudite,arabie saoudite,SAU,l',arabie saoudite,Riyad
9,argentine,argentine,ARG,l',argentine,Buenos Aires


## 1. Build tools

### 1.1 Build Continent and Country lists in French

using geonamescache

In [4]:
gc = geonamescache.GeonamesCache()

In [5]:
# Basic examples for getting started

continents = gc.get_continents()
print(continents.keys())
print(continents['EU'])

countries = gc.get_countries()
print(countries.keys())
print(countries['GB'])


dict_keys(['AF', 'AS', 'EU', 'NA', 'OC', 'SA', 'AN'])
{'lng': '9.14062', 'geonameId': 6255148, 'timezone': {'gmtOffset': 1, 'timeZoneId': 'Europe/Vaduz', 'dstOffset': 2}, 'bbox': {'east': 41.73303985595703, 'south': 27.6377894797159, 'north': 80.76416015625, 'west': -24.532675386662543, 'accuracyLevel': 0}, 'toponymName': 'Europe', 'asciiName': 'Europe', 'astergdem': 439, 'fcl': 'L', 'population': 741000000, 'wikipediaURL': 'en.wikipedia.org/wiki/Europe', 'adminName5': '', 'srtm3': 443, 'adminName4': '', 'adminName3': '', 'alternateNames': [{'isPreferredName': True, 'name': '유럽', 'lang': 'ko'}, {'name': 'ยุโรป', 'lang': 'th'}, {'name': 'ヨーロッパ', 'lang': 'ja'}, {'name': 'an Eoraip', 'lang': 'ga'}, {'name': 'Avrupa', 'lang': 'tr'}, {'name': 'Châu Âu', 'lang': 'vi'}, {'name': 'Eiropa', 'lang': 'lv'}, {'isColloquial': True, 'name': 'El viejo continente', 'lang': 'es'}, {'name': 'Eropa', 'lang': 'id'}, {'name': 'Eurohpá', 'lang': 'se'}, {'isPreferredName': True, 'name': 'Euroopa', 'lang': 'e

In [6]:
# Build dictionnary
# keys = continent names in French
# values = dictionnaries with keys 'continantCode', 'country_codes', 'country_code_iso3', 'country_names'

my_continents_fr = dict() # continents.keys()
my_continent_codes = dict() # basic continent_code : french name

for continent_code in continents.keys():
    for dico in continents[continent_code]['alternateNames']:
        # get the french name ?
        if dico['lang']=='fr':
            name_fr = dico['name']
            cont_dico = {'continentCode':continent_code}
            my_continents_fr[name_fr]=cont_dico
            my_continent_codes[continent_code]=name_fr
            if name_fr!='Antarctique':
                my_continents_fr[name_fr]['country_codes']=continents[continent_code]['cc2'].split(',')
            else:
                my_continents_fr[name_fr]['country_codes']=[]

def add_article(article, nom):
    if article in ['le', 'la', 'les']:
        res = article+' '+nom
    elif article == "l'":
        res = article+nom
    else:
        res = nom
    return res

# map country name to unique country name (ex: la france --> france)

country_name_to_ref = dict()

for key in my_continents_fr.keys():
    my_continents_fr[key]['country_code_iso3']=[]
    my_continents_fr[key]['country_names']=[]
    set_noms_fr = set()
    for country_code in my_continents_fr[key]['country_codes']:
        iso3 = countries[country_code]['iso3']
        my_continents_fr[key]['country_code_iso3'].append(iso3)
        
        try: 
            nom = country_df[country_df['CODE']==iso3]['NOM'].tolist()[0]
            nom_alpha = country_df[country_df['CODE']==iso3]['NOM_ALPHA'].tolist()[0]
            nom_long = country_df[country_df['CODE']==iso3]['NOM_LONG'].tolist()[0]
            article = country_df[country_df['CODE']==iso3]['ARTICLE'].tolist()[0]
            nom_article = add_article(article,nom)
            nom_alpha_article = add_article(article,nom_alpha)
            set_noms_fr.add(nom)
            set_noms_fr.add(nom_alpha)
            set_noms_fr.add(nom_long)
            set_noms_fr.add(nom_article)
            set_noms_fr.add(nom_alpha_article)
            country_name_to_ref[nom]=nom
            country_name_to_ref[nom_alpha]=nom
            country_name_to_ref[nom_article]=nom
            country_name_to_ref[nom_article_alpha]=nom
            country_name_to_ref[nom_long]=nom
        except:
            #print(countries[country_code])
            pass
        set_noms_fr.add(countries[country_code]['name'])
            
    my_continents_fr[key]['country_names']=list(set_noms_fr)

my_continents_fr

{'Afrique': {'continentCode': 'AF',
  'country_codes': ['AO',
   'BF',
   'BI',
   'BJ',
   'BW',
   'CD',
   'CF',
   'CG',
   'CI',
   'CM',
   'CV',
   'DJ',
   'DZ',
   'EG',
   'ER',
   'ET',
   'GA',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GW',
   'KE',
   'KM',
   'LR',
   'LS',
   'LY',
   'MA',
   'MG',
   'ML',
   'MR',
   'MU',
   'MW',
   'MZ',
   'NA',
   'NE',
   'NG',
   'RE',
   'RW',
   'SC',
   'SD',
   'SH',
   'SL',
   'SN',
   'SO',
   'SS',
   'ST',
   'SZ',
   'TD',
   'TG',
   'TN',
   'TZ',
   'UG',
   'YT',
   'ZA',
   'ZM',
   'ZW'],
  'country_code_iso3': ['AGO',
   'BFA',
   'BDI',
   'BEN',
   'BWA',
   'COD',
   'CAF',
   'COG',
   'CIV',
   'CMR',
   'CPV',
   'DJI',
   'DZA',
   'EGY',
   'ERI',
   'ETH',
   'GAB',
   'GHA',
   'GMB',
   'GIN',
   'GNQ',
   'GNB',
   'KEN',
   'COM',
   'LBR',
   'LSO',
   'LBY',
   'MAR',
   'MDG',
   'MLI',
   'MRT',
   'MUS',
   'MWI',
   'MOZ',
   'NAM',
   'NER',
   'NGA',
   'REU',
   'RWA',
   'SYC',
   'SDN',
  

In [7]:
# map country name to continent
country_to_cont_dic = dict()

for cont in my_continents_fr.keys():
    for country_name in my_continents_fr[cont]['country_names']:
        country_to_cont_dic[country_name.lower()]=cont.lower()
        
print(country_to_cont_dic)

{'republic of the congo': 'afrique', 'gambia': 'afrique', 'mali': 'afrique', "côte d'ivoire": 'afrique', 'sierra leone': 'afrique', 'kénya': 'afrique', 'le lésotho': 'afrique', 'guinée': 'afrique', 'sao tome and principe': 'afrique', 'mayotte': 'afrique', 'cameroun': 'afrique', 'niger': 'afrique', 'malawi': 'afrique', 'rwanda': 'afrique', 'ivory coast': 'afrique', 'le soudan': 'afrique', 'maroc': 'afrique', 'togo': 'afrique', 'le ghana': 'afrique', 'ethiopia': 'afrique', 'le libéria': 'afrique', 'benin': 'afrique', 'la mauritanie': 'afrique', 'sénégal': 'afrique', 'liberia': 'afrique', 'le bénin': 'afrique', 'la gambie': 'afrique', 'les seychelles': 'afrique', 'somalia': 'afrique', 'la guinée': 'afrique', 'bénin': 'afrique', 'le sénégal': 'afrique', 'la zambie': 'afrique', 'sudan': 'afrique', 'tchad': 'afrique', 'mauritanie': 'afrique', 'south africa': 'afrique', 'le mozambique': 'afrique', 'gambie': 'afrique', "l'égypte": 'afrique', 'sao tomé-et-principe': 'afrique', 'zambia': 'afriqu

In [8]:
# example : country name to ref name
country_name_to_ref["la france"]

'france'

In [9]:
# Final lists for continents and countries

list_countries_fr = [] # contains country names in French and in English

for key in my_continents_fr.keys():
    list_countries_fr+=[string.lower() for string in my_continents_fr[key]['country_names']]

print(list_countries_fr)

list_continents_fr = [key.lower() for key in my_continents_fr.keys()] # continent names in French
print(list_continents_fr)

print(my_continent_codes) # basic dic : code : name

['republic of the congo', 'gambia', 'mali', "côte d'ivoire", 'sierra leone', 'kénya', 'le lésotho', 'guinée', 'sao tome and principe', 'mayotte', 'cameroun', 'niger', 'malawi', 'rwanda', 'ivory coast', 'le soudan', 'niger', 'maroc', 'togo', 'le ghana', 'ethiopia', 'le libéria', 'benin', 'la mauritanie', 'sénégal', 'liberia', 'le bénin', 'la gambie', 'les seychelles', 'somalia', 'la guinée', 'bénin', 'le sénégal', 'la zambie', 'sudan', 'tchad', 'mauritanie', 'south africa', 'le mozambique', 'gambie', "l'égypte", 'sao tomé-et-principe', 'zambia', 'eswatini', 'le burundi', 'madagascar', 'la tunisie', 'gabon', 'les comores', 'senegal', 'la sierra leone', 'le congo', 'eritrea', 'mozambique', 'la somalie', 'république démocratique du congo', 'central african republic', 'guinée équatoriale', 'tanzanie', 'le eswatini', 'morocco', 'cabo verde', 'rwanda', 'le cap-vert', 'ghana', 'tanzania', 'zimbabwe', 'la guinée-bissao', 'tunisie', 'lesotho', 'le botswana', 'afrique du sud', 'botswana', 'djibou

__Bonus : Referential for Planets, Satellites and Stars__

In [10]:
# import data
planet_df = pd.read_csv("./planets.csv")
star_df = pd.read_csv("./star_names.csv", delimiter = ';')
satellite_df = pd.read_csv("./satellites.csv")


In [11]:

planet_df.head()

Unnamed: 0,planet,mass,diameter,density,gravity,escape_velocity,rotation_period,length_of_day,distance_from_sun,perihelion,...,orbital_period,orbital_velocity,orbital_inclination,orbital_eccentricity,obliquity_to_orbit,mean_temperature,surface_pressure,number_of_moons,has_ring_system,has_global_magnetic_field
0,Mercury,0.33,4879,5427,3.7,4.3,1407.6,4222.6,57.9,46.0,...,88.0,47.4,7.0,0.205,0.034,167,0,0,No,Yes
1,Venus,4.87,12104,5243,8.9,10.4,-5832.5,2802.0,108.2,107.5,...,224.7,35.0,3.4,0.007,177.4,464,92,0,No,No
2,Earth,5.97,12756,5514,9.8,11.2,23.9,24.0,149.6,147.1,...,365.2,29.8,0.0,0.017,23.4,15,1,1,No,Yes
3,Mars,0.642,6792,3933,3.7,5.0,24.6,24.7,227.9,206.6,...,687.0,24.1,1.9,0.094,25.2,-65,0.01,2,No,No
4,Jupiter,1898.0,142984,1326,23.1,59.5,9.9,9.9,778.6,740.5,...,4331.0,13.1,1.3,0.049,3.1,-110,Unknown*,79,Yes,Yes


In [12]:
planet_names = planet_df.planet.apply(lambda x : x.lower()).tolist()

planet_names += ['mercure', 'terre', 'la terre', 'saturne', 'pluton'] # alright, pluto is not a real planet

print(planet_names)

['mercury', 'venus', 'earth', 'mars', 'jupiter', 'saturn', 'uranus', 'neptune', 'pluto', 'mercure', 'terre', 'la terre', 'saturne', 'pluton']


In [13]:
star_df.head()

Unnamed: 0,IAU Name,Designation,ID,Constellation,#,WDS J,V magnitude,RA (J2000),Dec (J2000),Approval Date
0,Brachium,HR 5603,σ,Lib,A,15041-2517,3.25,226.017567,-25.281961,2017-09-05
1,Capella,HR 1708,α,Aur,Aa,05167+4600,0.08,79.172328,45.997991,2016-06-30
2,Atria,HR 6217,α,TrA,-,16487-6902,1.91,252.166229,-69.027712,2016-07-20
3,Cebalrai,HR 6603,β,Oph,-,-,2.76,265.868136,4.5673,2016-08-21
4,Hatysa,HR 1899,ι,Ori,Aa,05354-0555,2.8,83.858258,-5.909901,2017-09-05


In [14]:
star_names = star_df['IAU Name'].apply(lambda x : x.lower()).tolist()
print(star_names[:10])

# note : we would need the constellation names in plain text too !

['brachium', 'capella', 'atria', 'cebalrai', 'hatysa', 'gomeisa', 'chara', 'denebola', 'enif', 'chalawan']


In [15]:
satellite_df.head() #groupby(['planet']).count()

Unnamed: 0,planet,name,gm,radius,density,magnitude,albedo
0,Earth,Moon,4902.801±0.001,1737.5±0.1,3.344±0.005,-12.74,0.12
1,Mars,Phobos,0.0007112±0.0000010,11.1±0.15,1.872±0.076,11.4±0.2,0.071±0.012
2,Mars,Deimos,0.0000985±0.0000024,6.2±0.18,1.471±0.166,12.45±0.05,0.068±0.007
3,Jupiter,Io,5959.916±0.012,1821.6±0.5,3.528±0.006,5.02±0.03,0.63±0.02
4,Jupiter,Europa,3202.739±0.009,1560.8±0.5,3.013±0.005,5.29±0.02,0.67±0.03


We would like to have a hierarchical approch (like system > constellation / planet > satellites etc.) but in our first iteration we are going to use just a basic spatial vocabulary in one list. Adaptation of reference zones : "space" is added amongst continents.

In [16]:
satellite_names = satellite_df.name.apply(lambda x : x.lower()).tolist()
print(satellite_names[:10])

['moon', 'phobos', 'deimos', 'io', 'europa', 'ganymede', 'callisto', 'amalthea', 'himalia', 'elara']


In [17]:
my_space_voc = satellite_names + planet_names + star_names
print("Number of words in ref list: ", len(my_space_voc))

Number of words in ref list:  521


__Utility function for zones__

Continents + space

In [43]:
# utility function : from a list of entities, look for continent names and country names
# store them in a dictionnary

my_ref_zones = ['afrique', 'l\'afrique', 'asie', 'l\'asie', 'europe', 'l\'europe', 
                'l\'amérique du nord', 'amérique du nord', 'amérique', 'l\'amérique', 
                'océanie', 'l\'océanie', 'l\'amérique du sud', 'l\'amérique latine', 'l\'antarctique',
                'amérique du sud', 'amérique latine', 'antarctique', 'moyen-orient', 'le moyen-orient', "l'espace"]

cont_names_to_ref = {'afrique':'afrique', 
                     'l\'afrique':'afrique', 
                     'asie':'asie', 
                     'l\'asie':'asie', 
                     'europe':'europe', 
                     'l\'europe':'europe', 
                     'l\'amérique du nord':'amérique du nord', 
                     'amérique du nord':'amérique du nord', 
                     'amérique':'amérique', 
                     'l\'amérique':'amérique', 
                     'océanie':'océanie', 
                     'l\'océanie':'océanie', 
                     'l\'amérique du sud':'amérique du sud', 
                     'l\'amérique latine':'amérique latine', 
                     'l\'antarctique':'antarctique',
                     'amérique du sud':'amérique du sud', 
                     'amérique latine':'amérique latine', 
                     'antarctique':'antarctique', 
                     'moyen-orient':'moyen-orient', 
                     'le moyen-orient':'moyen-orient', 
                     "l'espace":"espace", 
                     "espace":"espace"}

def continent_info(input_list, ref = my_ref_zones):
    """
    not used
    """
    geo_dic={key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}
    for string in set(input_list):
        if string in ref:
            geo_dic['cont'].append(string)
    return geo_dic

def country_info(input_list, ref = list_countries_fr):
    """
    not_used
    """    
    geo_dic={key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}
    for string in set(input_list):
        if string in ref:
            geo_dic['country'].append(string)
        else:
            pass
    return geo_dic


#print(continent_info(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
#print(continent_info(['afrique', 'asie', 'afrique']))
#print(country_info(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
#print(country_info(['afrique', 'asie', 'afrique']))


def continent_only(input_list, ref = my_ref_zones, ref_space = my_space_voc):
    cont_list=set()
    for string in set(input_list):
        if string in ref:
            cont_list.add(cont_names_to_ref[string])
        elif string in ref_space:
            cont_list.add("espace")
            cont_list.add(string)
        else:
            pass
    return cont_list

def country_only(input_list, ref = list_countries_fr):
    country_list=set()
    for string in set(input_list):
        if string in ref:
            try : 
                country_list.add(country_name_to_ref[string])
            except: 
                country_list.add(string)
        else:
            pass
    return country_list

print(continent_only(['israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'terre', 'jordanie', 'liban', 'beyrouth', 'syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(continent_only(['mars', 'l\'afrique', 'afrique', 'asie', 'afrique', 'amérique latine']))
print(country_only(['nouvelle-zélande', 'israël', 'genève', 'new york', 'abou ammar', 'alger', 'israël', 'aviv', 'koweït', 'palestine', 'la terre', 'jordanie', 'liban', 'beyrouth', 'la syrie', 'israël', 'liban', 'damas', 'jérusalem', 'syrie', 'beyrouth', 'tunis', 'egypte', 'israël', 'syrie', 'damas', 'tripoli', 'iran', 'jordanie', 'egypte', 'palestine', 'israël', 'genève', 'israël', 'rusé', 'habile', 'new york', 'kenya', 'amérique du sud', 'genève', 'israël', 'new york', 'jérusalem', 'diplomatique', 'washington']))
print(country_only(['l\'afrique', 'asie', 'afrique', 'amérique latine']))


{'amérique du sud', 'terre', 'espace'}
{'espace', 'amérique latine', 'afrique', 'asie', 'mars'}
{'jordanie', 'koweït', 'nouvelle-zélande', 'iran', 'kenya', 'syrie', 'liban', 'israël'}
set()


### 1.2 Write utility functions to retrieve info about cities

using geopy

In [19]:
from geonamescache.mappers import country

iso_to_cont_mapper = country(from_key='iso', to_key='continentcode')

# test
print(my_continent_codes[iso_to_cont_mapper('DE')]) 

print(countries['DE'])

Europe
{'geonameid': 2921044, 'name': 'Germany', 'iso': 'DE', 'iso3': 'DEU', 'isonumeric': 276, 'fips': 'GM', 'continentcode': 'EU', 'capital': 'Berlin', 'areakm2': 357021, 'population': 81802257, 'tld': '.de', 'currencycode': 'EUR', 'currencyname': 'Euro', 'phone': '49', 'postalcoderegex': '^(\\d{5})$', 'languages': 'de', 'neighbours': 'CH,PL,NL,DK,BE,CZ,LU,FR,AT'}


In [20]:
def city_info(city_list):
    from geopy.geocoders import Nominatim
    from geopy.exc import GeocoderTimedOut    
    geopy.geocoders.options.default_user_agent = "my-application2"
    geolocator = Nominatim(timeout=2)

    loc_list=[]
    loc_dic = {key:[] for key in ['cont', 'country', 'country_code', 'state', 'city', 'misc']}

    for city in city_list : 
        print('\n', city)
        try:
            location = geolocator.geocode(city, addressdetails=True, language="fr")
            if location:
                print(location.latitude, location.longitude)
                loc_list.append(location)
                
                address = location.raw['address'] 
                print(address)
                
                if len(set(['shop', 'amenity', 'building', 'neighbourhood', 'leisure', 'hamlet', 'locality', 'isolated_dwelling'])&address.keys())>0:
                    pass     # discard the entity because of high probability of mistake           
                  
                elif 'city' in address.keys(): # city, or road in a city
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['misc'].append(address.get('road', '').lower() )  
                    loc_dic['city'].append(address.get('city', '').lower() )
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                elif 'town' in address.keys(): # town, or road in a town 
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['misc'].append(address.get('road', '').lower() )  
                    loc_dic['city'].append(address.get('town', '').lower()  )                                   
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                elif 'place' in address.keys():
                    loc_dic['state'].append(address.get('place', '').lower())
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'region' in address.keys():
                    loc_dic['misc'].append(address.get('road', '').lower() )  
                    loc_dic['state'].append(address.get('region', '').lower())
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'village' in address.keys():
                    loc_dic['misc'].append(address.get('road', '').lower() )  
                    loc_dic['misc'].append(address.get('tourism', '').lower() )
                    loc_dic['misc'].append(address.get('village', '').lower() )
                    loc_dic['city'].append(address.get('municipality', '').lower()  )
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                    
                elif 'waterway' in address.keys():
                    loc_dic['misc'].append(address.get('waterway', '').lower() )
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )                                    
                else : # name of a state already, or a "boundary", or a country
                    loc_dic['state'].append(address.get('state', '') .lower() )
                    loc_dic['state'].append(address.get('boundary', '') .lower() ) # region-like zone
                    loc_dic['country'].append(address.get('country', '').lower()  )
                    cc = address.get('country_code', '').upper()
                    loc_dic['country_code'].append(cc)
                    loc_dic['cont'].append(my_continent_codes[iso_to_cont_mapper(cc)].lower() )
                   
        except GeocoderTimedOut as e:
            print("Error: geocode failed on input %s with message %s" %(city, e))
            loc_dic['misc'].append(city.lower())

    loc_dic = {key : set([string for string in loc_dic[key] if len(string)>0]) for key in loc_dic.keys()}
    print('\n---- List of Locations ---')
    print(loc_list)
    print('\n---- Dictionnary ---')
    print(loc_dic)
    return loc_dic

city_list=['Verrières-le-Buisson', "Munich", "Palais de l'Elysée", "Reichstag", 'france', 'la seine', 'cisjordanie', 'bretagne', 'chabot', 'bourgogne', 'boulevard maillot', 
           'trac', 'valium', 'nezvran', 'franc-maçon', 'pr lebreton', 'moyen-orient', 
           'gaza', 'sympathie', 'schultz', 'la terre', 'genou']

test = city_info(city_list)
test


 Verrières-le-Buisson
48.7467819 2.2653844
{'town': 'Verrières-le-Buisson', 'municipality': 'Palaiseau', 'county': 'Essonne', 'state': 'Île-de-France', 'country': 'France', 'postcode': '91370', 'country_code': 'fr'}

 Munich
48.1371079 11.5753822
{'city': 'Munich', 'state': 'Bavière', 'country': 'Allemagne', 'country_code': 'de'}

 Palais de l'Elysée
48.87037435 2.316068734550804
{'tourism': "Palais de l'Élysée", 'road': 'Avenue de Marigny', 'neighbourhood': 'Quartier de la Madeleine', 'suburb': 'Paris 8e Arrondissement', 'city': 'Paris', 'municipality': 'Paris', 'county': 'Paris', 'state': 'Île-de-France', 'country': 'France', 'postcode': '75008', 'country_code': 'fr'}

 Reichstag
52.4676201 13.5280284
{'tourism': 'Reichstag', 'house_number': '81', 'road': 'An der Wuhlheide', 'suburb': 'Oberschöneweide', 'borough': 'Treptow-Köpenick', 'city': 'Berlin', 'district': 'Rixdorf', 'state': 'Berlin', 'postcode': '12459', 'country': 'Allemagne', 'country_code': 'de'}

 france
46.603354 1.888

{'cont': {'asie', 'europe'},
 'country': {'allemagne', 'france', 'palestinian territory'},
 'country_code': {'DE', 'FR', 'PS'},
 'state': {'bande de gaza',
  'bavière',
  'berlin',
  'bourgogne',
  'bretagne',
  'cisjordanie',
  'judea et samaria',
  'île-de-france'},
 'city': {'berlin', 'munich', 'neuilly-sur-seine', 'verrières-le-buisson'},
 'misc': {'an der wuhlheide', 'boulevard maillot', 'la seine', 'reichstag'}}

In [21]:
# summary of remaining issues 

city_info(["zorglub", "amérique latine", "mars", "soleil"])


 zorglub

 amérique latine

 mars
45.022172 4.3221548
{'village': 'Mars', 'municipality': 'Tournon-sur-Rhône', 'county': 'Ardèche', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '07320', 'country_code': 'fr'}

 soleil
46.1178584 5.0808015
{'hamlet': 'Le Soleil', 'village': 'Saint-André-le-Bouchoux', 'municipality': 'Bourg-en-Bresse', 'county': 'Ain', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'postcode': '01240', 'country_code': 'fr'}

---- List of Locations ---
[Location(Mars, Tournon-sur-Rhône, Ardèche, Auvergne-Rhône-Alpes, France métropolitaine, 07320, France, (45.022172, 4.3221548, 0.0)), Location(Le Soleil, Saint-André-le-Bouchoux, Bourg-en-Bresse, Ain, Auvergne-Rhône-Alpes, France métropolitaine, 01240, France, (46.1178584, 5.0808015, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'france'}, 'country_code': {'FR'}, 'state': {'auvergne-rhône-alpes'}, 'city': {'tournon-sur-rhône'}, 'misc': {'mars'}}


{'cont': {'europe'},
 'country': {'france'},
 'country_code': {'FR'},
 'state': {'auvergne-rhône-alpes'},
 'city': {'tournon-sur-rhône'},
 'misc': {'mars'}}

__Full pipeline__

For getting structured info about geographic locations from a list of extracted entities

1. continents
2. countries
3. cities on what remains --> higher level fields are added too
4. fusion of sets

In [22]:
def geo_info(entity_list, ref_cont = my_ref_zones, ref_countries = list_countries_fr): # to add : , **kwargs
    cont_set = continent_only(entity_list, ref=ref_cont)
    country_set = country_only(entity_list, ref=ref_countries)
    for c in country_set:
        cont_set.add(country_to_cont_dic[c])
    city_list = list(set(entity_list)-(cont_set|country_set))
    geo_dico = city_info(city_list)
    geo_dico['cont']=geo_dico['cont']|cont_set
    geo_dico['country']=geo_dico['country']|country_set
    return geo_dico

# test

test = ['amérique latine', 'australie', 'france', 'paris', 'reichstag', 'mars']

geo_info(test)


 reichstag
52.4676201 13.5280284
{'tourism': 'Reichstag', 'house_number': '81', 'road': 'An der Wuhlheide', 'suburb': 'Oberschöneweide', 'borough': 'Treptow-Köpenick', 'city': 'Berlin', 'district': 'Rixdorf', 'state': 'Berlin', 'postcode': '12459', 'country': 'Allemagne', 'country_code': 'de'}

 paris
48.8566969 2.3514616
{'city': 'Paris', 'municipality': 'Paris', 'county': 'Paris', 'state': 'Île-de-France', 'country': 'France', 'country_code': 'fr'}

---- List of Locations ---
[Location(Reichstag, 81, An der Wuhlheide, Oberschöneweide, Treptow-Köpenick, Berlin, Rixdorf, Berlin, 12459, Allemagne, (52.4676201, 13.5280284, 0.0)), Location(Paris, Île-de-France, France métropolitaine, France, (48.8566969, 2.3514616, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'allemagne', 'france'}, 'country_code': {'FR', 'DE'}, 'state': {'île-de-france', 'berlin'}, 'city': {'paris', 'berlin'}, 'misc': {'reichstag', 'an der wuhlheide'}}


{'cont': {'amérique latine', 'espace', 'europe', 'mars', 'océanie'},
 'country': {'allemagne', 'australie', 'france'},
 'country_code': {'DE', 'FR'},
 'state': {'berlin', 'île-de-france'},
 'city': {'berlin', 'paris'},
 'misc': {'an der wuhlheide', 'reichstag'}}

### 1.3 Entity Extraction

Using spacy 

In [23]:
def to_lower(input_text):
    if isinstance(input_text, pd.Series):
        output=input_text.apply(lambda x: x.lower())
    elif isinstance(input_text, list):
        output=[[w.lower() for w in L] for L in input_text]
    elif isinstance(input_text, str):
        output=input_text.lower()
    else:
         output = ''   
    return output

to_lower([['GFB'], ['RBTB', 'ttt']])

[['gfb'], ['rbtb', 'ttt']]

In [24]:
# as a function

def entity_extractor(text_low, ent_type='LOC'):
    """
    Input:
    ------
    text_low : pandas series containing strings to process, or list of lists containing strings. 
    !!! Words must be lower case already
    ent_type : type of entity to extract, LOC by default
    
    Output:
    ------
    ent_list : list of list of extracted entities, same length as the input text_series
    """
    
    nlp = spacy.load("fr") # reload
    # Create pipe containing all titles
    bodies=list(nlp.pipe(text_low, disable=["tagger", "parser"]) ) 
    
    ent_list = []
    for doc in bodies: 
        ent_list.append([ent.text for ent in doc.ents if ent.label_ == ent_type])

    return ent_list

# test
LOC_list = entity_extractor(pd.Series(["le groenland est un pays charmant"]))
print(LOC_list[:10])

[['groenland']]


### 1.4 Comparison of Geo Dicts

Idea : 
- ensemble comparison using Jaccard similarity
- weighted sum, according to the concept's position in the geographical hierarchy

In practice:
- Dice similarity
- Jaccard similarity
- coeff de recouvrement --> not suitable because gives top similarity to a singleton containing any element from the other set!

Operations on sets : | for union, & for intersection, – for difference, ^ for symmetric difference.

In [25]:
def jaccard_sim(set1, set2):
    return len(set1&set2)/len(set1|set2)

def dice_sim(set1, set2):
    return 2*len(set1&set2)/(len(set1)+len(set2))

def recov_coeff(set1, set2):
    return len(set1&set2)/min([len(set1), len(set2)])

# test

set1 = {'Bade-Wurtemberg','Île-de-France', 'Bretagne', 'Guerrero',}

set2 = {'Bade-Wurtemberg','Île-de-France', 'Bavière', 'Berlin', 'Aquitaine', }

set3 = {'Bade-Wurtemberg','Île-de-France', 'Bavière'}

print(jaccard_sim(set1, set2))
print(jaccard_sim(set2, set3))
print(jaccard_sim(set1, set3))

print(dice_sim(set1, set2))
print(dice_sim(set2, set3))
print(dice_sim(set1, set3))

print(recov_coeff(set1, set2))
print(recov_coeff(set2, set3))
print(recov_coeff(set1, set3))

0.2857142857142857
0.6
0.4
0.4444444444444444
0.75
0.5714285714285714
0.5
1.0
0.6666666666666666


In [26]:
def geo_sim(dico1, dico2, weights = {'cont':1, 'country':2, 'state':3, 'city':6, 'misc':1}, similarity=dice_sim):
    """
    Input:
    ------
    dico1 : dict with keys 'cont', 'country','country_code', 'state', 'city', 'misc' 
    dico2 : idem
    similarity : str, 'jaccard or 'dice'
    
    Output:
    ------
    sim : float, similarity score for geographic locations contained in the dictionnaries
    """
            
    dico_sim = dict()
    for key in ['cont', 'country', 'state', 'city', 'misc']:
        dico_sim[key]=similarity(dico1[key], dico2[key])
    
    w_sum = sum(list(weights.values()))
    weights = {key : weights[key]/w_sum for key in weights.keys()}
    
    sim=sum([dico_sim[key]*weights[key] for key in dico_sim.keys()])
    
    return sim
            



In [27]:
# test 

dico1 = {'cont': {'Amérique du Nord', 'Europe'},
 'country': {'Allemagne', 'France', 'Mexique'},
 'country_code': {'DE', 'FR', 'MX'},
 'state': {'Bade-Wurtemberg',
  'Bavière',
  'Berlin',
  'Bretagne',
  'Guerrero',
  'Île-de-France'},
 'city': {'Acapulco',
  'Berlin',
  'Munich',
  'Paris',
  'Quimper',
  'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': {"Palais de l'Élysée", 'Reichstag'}}

dico2 = {'cont': {'Amérique du Nord', 'Europe', 'Afrique'},
 'country': {'Allemagne', 'France', 'Mexique'},
 'state': {'Bade-Wurtemberg',
  'Bavière',
  'Guerrero',
  'Île-de-France'},
 'city': {'Acapulco',
  'Berlin',
  'Munich',
  'Paris',
  'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': set()}

dico3 = {'cont': { 'Europe', 'Afrique'},
 'country': {'France', 'Union Soviétique'},
 'state': {
  'Île-de-France'},
 'city': {'Stuttgart',
  'Verrières-le-Buisson'},
 'misc': {'station MIR', 'espace'}}

print(geo_sim(dico1, dico1))
print(geo_sim(dico1, dico2))
print(geo_sim(dico1, dico3))

1.0
0.82603550295858
0.3710622710622711


## 2. Apply geoparsing to the news articles

using the utility functions defined above

In [28]:
LOC_stopwords = ['etat', 'état', 'pays', 'continent', 'endroit', 'lieu', 'de france', 'état-', 'major', 'état-major']

def remove_mistakes(input_list, stopwords=LOC_stopwords):
    return [w for w in input_list if w not in stopwords]

# test 

remove_mistakes(['etat', 'usa', 'barbades'])

['usa', 'barbades']

In [29]:
# test 1

dic_list=[]
for ent_list in LOC_list[:10]:
    ent_list = remove_mistakes(ent_list)
    geo_dic = geo_info(ent_list)
    dic_list.append(geo_dic)
    


 groenland
77.6192349 -42.8125967
{'country': 'Groenland', 'country_code': 'gl'}

---- List of Locations ---
[Location(Groenland, (77.6192349, -42.8125967, 0.0))]

---- Dictionnary ---
{'cont': {'amérique du nord'}, 'country': {'groenland'}, 'country_code': {'GL'}, 'state': set(), 'city': set(), 'misc': set()}


In [30]:
# test 2 - waterways

dic_list=[]
for ent_list in [['la seine', 'garonne', 'rhône']]:
    geo_dic = geo_info(ent_list)
    dic_list.append(geo_dic)



 rhône
45.8802348 4.564533629559522
{'county': 'Rhône', 'state_district': 'Circonscription départementale du Rhône', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'country_code': 'fr'}

 garonne
44.2161598 0.5195403
{'waterway': 'La Garonne', 'country': 'France', 'postcode': '33190', 'country_code': 'fr'}

 la seine
48.7308806 2.444754
{'waterway': 'La Seine', 'country': 'France', 'postcode': '78230', 'country_code': 'fr'}

---- List of Locations ---
[Location(Rhône, Circonscription départementale du Rhône, Auvergne-Rhône-Alpes, France métropolitaine, France, (45.8802348, 4.564533629559522, 0.0)), Location(La Garonne, France métropolitaine, 33190, France, (44.2161598, 0.5195403, 0.0)), Location(La Seine, France métropolitaine, 78230, France, (48.7308806, 2.444754, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'france'}, 'country_code': {'FR'}, 'state': {'auvergne-rhône-alpes'}, 'city': set(), 'misc': {'la garonne', 'la seine'}}


## 3. Filter and compare on the basis of geoparsing

Idea : 
* input article, by the user --[geoparsing]--> geo_dico --> dataframe continent/country
* filter on continents and/or countries
* compute geo_dico similarity between the input article and all remaining articles
* keep k1 best
* compute text similarity using word embeddings
* retrieve k2 best for reco

+ test the opposite way : first, text similarity, second, geo_dico similarity

In [46]:
#to do : implement geoparsing_str(input_text) qui renvoie geo_dico
# éventuellement, geoparsing_series plus adapté pour passer à l'échelle 

def geoparsing_str(input_text):
    """
    input_text : string - must be a clean string! 
    """
    # preprocessing
    input_text = to_lower(input_text)
    split_text = input_text.split()
    # first run with default voc : requires a finer preprocessing --> preprocesssed text as input
    countries = country_only(split_text)
    cont = continent_only(split_text)
    # extract entities
    LOC_list = entity_extractor(pd.Series([input_text]))
    print(LOC_list)
    # remove false positives
    LOC_list = remove_mistakes(LOC_list[0])
    print(LOC_list)
    # get hierarchical geo info
    geo_dico = geo_info(LOC_list)
    geo_dico["cont"]=geo_dico["cont"]|cont
    geo_dico["country"]=geo_dico["country"]|countries
    return geo_dico

def geoparsing_series(input_series):
    """
    input_series : pandas series containing strings
    """
    # preprocessing
    input_text = to_lower(input_text)
    # extract entities
    LOC_list = entity_extractor(input_series)
    # geoparsing
    dic_list=[]
    for ent_list in LOC_list[:10]:
        ent_list = remove_mistakes(ent_list) # remove false positives
        geo_dico = geo_info(ent_list) # get hierarchical geo info
        dic_list.append(geo_dico)
    return dic_list
    

In [47]:
# spacy is not performing cery well here : doesn't catch nouvelle-zélande !
test = "La Nouvelle-Zélande est gouvernée par une femme. Mon lapin blanc a visité la ville de Grenoble au coeur des montagnes. J'ai hâte de prendre l'avion pour l'Australie. Jusqu'alors je n'avais visité que l'Europe. Qui sait quelles surprises me réservent ces contrées lointaines ?"
geoparsing_str(test)

[['grenoble', 'australie', 'europe']]
['grenoble', 'australie', 'europe']

 grenoble
45.1875602 5.7357819
{'city': 'Grenoble', 'municipality': 'Grenoble', 'county': 'Isère', 'state': 'Auvergne-Rhône-Alpes', 'country': 'France', 'country_code': 'fr'}

---- List of Locations ---
[Location(Grenoble, Isère, Auvergne-Rhône-Alpes, France métropolitaine, France, (45.1875602, 5.7357819, 0.0))]

---- Dictionnary ---
{'cont': {'europe'}, 'country': {'france'}, 'country_code': {'FR'}, 'state': {'auvergne-rhône-alpes'}, 'city': {'grenoble'}, 'misc': set()}


{'cont': {'europe', 'océanie'},
 'country': {'australie', 'france', 'nouvelle-zélande'},
 'country_code': {'FR'},
 'state': {'auvergne-rhône-alpes'},
 'city': {'grenoble'},
 'misc': set()}