# Geolocation

In [1]:
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.exceptions import ApiClientException
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv('autores7.csv', header=0)
autores_df = autores_df.fillna('')

CPU times: user 180 ms, sys: 4.41 ms, total: 185 ms
Wall time: 184 ms


In [3]:
autores_df.shape

(61864, 8)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,FECHA_PUB,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,NOMBRE_LOCALIZADO,CAPITAL_AUTOR,CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,2010,Spain,Martina Badstuber,Germany,Martina Badstuber,Berlin,Madrid
1,Galleta para perros,2008,Spain,Helen Cooper,United Kingdom,Helen Cooper,London,Madrid
2,Codrilo,2009,Spain,Roberto Aliaga Sánchez,Spain,Roberto Aliaga,Madrid,Madrid
3,"Félix, : el coleccionista de miedos",2009,Spain,Fina Casalderrey,Spain,Fina Casalderrey,Madrid,Madrid
4,En la laguna más profunda,2011,Colombia,Oscar Collazos,Colombia,Oscar Collazos,Bogotá,Bogotá


## Limpiando datos

In [5]:
autores_df = autores_df[
    (autores_df['CAPITAL_AUTOR'] != 'DESCONOCIDO') &
    (autores_df['CAPITAL_EJEMPLAR'] != 'DESCONOCIDO')
]

In [6]:
len(autores_df)

61857

In [7]:
autores_df.head()

Unnamed: 0,TITULO,FECHA_PUB,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,NOMBRE_LOCALIZADO,CAPITAL_AUTOR,CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,2010,Spain,Martina Badstuber,Germany,Martina Badstuber,Berlin,Madrid
1,Galleta para perros,2008,Spain,Helen Cooper,United Kingdom,Helen Cooper,London,Madrid
2,Codrilo,2009,Spain,Roberto Aliaga Sánchez,Spain,Roberto Aliaga,Madrid,Madrid
3,"Félix, : el coleccionista de miedos",2009,Spain,Fina Casalderrey,Spain,Fina Casalderrey,Madrid,Madrid
4,En la laguna más profunda,2011,Colombia,Oscar Collazos,Colombia,Oscar Collazos,Bogotá,Bogotá


# Geocoordinates

In [8]:
autores_df['CAPITAL_EJEMPLAR'].nunique()

108

In [9]:
autores_df['CAPITAL_AUTOR'].nunique()

147

In [10]:
%%time

places = set()
for i, x in autores_df.iterrows():
    pais_ejemplar = x['PAIS_EJEMPLAR']
    capital_ejemplar = x['CAPITAL_EJEMPLAR']
    places.add((capital_ejemplar, pais_ejemplar))
    pais_autor = x['PAIS_AUTOR']
    capital_autor = x['CAPITAL_AUTOR']
    places.add((capital_autor, pais_autor))

CPU times: user 9.86 s, sys: 4.38 ms, total: 9.86 s
Wall time: 9.86 s


In [11]:
len(places)

193

In [12]:
capital_countries_d = defaultdict(set)
for a, b in places:
    capital_countries_d[a].add(b)

In [13]:
for k in capital_countries_d:
    if len(capital_countries_d[k]) > 1:
        print(k, '-', capital_countries_d[k])

London - {'United Kingdom', 'England'}
Prague - {'Czechoslovakia', 'Czech Republic'}
Luanda - {'Portuguese Angola', 'Angola'}
Washington, D.C. - {'United States', 'District of Columbia'}
Berlin - {'Weimar Republic', 'Germany'}
Naypyidaw - {'Myanmar', 'Burma'}


##### ==> No duplicated capital's names for two different countries

In [14]:
google_maps = GoogleMaps(api_key=KEY)

In [15]:
%%time

locs = {} # (lat, lng)

for capital, country in places:
    if capital not in locs:
        found = False
        try:
            locations = google_maps.search(location=capital + ', ' + country)
            if len(locations.list_data) == 1:
                location = locations.list_data[0]
                locs[capital] = (location.lat, location.lng)
                found = True
            else:
                for location in locations.list_data:
                    if location.country is None or str(location.country, 'utf-8') in capital_countries_d[capital]:
                        locs[capital] = (location.lat, location.lng)
                        found = True
                        break
        except ApiClientException as e:
            pass
        if not found:
            try:
                locations = google_maps.search(location=country + ', ' + capital)
                if len(locations.list_data) == 1:
                    location = locations.list_data[0]
                    locs[capital] = (location.lat, location.lng)
                    found = True
                else:
                    for location in locations.list_data:
                        if location.country is None or str(location.country, 'utf-8') in capital_countries_d[capital]:
                            locs[capital] = (location.lat, location.lng)
                            found = True
                            break
            except ApiClientException as e:
                pass
        if not found:
            try:
                locations = google_maps.search(location=capital)
                if len(locations.list_data) == 1:
                    location = locations.list_data[0]
                    locs[capital] = (location.lat, location.lng)
                    found = True
                else:
                    for location in locations.list_data:
                        if str(location.country, 'utf-8') in capital_countries_d[capital]:
                            locs[capital] = (location.lat, location.lng)
                            found = True
                            break
            except ApiClientException as e:
                pass
        if not found:
            print('NOT FOUND:', capital, '-', country)

NOT FOUND: Monaco - Monaco
NOT FOUND: Brasília - Brazil
NOT FOUND: Brussels - Belgium
NOT FOUND: Tokyo - Japan
NOT FOUND: Hong Kong - Hong Kong
NOT FOUND: Bonaire - Caribbean Netherlands
NOT FOUND: Singapore - Singapore
NOT FOUND: Jakarta - Indonesia
CPU times: user 4.06 s, sys: 186 ms, total: 4.25 s
Wall time: 1min 11s


In [16]:
locs['Bonaire'] = (12.183333, -68.25)
locs['Brasília'] = (-15.7942287, -47.8821658)
locs['Brussels'] = (50.85, 4.35)
locs['Hong Kong'] = (22.3, 114.2)
locs['Jakarta'] = (-8.792258, 99.0179762)
locs['Monaco'] = (43.7400718, 7.426643599999999)
locs['Singapore'] = (1.3147268, 103.7065911)
locs['Tokyo'] = (35.6732615, 139.5699618)

In [17]:
%%time

def process_geolocation_capital_autor(x):
    c = x['CAPITAL_AUTOR']
    return locs.get(c, (0, 0))

autores_df['GEOLOC_CAPITAL_AUTOR'] = autores_df.apply(process_geolocation_capital_autor, axis=1)

CPU times: user 1.91 s, sys: 7.69 ms, total: 1.92 s
Wall time: 1.91 s


In [18]:
%%time

def process_geolocation_capital_ejemplar(x):
    c = x['CAPITAL_EJEMPLAR']
    return locs.get(c, (0, 0))

autores_df['GEOLOC_CAPITAL_EJEMPLAR'] = autores_df.apply(process_geolocation_capital_ejemplar, axis=1)

CPU times: user 1.31 s, sys: 0 ns, total: 1.31 s
Wall time: 1.31 s


# Saving results

In [19]:
autores_df.to_csv('autores8.csv', index=False)