# Geolocation

In [1]:
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.exceptions import ApiClientException
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv('autores6.csv', header=0)
autores_df = autores_df.fillna('')

CPU times: user 208 ms, sys: 12 ms, total: 220 ms
Wall time: 219 ms


In [3]:
autores_df.shape

(76722, 7)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,FECHA_PUB,CAPITAL_AUTOR,CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,Spain,Martina Badstuber,Germany,2010,Berlin,Madrid
1,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico,2009,Mexico City,Madrid
2,Humo,Spain,Antón Fortes Torres,Spain,2008,Madrid,Madrid
3,Galleta para perros,Spain,Helen Cooper,United Kingdom,2008,London,Madrid
4,Autobio,Spain,Cyril Pedrosa,France,2009,Paris,Madrid


## Limpiando datos

In [5]:
autores_df = autores_df[
    (autores_df['CAPITAL_AUTOR'] != 'DESCONOCIDO') &
    (autores_df['CAPITAL_EJEMPLAR'] != 'DESCONOCIDO')
]

In [6]:
autores_df.shape

(76698, 7)

In [7]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,FECHA_PUB,CAPITAL_AUTOR,CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,Spain,Martina Badstuber,Germany,2010,Berlin,Madrid
1,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico,2009,Mexico City,Madrid
2,Humo,Spain,Antón Fortes Torres,Spain,2008,Madrid,Madrid
3,Galleta para perros,Spain,Helen Cooper,United Kingdom,2008,London,Madrid
4,Autobio,Spain,Cyril Pedrosa,France,2009,Paris,Madrid


# Geocoordinates

In [8]:
autores_df['CAPITAL_EJEMPLAR'].nunique()

116

In [9]:
autores_df['CAPITAL_AUTOR'].nunique()

203

In [10]:
%%time

places = set()
for i, x in autores_df.iterrows():
    pais_ejemplar = x['PAIS_EJEMPLAR']
    capital_ejemplar = x['CAPITAL_EJEMPLAR']
    places.add((capital_ejemplar, pais_ejemplar))
    pais_autor = x['PAIS_AUTOR']
    capital_autor = x['CAPITAL_AUTOR']
    places.add((capital_autor, pais_autor))

CPU times: user 10.9 s, sys: 4 ms, total: 10.9 s
Wall time: 10.9 s


In [11]:
len(places)

281

In [12]:
capital_countries_d = defaultdict(set)
for a, b in places:
    capital_countries_d[a].add(b)

In [13]:
for k in capital_countries_d:
    if len(capital_countries_d[k]) > 1:
        print(k, '-', capital_countries_d[k])

Washington, D.C. - {'District of Columbia', 'United States'}
Minsk - {'Bielorrusia', 'Belarus'}
Athens - {"Côte d'Ivoire", 'Greek', 'Greece'}
Prague - {'Czech Republic', 'Kingdom of Bohemia', 'Czechoslovakia', 'Protectorate of Bohemia and Moravia'}
Harare - {'Zimbabwe', 'Rhodesia'}
Budapest - {'Lands of the Crown of Saint Stephen', 'Hungary'}
Tehran - {'Pahlavi dynasty', 'Qajar dynasty', 'Iran'}
Ankara - {'Turquía', 'Turkey'}
Accra - {'Ghana', 'Gold Coast (British colony)'}
Belgrade - {'Yugoslavia', 'Serbia'}
Munich - {'Electorate of Bavaria', 'Kingdom of Bavaria'}
Rome - {'Italy', 'Papal States'}
Milan - {'Duchy of Milan', 'Kingdom of Lombardy–Venetia'}
Dresden - {'Electorate of Saxony', 'Kingdom of Saxony'}
Vienna - {'Austria', 'Habsburg Monarchy'}
Berlin - {'Weimar Republic', 'Germany'}
Florence - {'Grand Duchy of Tuscany', 'Republic of Florence'}
Naypyidaw - {'Myanmar', 'Burma'}
Paris - {'France', 'Bourbon Restoration'}
Manila - {'Philippines', 'Filipinas'}
Luanda - {'Portuguese An

##### ==> No duplicated capital's names for two different countries

In [14]:
google_maps = GoogleMaps(api_key=KEY)

In [15]:
%%time

locs = {} # (lat, lng)

for capital, country in places:
    if capital not in locs:
        found = False
        try:
            locations = google_maps.search(location=capital + ', ' + country)
            if len(locations.list_data) == 1:
                location = locations.list_data[0]
                locs[capital] = (location.lat, location.lng)
                found = True
            if not found:
                for location in locations.list_data:
                    if location.country is None or str(location.country, 'utf-8') in capital_countries_d[capital]:
                        locs[capital] = (location.lat, location.lng)
                        found = True
                        break
        except ApiClientException as e:
            pass
        if not found:
            try:
                locations = google_maps.search(location=capital)
                if len(locations.list_data) == 1:
                    location = locations.list_data[0]
                    locs[capital] = (location.lat, location.lng)
                    found = True
                if not found:
                    for location in locations.list_data:
                        if str(location.country, 'utf-8') in capital_countries_d[capital]:
                            locs[capital] = (location.lat, location.lng)
                            found = True
                            break
            except ApiClientException as e:
                pass
        if not found:
            print('NOT FOUND:', capital, '-', country)

NOT FOUND: Vatican City - Vatican City
NOT FOUND: Jakarta - Indonesia
NOT FOUND: Singapore - Singapore
NOT FOUND: Kingdom of England - British America
NOT FOUND: Persépolis - Imperio aqueménida
NOT FOUND: Hong Kong - Hong Kong
NOT FOUND: Tagsatzung - Old Swiss Confederacy
NOT FOUND: Brasília - Brazil
NOT FOUND: Brunswick - Duchy of Brunswick
NOT FOUND: Gibraltar - Gibraltar
NOT FOUND: Monaco - Monaco
NOT FOUND: Batavia, Dutch East Indies - Dutch East Indies
NOT FOUND: Tokyo - Japan
CPU times: user 9.79 s, sys: 440 ms, total: 10.2 s
Wall time: 3min 16s


In [16]:
locs['Tokyo'] = (35.6732615, 139.5699618)
locs['Jakarta'] = (-8.792258, 99.0179762)
locs['Gibraltar'] = (36.1407734, -5.353599399999999)
locs['Singapore'] = (1.3147268, 103.7065911)
locs['Vatican City'] = (41.9021788, 12.4536007)
locs['Batavia, Dutch East Indies'] = (-8.792258, 99.0179762)
locs['Brasília'] = (-15.7942287, -47.8821658)
locs['Monaco'] = (43.7400718, 7.426643599999999)
locs['Persépolis'] = (29.934444, 52.891389)
locs['Kingdom of England'] = (51.5073509, -0.1277583)
locs['Tagsatzung'] = (47.3774336, 8.466504)
locs['Brunswick'] = (52.266667, 10.516667)
locs['Hong Kong'] = (22.3, 114.2)

In [17]:
%%time

def process_geolocation_capital_autor(x):
    c = x['CAPITAL_AUTOR']
    return locs.get(c, (0, 0))

autores_df['GEOLOC_CAPITAL_AUTOR'] = autores_df.apply(process_geolocation_capital_autor, axis=1)

CPU times: user 1.83 s, sys: 4 ms, total: 1.84 s
Wall time: 1.86 s


In [18]:
%%time

def process_geolocation_capital_ejemplar(x):
    c = x['CAPITAL_EJEMPLAR']
    return locs.get(c, (0, 0))

autores_df['GEOLOC_CAPITAL_EJEMPLAR'] = autores_df.apply(process_geolocation_capital_ejemplar, axis=1)

CPU times: user 1.82 s, sys: 4 ms, total: 1.82 s
Wall time: 1.82 s


# Saving results

In [19]:
autores_df.to_csv('autores7.csv', index=False)