# Maps

In [1]:
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv('autores7.csv', header=0)
autores_df = autores_df.fillna('')

CPU times: user 250 ms, sys: 4.06 ms, total: 254 ms
Wall time: 252 ms


In [3]:
autores_df.shape

(76722, 9)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,FECHA_PUB,CAPITAL_AUTOR,CAPITAL_EJEMPLAR,GEOLOC_CAPITAL_AUTOR,GEOLOC_CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,Spain,Martina Badstuber,Germany,2010,Berlin,Madrid,"(52.52000659999999, 13.404954)","(40.4167754, -3.7037902)"
1,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico,2009,Mexico City,Madrid,"(19.4326077, -99.133208)","(40.4167754, -3.7037902)"
2,Humo,Spain,Antón Fortes Torres,Spain,2008,Madrid,Madrid,"(40.4167754, -3.7037902)","(40.4167754, -3.7037902)"
3,Galleta para perros,Spain,Helen Cooper,United Kingdom,2008,London,Madrid,"(51.5073509, -0.1277583)","(40.4167754, -3.7037902)"
4,Autobio,Spain,Cyril Pedrosa,France,2009,Paris,Madrid,"(48.856614, 2.3522219)","(40.4167754, -3.7037902)"


# Places and coordinates

## Places and coordinates dictionary

In [26]:
%%time

place_coord_d = {}

for i, x in autores_df.iterrows():
    place_coord_d[x['PAIS_AUTOR']] = x['GEOLOC_CAPITAL_AUTOR']
    place_coord_d[x['PAIS_EJEMPLAR']]= x['GEOLOC_CAPITAL_EJEMPLAR']

CPU times: user 12 s, sys: 3.7 ms, total: 12 s
Wall time: 12 s


## Coordinates and places dictionary

In [27]:
coord_place_d = defaultdict(list)

for k in place_coord_d:
    coord_place_d[place_coord_d[k]].append(k)

coord_place_d = dict(coord_place_d)

## Places dictionary

In [5]:
paises_autor_c = autores_df['PAIS_AUTOR'].value_counts()
paises_ejemplar_c = autores_df['PAIS_EJEMPLAR'].value_counts()
places_c = paises_autor_c.add(paises_ejemplar_c, fill_value=0).astype(int)

In [45]:
def sort_by_frequency(places):
    places_freqs = [(p, places_c[p]) for p in places]
    sorted_freqs = sorted(places_freqs, key=lambda x: x[1], reverse=True)
    return sorted_freqs

In [48]:
sort_by_frequency(['Iran', 'Qajar dynasty', 'Pahlavi dynasty'])

[('Iran', 64), ('Qajar dynasty', 43), ('Pahlavi dynasty', 1)]

In [31]:
coord_place_d

{'(-0.1806532, -78.4678382)': ['Ecuador'],
 '(-1.2920659, 36.8219462)': ['East Africa Protectorate', 'Kenya'],
 '(-1.9705786, 30.1044288)': ['Rwanda'],
 '(-13.9626121, 33.7741195)': ['Malawi'],
 '(-15.3875259, 28.3228165)': ['Zambia'],
 '(-15.7217509, -48.0084478)': ['Brazil'],
 '(-17.8251657, 31.03351)': ['Rhodesia', 'Zimbabwe'],
 '(-18.1248086, 178.4500789)': ['Fiji'],
 '(-18.8791902, 47.5079055)': ['Madagascar'],
 '(-19.0195852, -65.2619615)': ['Bolivia'],
 '(-20.1608912, 57.5012222)': ['Mauritius'],
 '(-22.2558234, 166.4505243)': ['New Caledonia'],
 '(-22.5608807, 17.0657549)': ['Namibia'],
 '(-25.2637399, -57.57592599999999)': ['Paraguay'],
 '(-25.7478676, 28.2292712)': ['South Africa'],
 '(-25.891968, 32.6051351)': ['Mozambique'],
 '(-26.3054482, 31.1366715)': ['Swaziland'],
 '(-33.4488897, -70.6692655)': ['Chile'],
 '(-34.6036844, -58.3815591)': ['Argentina'],
 '(-34.9011127, -56.16453139999999)': ['Uruguay'],
 '(-35.2809368, 149.1300092)': ['Australia'],
 '(-4.2633597, 15.24288

In [7]:
%%time

coord_places_d = defaultdict(set)

for i, x in autores_df.iterrows():
    pais_autor = x['PAIS_AUTOR']
    pais_ejemplar = x['PAIS_EJEMPLAR']
    geoloc_autor = x['GEOLOC_CAPITAL_AUTOR']
    geoloc_ejemplar = x['GEOLOC_CAPITAL_EJEMPLAR']
    if geoloc_autor == geoloc_ejemplar:
        coord_places_d[geoloc_autor].add(pais_autor)
        coord_places_d[geoloc_autor].add(pais_ejemplar)
    else:
        coord_places_d[geoloc_autor].add(pais_autor)
        coord_places_d[geoloc_ejemplar].add(pais_ejemplar)

coord_places_d = dict(coord_places_d)
for k in coord_places_d:
    coord_places_d[k] = most_frequent(coord_places_d[k])

CPU times: user 12.4 s, sys: 2.82 ms, total: 12.4 s
Wall time: 12.4 s


## Rutas

In [8]:
autores_df['PAIS_AUTOR'].nunique()

236

In [9]:
autores_df['PAIS_AUTOR'].value_counts().head()

United States     12174
Spain              9159
France             7477
United Kingdom     6675
Germany            4664
Name: PAIS_AUTOR, dtype: int64

In [10]:
autores_df['PAIS_EJEMPLAR'].nunique()

121

In [11]:
autores_df['PAIS_EJEMPLAR'].value_counts().head()

Spain            37466
Colombia          9891
Argentina         7289
Mexico            6787
United States     3914
Name: PAIS_EJEMPLAR, dtype: int64

In [12]:
%%time

all_routes = []
for i, x in autores_df.iterrows():
    geoloc_autor = x['GEOLOC_CAPITAL_AUTOR']
    geoloc_ejemplar = x['GEOLOC_CAPITAL_EJEMPLAR']
    all_routes.append((coord_places_d[geoloc_autor], coord_places_d[geoloc_ejemplar]))

CPU times: user 9.37 s, sys: 7.84 ms, total: 9.38 s
Wall time: 9.37 s


In [13]:
cycles = [r for r in all_routes if r[0] == r[1]]
routes = [r for r in all_routes if r[0] != r[1]]

In [14]:
cycles_c = Counter(cycles)
routes_c = Counter(routes)

In [15]:
top_cycles = cycles_c.most_common()
top_routes = routes_c.most_common()

In [16]:
top_cycles

[(('Spain', 'Spain'), 7333),
 (('Colombia', 'Colombia'), 4015),
 (('Argentina', 'Argentina'), 1703),
 (('United States', 'United States'), 1662),
 (('Mexico', 'Mexico'), 1554),
 (('France', 'France'), 1083),
 (('United Kingdom', 'United Kingdom'), 790),
 (('Venezuela', 'Venezuela'), 462),
 (('Chile', 'Chile'), 322),
 (('Brazil', 'Brazil'), 279),
 (('Peru', 'Peru'), 274),
 (('Cuba', 'Cuba'), 226),
 (('Italy', 'Italy'), 207),
 (('Germany', 'Germany'), 145),
 (('Ecuador', 'Ecuador'), 143),
 (('Uruguay', 'Uruguay'), 81),
 (('Portugal', 'Portugal'), 67),
 (('Russia', 'Russia'), 49),
 (('Japan', 'Japan'), 38),
 (('El Salvador', 'El Salvador'), 21),
 (('Bolivia', 'Bolivia'), 20),
 (('Guatemala', 'Guatemala'), 20),
 (('Costa Rica', 'Costa Rica'), 19),
 (('Switzerland', 'Switzerland'), 17),
 (('Dominican Republic', 'Dominican Republic'), 15),
 (('Norway', 'Norway'), 13),
 (('Nicaragua', 'Nicaragua'), 10),
 (('Panama', 'Panama'), 10),
 (('Puerto Rico', 'Puerto Rico'), 8),
 (('Paraguay', 'Paragua

In [17]:
top_routes

[(('United States', 'Spain'), 6200),
 (('United Kingdom', 'Spain'), 3996),
 (('France', 'Spain'), 3869),
 (('Germany', 'Spain'), 2661),
 (('Italy', 'Spain'), 2256),
 (('France', 'Argentina'), 1009),
 (('United States', 'Colombia'), 924),
 (('United States', 'Argentina'), 857),
 (('United States', 'Mexico'), 844),
 (('Argentina', 'Spain'), 783),
 (('Spain', 'Colombia'), 645),
 (('France', 'Mexico'), 590),
 (('Belgium', 'Spain'), 534),
 (('United Kingdom', 'United States'), 521),
 (('Ireland', 'Spain'), 515),
 (('Switzerland', 'Spain'), 515),
 (('United States', 'United Kingdom'), 504),
 (('United States', 'New York (State)'), 494),
 (('Austria', 'Spain'), 477),
 (('Germany', 'Argentina'), 463),
 (('Canada', 'Spain'), 451),
 (('United Kingdom', 'Mexico'), 444),
 (('Spain', 'Mexico'), 442),
 (('United Kingdom', 'Argentina'), 440),
 (('Italy', 'Argentina'), 421),
 (('Poland', 'Spain'), 403),
 (('Mexico', 'Spain'), 398),
 (('Spain', 'Argentina'), 393),
 (('Germany', 'Mexico'), 385),
 (('Uni

# Maps

In [None]:
def get_coords(place):
    

In [21]:
cycles_graph = nx.DiGraph()
routes_graph = nx.DiGraph()

In [22]:
for k in cycles_c:
    if k[0] not in cycles_graph:
        cycles_graph.add_node(k[0], latitude=)
    cycles_graph.add_edge(k[0], k[1], weight=cycles_c[k])

In [23]:
for k in routes_c:
    routes_graph.add_edge(k[0], k[1], weight=routes_c[k])

# Saving results

In [24]:
nx.write_gexf(cycles_graph, 'cycles_graph.gexf')
nx.write_gexf(routes_graph, 'routes_graph.gexf')