# Maps

In [1]:
import ast
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv(
    'autores7.csv',
    header=0,
    converters={
        'GEOLOC_CAPITAL_AUTOR': ast.literal_eval,
        'GEOLOC_CAPITAL_EJEMPLAR': ast.literal_eval
    }
)
autores_df = autores_df.fillna('')

CPU times: user 2.52 s, sys: 36.2 ms, total: 2.56 s
Wall time: 2.56 s


In [3]:
autores_df.shape

(76722, 9)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,FECHA_PUB,CAPITAL_AUTOR,CAPITAL_EJEMPLAR,GEOLOC_CAPITAL_AUTOR,GEOLOC_CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,Spain,Martina Badstuber,Germany,2010,Berlin,Madrid,"(52.52000659999999, 13.404954)","(40.4167754, -3.7037902)"
1,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico,2009,Mexico City,Madrid,"(19.4326077, -99.133208)","(40.4167754, -3.7037902)"
2,Humo,Spain,Antón Fortes Torres,Spain,2008,Madrid,Madrid,"(40.4167754, -3.7037902)","(40.4167754, -3.7037902)"
3,Galleta para perros,Spain,Helen Cooper,United Kingdom,2008,London,Madrid,"(51.5073509, -0.1277583)","(40.4167754, -3.7037902)"
4,Autobio,Spain,Cyril Pedrosa,France,2009,Paris,Madrid,"(48.856614, 2.3522219)","(40.4167754, -3.7037902)"


# Places and coordinates

## Cleaning 1

In [5]:
autores_df = autores_df[
    (autores_df['PAIS_AUTOR'] != 'No place, unknown, or undetermined') &
    (autores_df['PAIS_EJEMPLAR'] != 'No place, unknown, or undetermined')
]

## Places and coordinates dictionary

In [6]:
%%time

place_coord_d = {}

for i, x in autores_df.iterrows():
    place_coord_d[x['PAIS_AUTOR']] = x['GEOLOC_CAPITAL_AUTOR']
    place_coord_d[x['PAIS_EJEMPLAR']]= x['GEOLOC_CAPITAL_EJEMPLAR']

CPU times: user 13.1 s, sys: 11.2 ms, total: 13.1 s
Wall time: 13.1 s


## Coordinates and places dictionary 1

In [7]:
coord_places_d = defaultdict(list)

for k in place_coord_d:
    coord_places_d[place_coord_d[k]].append(k)

coord_places_d = dict(coord_places_d)

## Places dictionary

In [8]:
paises_autor_c = autores_df['PAIS_AUTOR'].value_counts()
paises_ejemplar_c = autores_df['PAIS_EJEMPLAR'].value_counts()
places_c = paises_autor_c.add(paises_ejemplar_c, fill_value=0).astype(int)

In [9]:
def sort_by_frequency(places):
    places_freqs = [(p, places_c[p]) for p in places]
    sorted_freqs = sorted(places_freqs, key=lambda x: x[1], reverse=True)
    sorted_places = [p for p, f in sorted_freqs]
    return sorted_places

In [10]:
place_place_d = {}

for k in coord_places_d:
    sorted_places = sort_by_frequency(coord_places_d[k])
    for p in sorted_places:
        place_place_d[p] = sorted_places[0]

## Coordinates and places dictionary 2

In [11]:
coord_place_d = {}

for k in coord_places_d:
    coord_place_d[k] = place_place_d[coord_places_d[k][0]]

## Cleaning 2

In [12]:
%%time

def process_pais_autor(x):
    return place_place_d[x['PAIS_AUTOR']]

autores_df['PAIS_AUTOR'] = autores_df.apply(process_pais_autor, axis=1)

CPU times: user 2.18 s, sys: 3.75 ms, total: 2.19 s
Wall time: 2.19 s


In [13]:
%%time

def process_pais_ejemplar(x):
    return place_place_d[x['PAIS_EJEMPLAR']]

autores_df['PAIS_EJEMPLAR'] = autores_df.apply(process_pais_ejemplar, axis=1)

CPU times: user 2.23 s, sys: 4.03 ms, total: 2.24 s
Wall time: 2.23 s


# Rutas

In [14]:
autores_df['PAIS_AUTOR'].nunique()

199

In [15]:
autores_df['PAIS_AUTOR'].value_counts().head()

United States     12171
Spain              9157
France             7578
United Kingdom     6940
Germany            4840
Name: PAIS_AUTOR, dtype: int64

In [16]:
autores_df['PAIS_EJEMPLAR'].nunique()

117

In [17]:
autores_df['PAIS_EJEMPLAR'].value_counts().head()

Spain            37466
Colombia          9891
Argentina         7289
Mexico            6787
United States     3935
Name: PAIS_EJEMPLAR, dtype: int64

In [18]:
%%time

all_routes = []
for i, x in autores_df.iterrows():
    geoloc_autor = x['GEOLOC_CAPITAL_AUTOR']
    geoloc_ejemplar = x['GEOLOC_CAPITAL_EJEMPLAR']
    all_routes.append((coord_place_d[geoloc_autor], coord_place_d[geoloc_ejemplar]))

CPU times: user 10.5 s, sys: 15.8 ms, total: 10.5 s
Wall time: 10.5 s


In [19]:
cycles = [r for r in all_routes if r[0] == r[1]]
routes = [r for r in all_routes if r[0] != r[1]]

In [20]:
cycles_c = Counter(cycles)
routes_c = Counter(routes)

In [21]:
top_cycles = cycles_c.most_common()
top_routes = routes_c.most_common()

In [22]:
top_cycles

[(('Spain', 'Spain'), 7333),
 (('Colombia', 'Colombia'), 4015),
 (('Argentina', 'Argentina'), 1703),
 (('United States', 'United States'), 1662),
 (('Mexico', 'Mexico'), 1554),
 (('France', 'France'), 1083),
 (('United Kingdom', 'United Kingdom'), 790),
 (('Venezuela', 'Venezuela'), 462),
 (('Chile', 'Chile'), 322),
 (('Brazil', 'Brazil'), 279),
 (('Peru', 'Peru'), 274),
 (('Cuba', 'Cuba'), 226),
 (('Italy', 'Italy'), 207),
 (('Germany', 'Germany'), 145),
 (('Ecuador', 'Ecuador'), 143),
 (('Uruguay', 'Uruguay'), 81),
 (('Portugal', 'Portugal'), 67),
 (('Russia', 'Russia'), 49),
 (('Japan', 'Japan'), 38),
 (('El Salvador', 'El Salvador'), 21),
 (('Bolivia', 'Bolivia'), 20),
 (('Guatemala', 'Guatemala'), 20),
 (('Costa Rica', 'Costa Rica'), 19),
 (('Switzerland', 'Switzerland'), 17),
 (('Dominican Republic', 'Dominican Republic'), 15),
 (('Norway', 'Norway'), 13),
 (('Nicaragua', 'Nicaragua'), 10),
 (('Panama', 'Panama'), 10),
 (('Puerto Rico', 'Puerto Rico'), 8),
 (('Paraguay', 'Paragua

In [23]:
top_routes

[(('United States', 'Spain'), 6200),
 (('United Kingdom', 'Spain'), 3996),
 (('France', 'Spain'), 3869),
 (('Germany', 'Spain'), 2661),
 (('Italy', 'Spain'), 2256),
 (('France', 'Argentina'), 1009),
 (('United States', 'Colombia'), 924),
 (('United States', 'Argentina'), 857),
 (('United States', 'Mexico'), 844),
 (('Argentina', 'Spain'), 783),
 (('Spain', 'Colombia'), 645),
 (('France', 'Mexico'), 590),
 (('Belgium', 'Spain'), 534),
 (('United Kingdom', 'United States'), 521),
 (('Switzerland', 'Spain'), 515),
 (('Ireland', 'Spain'), 515),
 (('United States', 'United Kingdom'), 504),
 (('United States', 'New York (State)'), 494),
 (('Austria', 'Spain'), 477),
 (('Germany', 'Argentina'), 463),
 (('Canada', 'Spain'), 451),
 (('United Kingdom', 'Mexico'), 444),
 (('Spain', 'Mexico'), 442),
 (('United Kingdom', 'Argentina'), 440),
 (('Italy', 'Argentina'), 421),
 (('Poland', 'Spain'), 403),
 (('Mexico', 'Spain'), 398),
 (('Spain', 'Argentina'), 393),
 (('Germany', 'Mexico'), 385),
 (('Uni

# Maps

In [24]:
def get_coords(place):
    place = place_place_d[place]
    coord = place_coord_d[place]
    return coord

In [25]:
cycles_graph = nx.DiGraph()
routes_graph = nx.DiGraph()

In [26]:
for k in cycles_c:
    if k[0] not in cycles_graph:
        lat, lng = get_coords(k[0])
        cycles_graph.add_node(k[0], latitude=lat, longitude=lng, weight=cycles_c[k])
    if k[1] not in cycles_graph:
        lat, lng = get_coords(k[1])
        cycles_graph.add_node(k[1], latitude=lat, longitude=lng, weight=cycles_c[k])
    cycles_graph.add_edge(k[0], k[1], weight=cycles_c[k])

In [27]:
for k in routes_c:
    if k[0] not in routes_graph:
        lat, lng = get_coords(k[0])
        routes_graph.add_node(k[0], latitude=lat, longitude=lng, weight=routes_c[k])
    if k[1] not in routes_graph:
        lat, lng = get_coords(k[1])
        routes_graph.add_node(k[1], latitude=lat, longitude=lng, weight=routes_c[k])
    routes_graph.add_edge(k[0], k[1], weight=routes_c[k])

# Saving results

In [28]:
nx.write_gexf(cycles_graph, 'cycles_graph.gexf')
nx.write_gexf(routes_graph, 'routes_graph.gexf')