# Maps

In [1]:
import ast
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv(
    'autores7.csv',
    header=0,
    converters={
        'GEOLOC_CAPITAL_AUTOR': ast.literal_eval,
        'GEOLOC_CAPITAL_EJEMPLAR': ast.literal_eval
    }
)
autores_df = autores_df.fillna('')

CPU times: user 4.49 s, sys: 25.3 ms, total: 4.51 s
Wall time: 4.51 s


In [3]:
autores_df.shape

(87200, 9)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,FECHA_PUB,CAPITAL_AUTOR,CAPITAL_EJEMPLAR,GEOLOC_CAPITAL_AUTOR,GEOLOC_CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,Spain,Martina Badstuber,Germany,2010,Berlin,Madrid,"(52.52000659999999, 13.404954)","(40.4167754, -3.7037902)"
1,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico,2009,Mexico City,Madrid,"(19.4326077, -99.133208)","(40.4167754, -3.7037902)"
2,Humo,Spain,Antón Fortes Torres,Spain,2008,Madrid,Madrid,"(40.4167754, -3.7037902)","(40.4167754, -3.7037902)"
3,Galleta para perros,Spain,Helen Cooper,United Kingdom,2008,London,Madrid,"(51.5073509, -0.1277583)","(40.4167754, -3.7037902)"
4,Autobio,Spain,Cyril Pedrosa,France,2009,Paris,Madrid,"(48.856614, 2.3522219)","(40.4167754, -3.7037902)"


# Places and coordinates

## Cleaning 1

In [5]:
autores_df = autores_df[
    (autores_df['PAIS_AUTOR'] != 'No place, unknown, or undetermined') &
    (autores_df['PAIS_EJEMPLAR'] != 'No place, unknown, or undetermined')
]

## Places and coordinates dictionary

In [6]:
%%time

place_coord_d = {}

for i, x in autores_df.iterrows():
    place_coord_d[x['PAIS_AUTOR']] = x['GEOLOC_CAPITAL_AUTOR']
    place_coord_d[x['PAIS_EJEMPLAR']]= x['GEOLOC_CAPITAL_EJEMPLAR']

CPU times: user 21.2 s, sys: 0 ns, total: 21.2 s
Wall time: 21.2 s


## Coordinates and places dictionary 1

In [7]:
coord_places_d = defaultdict(list)

for k in place_coord_d:
    coord_places_d[place_coord_d[k]].append(k)

coord_places_d = dict(coord_places_d)

## Places dictionary

In [8]:
paises_autor_c = autores_df['PAIS_AUTOR'].value_counts()
paises_ejemplar_c = autores_df['PAIS_EJEMPLAR'].value_counts()
places_c = paises_autor_c.add(paises_ejemplar_c, fill_value=0).astype(int)

In [9]:
def sort_by_frequency(places):
    places_freqs = [(p, places_c[p]) for p in places]
    sorted_freqs = sorted(places_freqs, key=lambda x: x[1], reverse=True)
    sorted_places = [p for p, f in sorted_freqs]
    return sorted_places

In [10]:
place_place_d = {}

for k in coord_places_d:
    sorted_places = sort_by_frequency(coord_places_d[k])
    for p in sorted_places:
        place_place_d[p] = sorted_places[0]

## Coordinates and places dictionary 2

In [11]:
coord_place_d = {}

for k in coord_places_d:
    coord_place_d[k] = place_place_d[coord_places_d[k][0]]

## Cleaning 2

In [12]:
%%time

def process_pais_autor(x):
    return place_place_d[x['PAIS_AUTOR']]

autores_df['PAIS_AUTOR'] = autores_df.apply(process_pais_autor, axis=1)

CPU times: user 2.54 s, sys: 12 ms, total: 2.55 s
Wall time: 2.55 s


In [13]:
%%time

def process_pais_ejemplar(x):
    return place_place_d[x['PAIS_EJEMPLAR']]

autores_df['PAIS_EJEMPLAR'] = autores_df.apply(process_pais_ejemplar, axis=1)

CPU times: user 2.5 s, sys: 8.08 ms, total: 2.51 s
Wall time: 2.51 s


# Resultados

In [14]:
autores_df['TITULO'].nunique()

73920

In [15]:
autores_df['TITULO'].value_counts()

Obras completas                                                 170
Antología poética                                               118
Cuentos                                                          91
Antología                                                        83
Poemas                                                           70
Cuentos completos                                                64
Poesía                                                           57
Poesía completa                                                  50
Obra poética                                                     45
Obras escogidas                                                  43
Teatro                                                           41
Obras                                                            37
Poesías completas                                                35
María                                                            34
Obras selectas                                  

In [16]:
autores_df['PAIS_EJEMPLAR'].nunique()

118

In [17]:
autores_df['PAIS_EJEMPLAR'].value_counts()

Spain               42246
Colombia            12237
Argentina            8428
Mexico               7879
United States        4161
United Kingdom       2466
France               2092
Venezuela            1259
Chile                 900
New York (State)      884
Cuba                  578
Peru                  459
Brazil                457
Italy                 419
Germany               360
Ecuador               305
California            160
Uruguay               153
Canada                139
Japan                 116
Switzerland           114
Austria               110
Portugal              104
Russia                101
Netherlands            86
Costa Rica             83
Massachusetts          69
China                  62
Puerto Rico            56
El Salvador            55
                    ...  
Scotland                2
Jan Mayen               2
Curaçao                 2
Ireland                 2
Wisconsin               2
Uganda                  1
Sierra Leone            1
Malaysia    

In [18]:
autores_df['AUTOR'].nunique()

21997

In [19]:
autores_df['AUTOR'].value_counts()

William Shakespeare             381
Gabriel García Márquez          367
Jules Verne                     220
Karl Marx                       217
Jorge Luis Borges               216
Miguel de Cervantes Saavedra    213
Isaac Asimov                    206
Oscar Wilde                     194
Agatha Christie                 178
Julio Cortázar                  176
Charles Dickens                 170
Franz Kafka                     169
Diana Uribe                     169
Pablo Neruda                    167
Mario Vargas Llosa              162
Edgar Allan Poe                 154
Friedrich Wilhelm Nietzsche     153
Stefan Zweig                    149
Octavio Paz                     147
Robert Louis Stevenson          141
Mario Benedetti                 134
Federico García Lorca           133
Hans Christian Andersen         132
Aldous Leonard Huxley           132
Platón                          130
Honoré de Balzac                129
Jean Paul Sartre                129
Sigmund Freud               

In [20]:
autores_df['PAIS_AUTOR'].nunique()

201

In [21]:
autores_df['PAIS_AUTOR'].value_counts()

United States                       13473
Spain                               10138
France                               8560
United Kingdom                       8084
Colombia                             5712
Germany                              5392
Italy                                4444
Argentina                            3667
Mexico                               2588
Austria                              1579
Chile                                1253
Belgium                              1212
Brazil                               1161
Ireland                              1099
Cuba                                 1097
Canada                               1084
Peru                                 1044
Switzerland                          1019
Venezuela                             961
Russia                                885
Poland                                728
Uruguay                               672
Greece                                620
Netherlands                       

# Rutas

In [22]:
autores_df['PAIS_AUTOR'].nunique()

201

In [23]:
autores_df['PAIS_AUTOR'].value_counts().head()

United States     13473
Spain             10138
France             8560
United Kingdom     8084
Colombia           5712
Name: PAIS_AUTOR, dtype: int64

In [24]:
autores_df['PAIS_EJEMPLAR'].nunique()

118

In [25]:
autores_df['PAIS_EJEMPLAR'].value_counts().head()

Spain            42246
Colombia         12237
Argentina         8428
Mexico            7879
United States     4161
Name: PAIS_EJEMPLAR, dtype: int64

In [26]:
%%time

all_routes = []
for i, x in autores_df.iterrows():
    geoloc_autor = x['GEOLOC_CAPITAL_AUTOR']
    geoloc_ejemplar = x['GEOLOC_CAPITAL_EJEMPLAR']
    all_routes.append((coord_place_d[geoloc_autor], coord_place_d[geoloc_ejemplar]))

CPU times: user 12.2 s, sys: 7.99 ms, total: 12.2 s
Wall time: 12.2 s


In [27]:
cycles = [r for r in all_routes if r[0] == r[1]]
routes = [r for r in all_routes if r[0] != r[1]]

In [28]:
cycles_c = Counter(cycles)
routes_c = Counter(routes)

In [29]:
len(cycles_c)

47

In [30]:
len(routes_c)

1747

In [31]:
top_cycles = cycles_c.most_common()
top_routes = routes_c.most_common()

In [32]:
top_cycles

[(('Spain', 'Spain'), 8001),
 (('Colombia', 'Colombia'), 4897),
 (('Argentina', 'Argentina'), 1824),
 (('United States', 'United States'), 1737),
 (('Mexico', 'Mexico'), 1679),
 (('France', 'France'), 1129),
 (('United Kingdom', 'United Kingdom'), 910),
 (('Venezuela', 'Venezuela'), 486),
 (('Chile', 'Chile'), 340),
 (('Peru', 'Peru'), 301),
 (('Brazil', 'Brazil'), 285),
 (('Cuba', 'Cuba'), 264),
 (('Italy', 'Italy'), 219),
 (('Ecuador', 'Ecuador'), 154),
 (('Germany', 'Germany'), 145),
 (('Uruguay', 'Uruguay'), 85),
 (('Portugal', 'Portugal'), 67),
 (('Russia', 'Russia'), 59),
 (('Japan', 'Japan'), 39),
 (('El Salvador', 'El Salvador'), 24),
 (('Costa Rica', 'Costa Rica'), 20),
 (('Guatemala', 'Guatemala'), 20),
 (('Bolivia', 'Bolivia'), 20),
 (('Switzerland', 'Switzerland'), 17),
 (('Dominican Republic', 'Dominican Republic'), 15),
 (('Nicaragua', 'Nicaragua'), 13),
 (('Norway', 'Norway'), 13),
 (('Panama', 'Panama'), 11),
 (('Puerto Rico', 'Puerto Rico'), 8),
 (('Paraguay', 'Paragua

In [33]:
top_routes

[(('United States', 'Spain'), 6872),
 (('United Kingdom', 'Spain'), 4575),
 (('France', 'Spain'), 4367),
 (('Germany', 'Spain'), 2936),
 (('Italy', 'Spain'), 2536),
 (('France', 'Argentina'), 1133),
 (('United States', 'Colombia'), 1115),
 (('United States', 'Argentina'), 988),
 (('United States', 'Mexico'), 978),
 (('Argentina', 'Spain'), 889),
 (('Austria', 'Spain'), 859),
 (('Spain', 'Colombia'), 763),
 (('France', 'Mexico'), 710),
 (('Belgium', 'Spain'), 644),
 (('Ireland', 'Spain'), 591),
 (('Switzerland', 'Spain'), 588),
 (('United Kingdom', 'United States'), 580),
 (('Germany', 'Argentina'), 553),
 (('United Kingdom', 'Colombia'), 545),
 (('United Kingdom', 'Mexico'), 544),
 (('United States', 'United Kingdom'), 540),
 (('United Kingdom', 'Argentina'), 525),
 (('Spain', 'Mexico'), 518),
 (('Canada', 'Spain'), 515),
 (('United States', 'New York (State)'), 509),
 (('France', 'Colombia'), 500),
 (('Italy', 'Argentina'), 485),
 (('Spain', 'Argentina'), 474),
 (('Germany', 'Mexico')

# Maps

In [34]:
def get_coords(place):
    place = place_place_d[place]
    coord = place_coord_d[place]
    return coord

In [35]:
cycles_graph = nx.DiGraph()
routes_graph = nx.DiGraph()

In [36]:
for k in cycles_c:
    if k[0] not in cycles_graph:
        lat, lng = get_coords(k[0])
        cycles_graph.add_node(k[0], latitude=lat, longitude=lng, weight=cycles_c[k])
    cycles_graph.add_edge(k[0], k[0], weight=cycles_c[k])

In [37]:
for k in routes_c:
    if k[0] not in routes_graph:
        lat, lng = get_coords(k[0])
        routes_graph.add_node(k[0], latitude=lat, longitude=lng, weight=0)
    old_weight = routes_graph.node[k[0]]['weight']
    new_weight = old_weight + routes_c[k]
    routes_graph.node[k[0]]['weight'] = new_weight
    if k[1] not in routes_graph:
        lat, lng = get_coords(k[1])
        routes_graph.add_node(k[1], latitude=lat, longitude=lng, weight=0)
    old_weight = routes_graph.node[k[1]]['weight']
    new_weight = old_weight + routes_c[k]
    routes_graph.node[k[1]]['weight'] = new_weight
    routes_graph.add_edge(k[0], k[1], weight=routes_c[k])

# Filtering labels

In [38]:
def select_top_nodes(g, p):
    ns = [(n, d['weight']) for n, d in g.nodes_iter(data=True)]
    ns = sorted(ns, key=lambda x: x[1], reverse=True)
    amount = round(len(ns) * p)
    taken = ns[:amount]
    labels = [n for n, w in taken]
    return labels

In [39]:
for i in range(10, 100, 10):
    p = i / 100
    top_cycles_labels = select_top_nodes(cycles_graph, p)
    for k in cycles_graph:
        if k in top_cycles_labels:
            cycles_graph.node[k]['show_label_' + str(i)] = k
        else:
            cycles_graph.node[k]['show_label_' + str(i)] = ''

In [40]:
for i in range(10, 100, 10):
    p = i / 100
    top_routes_labels = select_top_nodes(routes_graph, p)
    for k in routes_graph:
        if k in top_routes_labels:
            routes_graph.node[k]['show_label_' + str(i)] = k
        else:
            routes_graph.node[k]['show_label_' + str(i)] = ''

# Saving results

In [41]:
nx.write_gexf(cycles_graph, 'cycles_graph.gexf')
nx.write_gexf(routes_graph, 'routes_graph.gexf')