# Maps

In [1]:
import ast
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from geolocation.main import GoogleMaps
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

from googlemaps_key import KEY

# Autores

In [2]:
%%time

autores_df = pd.read_csv(
    'autores8.csv',
    header=0,
    converters={
        'GEOLOC_CAPITAL_AUTOR': ast.literal_eval,
        'GEOLOC_CAPITAL_EJEMPLAR': ast.literal_eval
    }
)
autores_df = autores_df.fillna('')

CPU times: user 2.13 s, sys: 21.9 ms, total: 2.15 s
Wall time: 2.15 s


In [3]:
autores_df.shape

(61857, 10)

In [4]:
autores_df.head()

Unnamed: 0,TITULO,FECHA_PUB,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR,NOMBRE_LOCALIZADO,CAPITAL_AUTOR,CAPITAL_EJEMPLAR,GEOLOC_CAPITAL_AUTOR,GEOLOC_CAPITAL_EJEMPLAR
0,Adivina en qué pais ...,2010,Spain,Martina Badstuber,Germany,Martina Badstuber,Berlin,Madrid,"(52.52000659999999, 13.404954)","(40.4167754, -3.7037902)"
1,Galleta para perros,2008,Spain,Helen Cooper,United Kingdom,Helen Cooper,London,Madrid,"(51.5073509, -0.1277583)","(40.4167754, -3.7037902)"
2,Codrilo,2009,Spain,Roberto Aliaga Sánchez,Spain,Roberto Aliaga,Madrid,Madrid,"(40.4167754, -3.7037902)","(40.4167754, -3.7037902)"
3,"Félix, : el coleccionista de miedos",2009,Spain,Fina Casalderrey,Spain,Fina Casalderrey,Madrid,Madrid,"(40.4167754, -3.7037902)","(40.4167754, -3.7037902)"
4,En la laguna más profunda,2011,Colombia,Oscar Collazos,Colombia,Oscar Collazos,Bogotá,Bogotá,"(4.710988599999999, -74.072092)","(4.710988599999999, -74.072092)"


# Places and coordinates

## Places and coordinates dictionary

In [5]:
%%time

place_coord_d = {}

for i, x in autores_df.iterrows():
    place_coord_d[x['PAIS_AUTOR']] = x['GEOLOC_CAPITAL_AUTOR']
    place_coord_d[x['PAIS_EJEMPLAR']]= x['GEOLOC_CAPITAL_EJEMPLAR']

CPU times: user 10.4 s, sys: 7.06 ms, total: 10.4 s
Wall time: 10.4 s


## Coordinates and places dictionary 1

In [6]:
coord_places_d = defaultdict(list)

for k in place_coord_d:
    coord_places_d[place_coord_d[k]].append(k)

coord_places_d = dict(coord_places_d)

## Places dictionary

In [7]:
paises_autor_c = autores_df['PAIS_AUTOR'].value_counts()
paises_ejemplar_c = autores_df['PAIS_EJEMPLAR'].value_counts()
places_c = paises_autor_c.add(paises_ejemplar_c, fill_value=0).astype(int)

In [8]:
def sort_by_frequency(places):
    places_freqs = [(p, places_c[p]) for p in places]
    sorted_freqs = sorted(places_freqs, key=lambda x: x[1], reverse=True)
    sorted_places = [p for p, f in sorted_freqs]
    return sorted_places

In [9]:
place_place_d = {}

for k in coord_places_d:
    sorted_places = sort_by_frequency(coord_places_d[k])
    for p in sorted_places:
        place_place_d[p] = sorted_places[0]

## Coordinates and places dictionary 2

In [10]:
coord_place_d = {}

for k in coord_places_d:
    coord_place_d[k] = place_place_d[coord_places_d[k][0]]

## Cleaning

In [11]:
%%time

def process_pais_autor(x):
    return place_place_d[x['PAIS_AUTOR']]

autores_df['PAIS_AUTOR'] = autores_df.apply(process_pais_autor, axis=1)

CPU times: user 1.94 s, sys: 11.9 ms, total: 1.95 s
Wall time: 1.95 s


In [12]:
%%time

def process_pais_ejemplar(x):
    return place_place_d[x['PAIS_EJEMPLAR']]

autores_df['PAIS_EJEMPLAR'] = autores_df.apply(process_pais_ejemplar, axis=1)

CPU times: user 1.71 s, sys: 4.03 ms, total: 1.71 s
Wall time: 1.71 s


# Resultados

In [13]:
autores_df['TITULO'].nunique()

50872

In [14]:
autores_df['TITULO'].value_counts()

Obras completas                                                 146
Antología poética                                               111
Cuentos                                                          93
Antología                                                        72
Poemas                                                           69
Cuentos completos                                                63
Poesía                                                           57
Poesía completa                                                  50
Obras escogidas                                                  41
Teatro                                                           41
Obra poética                                                     37
Obras                                                            37
Poesías completas                                                36
María                                                            34
Poesías                                         

In [15]:
autores_df['PAIS_EJEMPLAR'].nunique()

107

In [16]:
autores_df['PAIS_EJEMPLAR'].value_counts()

Spain                32224
Colombia              8009
Argentina             6227
Mexico                5390
United States         2382
United Kingdom        1497
Francia               1336
Venezuela              804
Chile                  668
New York (State)       537
Cuba                   467
Brazil                 366
Peru                   286
Italy                  255
Ecuador                194
Germany                146
Uruguay                119
Portugal                93
Russia                  76
Costa Rica              53
Japan                   50
California              50
China                   48
Massachusetts           40
Puerto Rico             39
Switzerland             35
Canada                  34
El Salvador             29
Bolivia                 24
Nicaragua               22
                     ...  
Poland                   2
Honduras                 2
Ireland                  2
Minnesota                2
Maine                    1
Egypt                    1
N

In [17]:
autores_df['NOMBRE_LOCALIZADO'].nunique()

11440

In [18]:
autores_df['NOMBRE_LOCALIZADO'].value_counts()

William Shakespeare               381
Gabriel García Márquez            367
Jules Verne                       220
Jorge Luis Borges                 216
Miguel de Cervantes               213
Isaac Asimov                      206
Oscar Wilde                       194
Agatha Christie                   178
Julio Cortázar                    176
Charles Dickens                   170
Franz Kafka                       169
Pablo Neruda                      167
Mario Vargas Llosa                162
Edgar Allan Poe                   154
Friedrich Nietzsche               153
Stefan Zweig                      149
Octavio Paz                       147
Robert Louis Stevenson            141
Mario Benedetti                   134
Federico García Lorca             133
Leonard Huxley                    132
Hans Christian Andersen           132
Jean Paul                         131
Honoré de Balzac                  129
Hermann Hesse                     123
Aristóteles                       122
Arthur Conan

In [19]:
autores_df['PAIS_AUTOR'].nunique()

146

In [20]:
autores_df['PAIS_AUTOR'].value_counts()

United States            9525
Spain                    7869
United Kingdom           6818
Francia                  5988
Colombia                 3450
Germany                  3327
Italy                    2774
Argentina                2707
Mexico                   1943
Cuba                     1027
Chile                     923
Austria                   910
Belgium                   910
Peru                      897
Ireland                   839
Russia                    777
Brazil                    749
Uruguay                   732
India                     632
Poland                    612
Switzerland               516
Canada                    511
Venezuela                 475
Ukraine                   437
Czech Republic            399
Japan                     362
Portugal                  355
Greece                    318
Netherlands               299
Algeria                   269
                         ... 
Taiwan                      3
Trinidad and Tobago         3
Guyana    

# Rutas

In [21]:
autores_df['PAIS_AUTOR'].nunique()

146

In [22]:
autores_df['PAIS_AUTOR'].value_counts().head()

United States     9525
Spain             7869
United Kingdom    6818
Francia           5988
Colombia          3450
Name: PAIS_AUTOR, dtype: int64

In [23]:
autores_df['PAIS_EJEMPLAR'].nunique()

107

In [24]:
autores_df['PAIS_EJEMPLAR'].value_counts().head()

Spain            32224
Colombia          8009
Argentina         6227
Mexico            5390
United States     2382
Name: PAIS_EJEMPLAR, dtype: int64

In [25]:
%%time

all_routes = []
for i, x in autores_df.iterrows():
    geoloc_autor = x['GEOLOC_CAPITAL_AUTOR']
    geoloc_ejemplar = x['GEOLOC_CAPITAL_EJEMPLAR']
    all_routes.append((coord_place_d[geoloc_autor], coord_place_d[geoloc_ejemplar]))

CPU times: user 8.89 s, sys: 27.6 ms, total: 8.92 s
Wall time: 8.91 s


In [26]:
cycles = [r for r in all_routes if r[0] == r[1]]
routes = [r for r in all_routes if r[0] != r[1]]

In [27]:
cycles_c = Counter(cycles)
routes_c = Counter(routes)

In [28]:
len(cycles_c)

43

In [29]:
len(routes_c)

1208

In [30]:
top_cycles = cycles_c.most_common()
top_routes = routes_c.most_common()

In [31]:
top_cycles

[(('Spain', 'Spain'), 6420),
 (('Colombia', 'Colombia'), 2883),
 (('Mexico', 'Mexico'), 1364),
 (('Argentina', 'Argentina'), 1346),
 (('United States', 'United States'), 949),
 (('Francia', 'Francia'), 766),
 (('United Kingdom', 'United Kingdom'), 685),
 (('Venezuela', 'Venezuela'), 280),
 (('Chile', 'Chile'), 247),
 (('Brazil', 'Brazil'), 239),
 (('Cuba', 'Cuba'), 223),
 (('Peru', 'Peru'), 199),
 (('Italy', 'Italy'), 145),
 (('Ecuador', 'Ecuador'), 99),
 (('Uruguay', 'Uruguay'), 79),
 (('Russia', 'Russia'), 65),
 (('Portugal', 'Portugal'), 64),
 (('Germany', 'Germany'), 55),
 (('Japan', 'Japan'), 29),
 (('Costa Rica', 'Costa Rica'), 25),
 (('Bolivia', 'Bolivia'), 20),
 (('El Salvador', 'El Salvador'), 13),
 (('Dominican Republic', 'Dominican Republic'), 10),
 (('Nicaragua', 'Nicaragua'), 10),
 (('Panama', 'Panama'), 7),
 (('Switzerland', 'Switzerland'), 5),
 (('Guatemala', 'Guatemala'), 5),
 (('Paraguay', 'Paraguay'), 4),
 (('China', 'China'), 3),
 (('Canada', 'Canada'), 3),
 (('Czech

In [32]:
top_routes

[(('United States', 'Spain'), 5228),
 (('United Kingdom', 'Spain'), 3912),
 (('Francia', 'Spain'), 3166),
 (('Germany', 'Spain'), 2009),
 (('Italy', 'Spain'), 1723),
 (('United States', 'Colombia'), 900),
 (('United States', 'Argentina'), 829),
 (('Francia', 'Argentina'), 785),
 (('Argentina', 'Spain'), 701),
 (('United States', 'Mexico'), 639),
 (('Austria', 'Spain'), 563),
 (('Spain', 'Colombia'), 538),
 (('Belgium', 'Spain'), 516),
 (('United Kingdom', 'Colombia'), 496),
 (('United Kingdom', 'Argentina'), 491),
 (('Francia', 'Mexico'), 482),
 (('United Kingdom', 'Mexico'), 481),
 (('United Kingdom', 'United States'), 475),
 (('Russia', 'Spain'), 448),
 (('Ireland', 'Spain'), 439),
 (('Poland', 'Spain'), 406),
 (('Francia', 'Colombia'), 389),
 (('Germany', 'Argentina'), 388),
 (('Cuba', 'Spain'), 387),
 (('Spain', 'Argentina'), 347),
 (('Argentina', 'Colombia'), 342),
 (('Switzerland', 'Spain'), 338),
 (('India', 'Spain'), 336),
 (('Spain', 'Mexico'), 319),
 (('United States', 'New Y

# Maps

In [33]:
def get_coords(place):
    place = place_place_d[place]
    coord = place_coord_d[place]
    return coord

In [34]:
cycles_graph = nx.DiGraph()
routes_graph = nx.DiGraph()

In [35]:
for k in cycles_c:
    if k[0] not in cycles_graph:
        lat, lng = get_coords(k[0])
        cycles_graph.add_node(k[0], latitude=lat, longitude=lng, weight=cycles_c[k])
    cycles_graph.add_edge(k[0], k[0], weight=cycles_c[k])

In [36]:
for k in routes_c:
    if k[0] not in routes_graph:
        lat, lng = get_coords(k[0])
        routes_graph.add_node(k[0], latitude=lat, longitude=lng, weight=0)
    old_weight = routes_graph.node[k[0]]['weight']
    new_weight = old_weight + routes_c[k]
    routes_graph.node[k[0]]['weight'] = new_weight
    if k[1] not in routes_graph:
        lat, lng = get_coords(k[1])
        routes_graph.add_node(k[1], latitude=lat, longitude=lng, weight=0)
    old_weight = routes_graph.node[k[1]]['weight']
    new_weight = old_weight + routes_c[k]
    routes_graph.node[k[1]]['weight'] = new_weight
    routes_graph.add_edge(k[0], k[1], weight=routes_c[k])

# Filtering labels

In [37]:
def select_top_nodes(g, p):
    ns = [(n, d['weight']) for n, d in g.nodes_iter(data=True)]
    ns = sorted(ns, key=lambda x: x[1], reverse=True)
    amount = round(len(ns) * p)
    taken = ns[:amount]
    labels = [n for n, w in taken]
    return labels

In [38]:
for i in range(10, 100, 10):
    p = i / 100
    top_cycles_labels = select_top_nodes(cycles_graph, p)
    for k in cycles_graph:
        if k in top_cycles_labels:
            cycles_graph.node[k]['show_label_' + str(i)] = k
        else:
            cycles_graph.node[k]['show_label_' + str(i)] = ''

In [39]:
for i in range(10, 100, 10):
    p = i / 100
    top_routes_labels = select_top_nodes(routes_graph, p)
    for k in routes_graph:
        if k in top_routes_labels:
            routes_graph.node[k]['show_label_' + str(i)] = k
        else:
            routes_graph.node[k]['show_label_' + str(i)] = ''

# Saving results

In [40]:
nx.write_gexf(cycles_graph, 'gephi/cycles_graph.gexf')
nx.write_gexf(routes_graph, 'gephi/routes_graph.gexf')