In [1]:
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

# Préstamos

In [2]:
%%time

dtypes = {
    'TITULO': str,
    'AUTOR': str,
    'SUCURSAL': str,
    'COD_BARRAS_LIBRO': str,
    'DEWEY': str,
    'DESC_LOCALIZACION_EJEMPLAR': str,
    'FECHA_PUB': str,
    'PAIS_EJEMPLAR': str,
    'OCUPACION': str,
    'NIVEL_EDUCACION': str,
    'AREA_ESTUDIO': str,
    'SUCURSAL_PRESTAMO': str,
    'SUCURSAL_DEVOLUCION': str,
    'SUCURSAL_DEL_LECTOR': str,
    'COD_BARRAS_LECTOR': str,
    'CATEGORÍA': str,
    'FECHA_PRESTAMO': str,
    'FECHA_REAL_DEVOLUCION': str,
    'DATE_PRESTAMO': pd.tslib.Timestamp,
    'DATE_DEVOLUCION': pd.tslib.Timestamp,
}

prestamos_df = pd.read_csv('prestamos_totales.csv', header=0, dtype=dtypes, parse_dates=['DATE_PRESTAMO', 'DATE_DEVOLUCION'])
prestamos_df = prestamos_df.fillna('')

CPU times: user 19.8 s, sys: 697 ms, total: 20.5 s
Wall time: 20.5 s


# Autores

In [3]:
%%time

dtypes = {
    'TITULO': str,
    'AUTOR': str,
    'PAIS_EJEMPLAR': str,
    'PAIS_AUTOR': str,
}

autores_df = pd.read_csv('autores3.csv', header=0, dtype=dtypes)
autores_df = autores_df.fillna('')

CPU times: user 5.03 s, sys: 152 ms, total: 5.18 s
Wall time: 5.18 s


In [4]:
autores_df.shape

(3054360, 4)

In [5]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR
0,Un bicho extraño,Spain,Mon Daporta,DESCONOCIDO
1,Un bicho extraño,Spain,Mon Daporta,DESCONOCIDO
2,Quisiera tener ...,Mexico,Giovanna Zoboli,DESCONOCIDO
3,Johannes Gutenberg,Spain,Lluís Borràs Perelló,DESCONOCIDO
4,La fantástica leyenda de: la princesa y el dragón,Spain,Sonia Alins,DESCONOCIDO


## Limpiando datos

In [6]:
autores_df = autores_df[
    (autores_df['AUTOR'] != 'DESCONOCIDO') &
    (autores_df['TITULO'] != 'DESCONOCIDO') &
    (autores_df['PAIS_AUTOR'] != 'DESCONOCIDO') &
    (autores_df['PAIS_EJEMPLAR'] != 'DESCONOCIDO')
]

In [7]:
autores_df.shape

(1064910, 4)

In [8]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR
10,Adivina en qué pais ...,Spain,Martina Badstuber,Germany
11,Adivina en qué pais ...,Spain,Martina Badstuber,Germany
12,Adivina en qué pais ...,Spain,Martina Badstuber,Germany
14,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico
15,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico


In [9]:
autores_df.drop_duplicates(['AUTOR', 'TITULO'], inplace=True)

In [10]:
autores_df.shape

(76697, 4)

In [11]:
autores_df.head()

Unnamed: 0,TITULO,PAIS_EJEMPLAR,AUTOR,PAIS_AUTOR
10,Adivina en qué pais ...,Spain,Martina Badstuber,Germany
14,¡No quiero hacer pipí en el orinal!,Spain,Roser Rius,Mexico
19,Humo,Spain,Antón Fortes Torres,Spain
21,Galleta para perros,Spain,Helen Cooper,United Kingdom
25,Autobio,Spain,Cyril Pedrosa,France


## Resultados

In [12]:
autores_df['TITULO'].nunique()

73916

In [13]:
autores_df['TITULO'].value_counts()

Obras completas                                                 113
Antología poética                                                79
Antología                                                        71
Poemas                                                           61
Cuentos                                                          52
Cuentos completos                                                49
Poesía                                                           46
Obras escogidas                                                  35
Obras                                                            34
Teatro                                                           33
Obra poética                                                     33
Obras selectas                                                   32
Poesía completa                                                  31
Antología personal                                               26
Poesías completas                               

In [14]:
autores_df['PAIS_EJEMPLAR'].nunique()

130

In [15]:
autores_df['PAIS_EJEMPLAR'].value_counts()

Spain                                 37448
Colombia                               9876
Argentina                              7286
Mexico                                 6792
United States                          3908
France                                 1977
England                                1791
Venezuela                              1134
New York (State)                        855
Chile                                   799
Cuba                                    487
Brazil                                  447
United Kingdom                          439
Peru                                    420
Italy                                   383
Germany                                 294
Ecuador                                 280
California                              156
Uruguay                                 142
Canada                                  138
Switzerland                             114
Austria                                 108
Japan                           

In [16]:
autores_df['AUTOR'].nunique()

22060

In [17]:
autores_df['AUTOR'].value_counts()

William Shakespeare             230
Gabriel García Márquez          177
Diana Uribe                     169
Isaac Asimov                    164
Karl Marx                       160
Jorge Luis Borges               144
Oscar Wilde                     130
Agatha Christie                 129
Friedrich Wilhelm Nietzsche     120
Johann Sebastian Bach           117
Edgar Allan Poe                 116
Octavio Paz                     111
Noam Chomsky                    110
Ludwig van Beethoven            110
Jordi Sierra i Fabra            108
Georges Simenon                 106
Charles Dickens                 105
Stefan Zweig                    105
Pablo Neruda                    103
Jules Verne                     102
Mario Benedetti                 101
Jean Paul Sartre                 96
Miguel de Cervantes Saavedra     96
Arthur Conan Doyle               95
Federico García Lorca            95
Fernando Pessoa                  93
Honoré de Balzac                 93
Sigmund Freud               

In [18]:
autores_df['PAIS_AUTOR'].nunique()

410

In [19]:
autores_df['PAIS_AUTOR'].value_counts()

United States of America                       10910
Spain                                           8153
France                                          7154
United Kingdom                                  6141
Colombia                                        4414
Germany                                         3635
Argentina                                       3354
Italy                                           3207
Mexico                                          2132
Chile                                           1064
Brazil                                          1033
Belgium                                          962
Canada                                           947
United States                                    893
Ireland                                          892
Venezuela                                        867
España                                           856
Cuba                                             810
Switzerland                                   

# Mapping countries

In [20]:
countries = set(autores_df['PAIS_EJEMPLAR']).union(autores_df['PAIS_AUTOR'])

In [21]:
inv_mappings = defaultdict(set)
for a in countries:
    for b in countries:
        if a != b and a in b:
            inv_mappings[a].add(b)
inv_mappings = dict(inv_mappings)

mapped = set()
for s in inv_mappings.values():
    mapped = mapped.union(s)

unmapped = countries.difference(mapped)

In [22]:
inv_mappings

{'Afghanistan': {'Kingdom of Afghanistan'},
 'Albania': {"People's Socialist Republic of Albania"},
 'Austria': {'Austria-Hungary',
  'Austrian Empire',
  'First Austrian Republic',
  'Republic of German-Austria'},
 'Azerbaijan': {'Azerbaijan Soviet Socialist Republic'},
 'Brazil': {'Empire of Brazil'},
 'Bulgaria': {'Kingdom of Bulgaria', "People's Republic of Bulgaria"},
 'Burma': {'British rule in Burma'},
 'China': {'China (Republic : 1949- )',
  "People's Republic of China",
  'Republic of China (1912–49)',
  'República Popular China'},
 'Colombia': {'Estados Unidos de Colombia',
  'Gran Colombia',
  'United States of Colombia'},
 'Croatia': {'Kingdom of Croatia-Slavonia', 'Socialist Republic of Croatia'},
 'Cuba': {'Captaincy General of Cuba', 'Republic of Cuba (1902–59)'},
 'Denmark': {'Kingdom of Denmark'},
 'Dominica': {'Dominican Republic', 'República Dominicana'},
 'Egipto': {'Jedivato de Egipto'},
 'Egypt': {'Khedivate of Egypt', 'Kingdom of Egypt', 'Sultanate of Egypt'},
 

In [23]:
mappings = {}
for c1 in inv_mappings:
    for c2 in inv_mappings[c1]:
        mappings[c2] = c1

In [24]:
mappings

{'Allied-occupied Germany': 'Germany',
 'Austria-Hungary': 'Hungary',
 'Austrian Empire': 'Austria',
 'Azerbaijan Soviet Socialist Republic': 'Azerbaijan',
 'British Kenya': 'Kenya',
 'British rule in Burma': 'Burma',
 'Captaincy General of Cuba': 'Cuba',
 'Captaincy General of Venezuela': 'Venezuela',
 'Caribbean Netherlands': 'Netherlands',
 'China (Republic : 1949- )': 'China',
 'Colonial Nigeria': 'Nigeria',
 'Colony of Virginia': 'Virginia',
 'Congress Poland': 'Poland',
 'Crown Colony of Malta': 'Malta',
 'Democratic Republic of the Congo': 'Republic of the Congo',
 'Dominican Republic': 'Dominica',
 'Dominion of India': 'India',
 'Dominion of New Zealand': 'New Zealand',
 'Dominion of Pakistan': 'Pakistan',
 'Duchy of Württemberg': 'Württemberg',
 'East Germany': 'Germany',
 'East Pakistan': 'Pakistan',
 'Electorate of Württemberg': 'Württemberg',
 'Empire of Brazil': 'Brazil',
 'Empire of Japan': 'Japan',
 'Equatorial Guinea': 'Guinea',
 'Estados Unidos de Colombia': 'Colombia'

In [25]:
# Correct wrong mappings

del(mappings['Dominican Republic'])
mappings['Estados Unidos de Colombia'] = 'Colombia'
del(mappings['France'])
mappings['Francia'] = 'France'
mappings['Francoist Spain'] = 'Spain'
mappings['German military administration in occupied France during World War II'] = 'France'
del(mappings['Indiana'])
mappings['Jedivato de Egipto'] = 'Egypt'
mappings['Kingdom of France'] = 'France'
del(mappings['New Jersey'])
mappings['New Spain'] = 'Mexico'
mappings['North German Confederation'] = 'Germany'
del(mappings['Northern Ireland'])
mappings['Provincias Unidas de los Países Bajos'] = 'Netherlands'
mappings['Reino Unido de Gran Bretaña e Irlanda'] = 'United Kingdom'
mappings['República Dominicana'] = 'Dominican Republic'
mappings['República Socialista Federativa Soviética de Rusia'] = 'Russia'
del(mappings['Saint Kitts and Nevis'])
mappings['United Kingdom of Great Britain and Ireland'] = 'United Kingdom'
mappings['United States of America'] = 'United States'
mappings['United States of Colombia'] = 'Colombia'
del(mappings['United States of the Ionian Islands'])
mappings['Unión entre Suecia y Noruega'] = 'Sweden'
mappings['Vichy France'] = 'France'

In [26]:
unmapped

{'Afghanistan',
 'Alabama',
 'Albania',
 'Alemania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua Atenas',
 'Antigua Roma',
 'Antigua and Barbuda',
 'Arabia Saudita',
 'Argentina',
 'Arizona',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Azerbaiyán',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Basque Country (greater region)',
 'Belarus',
 'Belgian Congo',
 'Belgium',
 'Benin',
 'Bielorrusia',
 'Bolivia',
 'Bourbon Restoration',
 'Brasil',
 'Brazil',
 'British America',
 'British Ceylon',
 'British Empire',
 'British Raj',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Bélgica',
 'California',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Canadá',
 'Canal Zone',
 'Cape Verde',
 'Chile',
 'China',
 'Colombia',
 'Colorado',
 'Congo Belga',
 'Connecticut',
 'Costa Rica',
 'Croacia',
 'Croatia',
 'Crown of Aragon',
 'Crown of Castile',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Czechoslovakia',
 "Côte d'Ivoire",
 'Denmark',
 'Dinamarca',
 'District of Columbia',
 'Dominica',
 'Dominion of Newfou

In [27]:
# Correct unmapped

mappings['Alemania'] = 'Germany'
mappings['Antigua Atenas'] = 'Greek'
mappings['Antigua Roma'] = 'Italy'
mappings['Arabia Saudita'] = 'Saudi Arabia'
mappings['Azerbaiyán'] = 'Azerbaijan'
mappings['Basque Country (greater region)'] = 'Spain'
mappings['Brasil'] = 'Brazil'
mappings['British Empire'] = 'United Kingdom'
mappings['Bélgica'] = 'Belgium'
mappings['Canadá'] = 'Canada'
mappings['Congo Belga'] = 'Belgian Congo'
mappings['Croacia'] = 'Croatia'
mappings['Crown of Aragon'] = 'Spain'
mappings['Crown of Castile'] = 'Spain'
mappings['Dinamarca'] = 'Denmark'
mappings['First Czechoslovak Republic'] = 'Czechoslovakia'
mappings['First French Empire'] = 'France'
mappings['First Hungarian Republic'] = 'Hungary'
mappings['First Portuguese Republic'] = 'Portugal'
mappings['Free and Independent State of Cundinamarca'] = 'Colombia'
mappings['French First Republic'] = 'France'
mappings['French Third Republic'] = 'France'
mappings['French colonial empire'] = 'France'
mappings['German Confederation'] = 'Germany'
mappings['German Empire'] = 'Germany'
mappings['Grecia'] = 'Greece'
mappings['Haití'] = 'Haiti'
mappings['Holy Roman Empire'] = 'Italy'
mappings['Hungría'] = 'Hungary'
mappings['Imperio Otomano'] = 'Ottoman Empire'
mappings['Imperio alemán'] = 'Germany'
mappings['Imperio austrohúngaro'] = 'Austria'
mappings['Imperio otomano'] = 'Ottoman Empire'
mappings['Imperio romano'] = 'Italy'
mappings['Imperio ruso'] = 'Russia'
mappings['Irak'] = 'Iraq'
mappings['Irlanda'] = 'Ireland'
mappings['Irán'] = 'Iran'
mappings['Japón'] = 'Japan'
mappings['Kingdom of Great Britain'] = 'United Kingdom'
mappings['Kingdom of Yugoslavia'] = 'Yugoslavia'
mappings['Korea (South)'] = 'South Korea'
mappings['Korea, Republic of'] = 'South Korea'
mappings['Libia'] = 'Libya'
mappings['Lituania'] = 'Lithuania'
mappings['Líbano'] = 'Lebanon'
mappings['Mandato británico de Palestina'] = 'Palestine'
mappings['Mandatory Palestine'] = 'Palestine'
mappings['Marruecos'] = 'Morocco'
mappings['México'] = 'Mexico'
mappings['Noruega'] = 'Norway'
mappings['Palestinian National Authority'] = 'Palestine'
mappings['Palestinian Territory, Occupied'] = 'Palestine'
mappings['Panamá'] = 'Panama'
mappings['Países Bajos'] = 'Netherlands'
mappings['Perú'] = 'Peru'
mappings['Polonia'] = 'Poland'
mappings['Reino Unido'] = 'United Kingdom'
mappings['Reino de Inglaterra'] = 'England'
mappings['Reino de Prusia'] = 'Prussia'
mappings['Reino de Yugoslavia'] = 'Yugoslavia'
mappings['República Checa'] = 'Czech Republic'
mappings['República romana'] = 'Italy'
mappings['Rumania'] = 'Romania'
mappings['Rusia'] = 'Russia'
mappings['Sacro Imperio Romano Germánico'] = 'Germany'
mappings['Second French Empire'] = 'France'
mappings['Second Hellenic Republic'] = 'Greece'
mappings['Second Polish Republic'] = 'Poland'
mappings['Socialist Federal Republic of Yugoslavia'] = 'Yugoslavia'
mappings['Soviet Union'] = 'Russia'
mappings['Spanish Empire'] = 'Spain'
mappings['Sudáfrica'] = 'South Africa'
mappings['Suecia'] = 'Sweden'
mappings['Suiza'] = 'Switzerland'
mappings['Tajik Soviet Socialist Republic'] = 'Tajikistan'
mappings['Territorios Palestinos'] = 'Palestine'
mappings["Ukrainian People's Republic"] = 'Ukraine'
mappings['Ukrainian Soviet Socialist Republic'] = 'Ukraine'
mappings['Unión Soviética'] = 'Russia'
mappings['Uzbekistán'] = 'Uzbekistan'
mappings['Virreinato de Nueva Granada'] = 'Viceroyalty of New Granada'

## Correcting countries

In [28]:
%%time

def process_pais_ejemplar(x):
    c = x['PAIS_EJEMPLAR']
    return mappings.get(c, c)

autores_df['PAIS_EJEMPLAR'] = autores_df.apply(process_pais_ejemplar, axis=1)

CPU times: user 2.25 s, sys: 12.1 ms, total: 2.26 s
Wall time: 2.26 s


In [29]:
%%time

def process_pais_autor(x):
    c = x['PAIS_AUTOR']
    return mappings.get(c, c)

autores_df['PAIS_AUTOR'] = autores_df.apply(process_pais_autor, axis=1)

CPU times: user 2.28 s, sys: 0 ns, total: 2.28 s
Wall time: 2.28 s


In [30]:
autores_df['PAIS_EJEMPLAR'].nunique()

121

In [31]:
autores_df['PAIS_AUTOR'].nunique()

245

# Saving results

In [32]:
autores_df.to_csv('autores4.csv', index=False)