# Usuarios y títulos

In [1]:
import csv
import os
import re
from collections import defaultdict
from collections import Counter
from datetime import datetime
from datetime import timedelta
from string import punctuation

import matplotlib
import numpy as np
import networkx as nx
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from SPARQLWrapper import SPARQLWrapper, JSON

# Préstamos

In [2]:
%%time

dtypes = {
    'TITULO': str,
    'AUTOR': str,
    'SUCURSAL': str,
    'COD_BARRAS_LIBRO': str,
    'DEWEY': str,
    'DESC_LOCALIZACION_EJEMPLAR': str,
    'FECHA_PUB': str,
    'PAIS_EJEMPLAR': str,
    'OCUPACION': str,
    'NIVEL_EDUCACION': str,
    'AREA_ESTUDIO': str,
    'SUCURSAL_PRESTAMO': str,
    'SUCURSAL_DEVOLUCION': str,
    'SUCURSAL_DEL_LECTOR': str,
    'COD_BARRAS_LECTOR': str,
    'CATEGORÍA': str,
    'FECHA_PRESTAMO': str,
    'FECHA_REAL_DEVOLUCION': str,
    'DATE_PRESTAMO': pd.tslib.Timestamp,
    'DATE_DEVOLUCION': pd.tslib.Timestamp,
}

prestamos_df = pd.read_csv('prestamos_totales.csv', header=0, dtype=dtypes, parse_dates=['DATE_PRESTAMO', 'DATE_DEVOLUCION'])
prestamos_df = prestamos_df.fillna('')

CPU times: user 19.2 s, sys: 702 ms, total: 19.9 s
Wall time: 19.9 s


In [3]:
prestamos_df.shape

(3054360, 20)

In [4]:
prestamos_df.head()

Unnamed: 0,TITULO,AUTOR,SUCURSAL,COD_BARRAS_LIBRO,DEWEY,DESC_LOCALIZACION_EJEMPLAR,FECHA_PUB,PAIS_EJEMPLAR,OCUPACION,NIVEL_EDUCACION,AREA_ESTUDIO,SUCURSAL_PRESTAMO,SUCURSAL_DEVOLUCION,SUCURSAL_DEL_LECTOR,COD_BARRAS_LECTOR,CATEGORÍA,FECHA_PRESTAMO,FECHA_REAL_DEVOLUCION,DATE_PRESTAMO,DATE_DEVOLUCION
0,Un bicho extraño,"Daporta, Mon.",ARM,2298061,863,Infantil,2010,sp,,,,ARM,ARM,ARM,88429527,CTA,22/01/2015,04/02/2015,2015-01-22,2015-02-04
1,Un bicho extraño,"Daporta, Mon.",ARM,2298061,863,Infantil,2010,sp,,,,ARM,ARM,ARM,88416381,CTA,31/03/2015,07/04/2015,2015-03-31,2015-04-07
2,Quisiera tener ...,"Zoboli, Giovanna.",ARM,2284458,853,Infantil,2010,mx,,,,ARM,ARM,ARM,88381353,CTA,07/11/2015,17/11/2015,2015-11-07,2015-11-17
3,Johannes Gutenberg,"Borràs Perelló, Lluís.",ARM,2288922,925,Infantil,2010,sp,Empleado,Técnico,Adm. de empresas,ARM,ARM,ARM,88007424,CEM,16/01/2015,16/01/2015,2015-01-16,2015-01-16
4,La fantástica leyenda de: la princesa y el dragón,"Alins, Sonia.",ARM,2288896,863,Infantil,2010,sp,,,,ARM,ARM,ARM,88429529,CTA,21/01/2015,28/01/2015,2015-01-21,2015-01-28


# Country codes

## LOC/MARC

http://www.loc.gov/marc/countries/countries_code.html

In [5]:
loc_marc_df = pd.read_csv('codes-loc-marc-countries.csv', sep='\t', names=['code','country'])
loc_marc_sr = pd.Series(data=list(loc_marc_df['country']), index=list(loc_marc_df['code']))

In [6]:
loc_marc_sr.head()

aa                          Albania
abc                         Alberta
-ac     Ashmore and Cartier Islands
aca    Australian Capital Territory
ae                          Algeria
dtype: object

## ISO

http://www.nationsonline.org/oneworld/country_code_list.htm

In [7]:
iso_df = pd.read_csv('codes-iso-countries.csv', sep='\t', names=['country', 'iso-2', 'iso-3', 'code'])
iso2_sr = pd.Series(data=list(iso_df['country']), index=map(lambda x: str(x).lower(), iso_df['iso-2']))
iso3_sr = pd.Series(data=list(iso_df['country']), index=map(lambda x: str(x).lower(), iso_df['iso-3']))

In [8]:
iso2_sr.head()

af       Afghanistan
ax     Aland Islands
al           Albania
dz           Algeria
as    American Samoa
dtype: object

In [9]:
iso3_sr.head()

afg       Afghanistan
ala     Aland Islands
alb           Albania
dza           Algeria
asm    American Samoa
dtype: object

## Mine

In [10]:
missing_codes_dict = {
    'uk': 'United Kingdom',
    'en': 'England',
    'spa': 'Spain',
    'CK': 'Colombia',
    'eng': 'England',
    'ger': 'Germany',
    'uy|': 'Uruguay',
    'SP': 'Spain',
    'AG': 'Argentina',
    'por': 'Portugal',
    'jap': 'Japan',
    'ur': 'Uruguay',
    'fl': 'Florida',
    'flo': 'Florida',
    'fre': 'French Guiana',
    'he': 'Heard Island and Mcdonald Islands',
    'ka': 'Kansas',
    'lou': 'Louisiana',
    'pu': 'Puerto Rico',
    'wa': 'Washington (State)',
}

## País ejemplar

In [11]:
prestamos_df['PAIS_EJEMPLAR'].value_counts()

sp     1194374
ck      968489
mx      366393
ag      180189
???      72351
us       46255
ve       28224
enk      19247
cl       17621
nyu      15367
fr       12416
gx       10072
cau       9985
pe        9473
ec        9341
bl        8413
en        7996
cu        7249
it        6253
gw        5767
uk        5555
xxu       3088
cc        2974
mdu       2566
uy        2397
cr        2313
spa       2235
          1850
ur        1794
mau       1623
        ...   
fs           1
usr          1
pol          1
ka           1
sbe          1
hi           1
bbo          1
mxs          1
nus          1
ly           1
aq           1
quc          1
sjp          1
ev           1
as           1
rp           1
ae           1
utu          1
mtu          1
usm          1
jo           1
ckd          1
sus          1
bh           1
zz           1
tr           1
hn           1
wa           1
nt           1
spy          1
Name: PAIS_EJEMPLAR, dtype: int64

In [12]:
total_countries = set(list(loc_marc_sr.values) + list(iso2_sr.values) + list(iso3_sr.values))

In [13]:
%%time

def process_pais(x):
    a = x['PAIS_EJEMPLAR'].strip()
    if a in loc_marc_sr:
        r = loc_marc_sr[a]
    elif '-' + a in loc_marc_sr:
        r = loc_marc_sr['-' + a]
    elif a in iso3_sr:
        r = iso3_sr[a]
    elif a in iso2_sr:
        r = iso2_sr[a]
    elif a in missing_codes_dict:
        r = missing_codes_dict[a]
    else:
        r = 'DESCONOCIDO'
    return r

prestamos_df['PAIS_EJEMPLAR'] = prestamos_df.apply(process_pais, axis=1)

CPU times: user 7min 54s, sys: 945 ms, total: 7min 55s
Wall time: 7min 54s


In [14]:
prestamos_df['PAIS_EJEMPLAR'].value_counts()

Spain                                          1196618
Colombia                                        968719
Mexico                                          366627
Argentina                                       180773
DESCONOCIDO                                      86413
United States                                    49631
Venezuela                                        28224
England                                          27375
Chile                                            17621
New York (State)                                 15367
France                                           12416
California                                        9985
Peru                                              9473
Ecuador                                           9359
Brazil                                            8413
Cuba                                              7249
United Kingdom                                    6916
Italy                                             6338
Germany   

# Normalize authors

## Resultados finales

In [15]:
prestamos_df.to_csv('prestamos_paises_limpios.csv', index=False)