In [11]:
from scipy.stats import entropy
from numpy.linalg import norm
import numpy as np
from scipy.spatial.distance import jensenshannon
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import geopandas as gpd 
%matplotlib inline

In [12]:
# Lookup table: zero-padded two-digit province code (text) -> display name.
# Covers the codes '01' through '52'; the plotting helpers below use it to
# label axes and dendrogram leaves.
dict_ine_name = {
    "02": "Albacete",
    "03": "Alicante/Alacant",
    "04": "Almería",
    "01": "Araba/Álava",
    "33": "Asturias",
    "05": "Ávila",
    "06": "Badajoz",
    "07": "Balears Illes",
    "08": "Barcelona",
    "48": "Bizkaia",
    "09": "Burgos",
    "10": "Cáceres",
    "11": "Cádiz",
    "39": "Cantabria",
    "12": "Castellón/Castelló",
    "13": "Ciudad Real",
    "14": "Córdoba",
    "15": "Coruña A",
    "16": "Cuenca",
    "20": "Gipuzkoa",
    "17": "Girona",
    "18": "Granada",
    "19": "Guadalajara",
    "21": "Huelva",
    "22": "Huesca",
    "23": "Jaén",
    "24": "León",
    "25": "Lleida",
    "27": "Lugo",
    "28": "Madrid",
    "29": "Málaga",
    "30": "Murcia",
    "31": "Navarra",
    "32": "Ourense",
    "34": "Palencia",
    "35": "Palmas Las",
    "36": "Pontevedra",
    "26": "Rioja La",
    "37": "Salamanca",
    "38": "Santa Cruz de Tenerife",
    "40": "Segovia",
    "41": "Sevilla",
    "42": "Soria",
    "43": "Tarragona",
    "44": "Teruel",
    "45": "Toledo",
    "46": "Valencia/València",
    "47": "Valladolid",
    "49": "Zamora",
    "50": "Zaragoza",
    "51": "Ceuta",
    "52": "Melilla",
}


In [22]:
def preprocess(path):
    """Load a trips CSV that has a header row and clean its numeric columns.

    The file must provide the columns 'Destino', 'Viajeros' and 'Viajeros-km'.
    Commas inside 'Viajeros' / 'Viajeros-km' are treated as thousands
    separators and stripped before converting to float.

    Parameters
    ----------
    path : str or path-like
        Location of the comma-delimited, UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        The loaded data with 'Viajeros' and 'Viajeros-km' as floats and an
        extra 'Destino2' column holding the first two characters of 'Destino'.

    Raises
    ------
    pandas.errors.ParserError
        If a row has a different number of fields than the header.
    """
    # Read the code columns as text: they carry leading zeros ('01', '08', ...)
    # that are lost if pandas infers an integer dtype, and the slice below
    # needs strings.
    df = pd.read_csv(path, delimiter=",", encoding="utf-8",
                     dtype={'Origen': str, 'Destino': str})

    for column in ('Viajeros', 'Viajeros-km'):
        # regex=False makes the replacement a literal comma regardless of the
        # pandas version (the default for `regex` changed in pandas 2.0).
        # astype(str) keeps this working when the column held no commas and
        # was therefore already parsed as a number.
        df[column] = (df[column].astype(str)
                                .str.replace(',', '', regex=False)
                                .astype(float))

    df['Destino2'] = df['Destino'].str[:2]
    return df

In [23]:
def preprocess2(path):
    """Load a header-less trips CSV whose numbers use a decimal comma.

    The 13 column names are assigned positionally. Commas inside 'Viajeros'
    and 'Viajeros-km' are treated as decimal separators and replaced by a dot
    before converting to float.

    Parameters
    ----------
    path : str or path-like
        Location of the comma-delimited CSV file (no header row).

    Returns
    -------
    pandas.DataFrame
        The loaded data with 'Viajeros' and 'Viajeros-km' as floats and an
        extra 'Destino2' column holding the first two characters of 'Destino'.

    Raises
    ------
    pandas.errors.ParserError
        If a row has more fields than the 13 declared columns.
    """
    col = ['Origen','Destino', 'Año', 'Mes', 'Día', 'Periodo', 'Modo', 'Distancia', 'Residencia', 'Actividad_Origen',
           'Actividad_Destino', 'Viajeros', 'Viajeros-km']
    # Read the code columns as text so leading zeros ('01', '08', ...) survive;
    # downstream code compares 'Origen' against zero-padded strings.
    df = pd.read_csv(path, delimiter=",", names=col,
                     dtype={'Origen': str, 'Destino': str})

    for column in ('Viajeros', 'Viajeros-km'):
        # astype(str) keeps this working when a column contained no decimal
        # commas and pandas already parsed it as a number.
        df[column] = (df[column].astype(str)
                                .str.replace(',', '.', regex=False)
                                .astype(float))

    df['Destino2'] = df['Destino'].str[:2]
    return df

In [24]:
def get_destino_distribution(df, Origen):
    """Return the ranked destination distribution for one origin.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips table with the columns 'Origen', 'Destino2' and 'Viajeros'.
    Origen : str
        Origin value to select; rows are kept where ``df.Origen == Origen``.

    Returns
    -------
    pandas.DataFrame
        One row per destination code with the columns 'Destino2', 'weight'
        (summed 'Viajeros', 0 where the origin has no trips to that code) and
        'norm_weight' (weight divided by the total). Every code '01'..'52' is
        present; codes found in the data outside that list are kept as well
        (outer merge). Rows are sorted by 'norm_weight' in descending order,
        so the row position is the destination's rank for this origin.
        If the origin has no travellers at all, 'norm_weight' is uniform.
    """
    all_destinos = pd.DataFrame({'Destino2': [str(z).zfill(2) for z in range(1, 53)]})

    observed = (df[df.Origen == Origen]
                .groupby('Destino2')['Viajeros'].sum()
                .to_frame('weight')
                .reset_index())

    destino_distribution = observed.merge(all_destinos, on='Destino2', how='outer')

    # Assign the filled column back explicitly. An in-place fillna on the
    # `frame.weight` accessor operates on a temporary under pandas
    # Copy-on-Write and would leave NaN weights in the frame.
    destino_distribution['weight'] = destino_distribution['weight'].fillna(0)

    total = destino_distribution['weight'].sum()
    if total == 0:
        # No trips from this origin: fall back to a uniform distribution so
        # callers still receive a valid probability vector.
        destino_distribution['norm_weight'] = 1 / len(destino_distribution)
    else:
        destino_distribution['norm_weight'] = destino_distribution['weight'] / total

    return destino_distribution.sort_values('norm_weight', ascending=False)

In [25]:
def plot_signiture(df):
    """Plot the rank/fraction curve of every origin '01'..'52' on one axes.

    For each origin the destination distribution is obtained from
    ``get_destino_distribution`` (already sorted by descending fraction) and
    drawn as a semi-transparent blue line: x is the rank (1 = most frequent
    destination), y is the fraction of travellers.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips table accepted by ``get_destino_distribution``.

    Returns
    -------
    None
        The curves are drawn on a newly created matplotlib figure.
    """
    fig, ax = plt.subplots()

    for i in range(1, 53):
        distribution = get_destino_distribution(df, Origen=str(i).zfill(2))
        fractions = distribution.norm_weight.to_numpy()
        # Size the rank axis from the data: the distribution can contain more
        # than 52 rows when the input holds destination codes outside
        # '01'..'52', and a fixed 52-point axis would then fail to plot.
        ranks = range(1, len(fractions) + 1)
        ax.plot(ranks, fractions, color='blue', alpha=0.5)

    # A log-scaled y axis (ax.set_yscale('log')) can be enabled here if the
    # tail of the curves needs to be inspected.
    ax.set_xlabel('Rank')
    ax.set_ylabel('Fraction')
    fig.tight_layout()


In [26]:
def get_distance_matrix(df):
    """Build the 52x52 Jensen-Shannon distance matrix between origins.

    Each origin '01'..'52' is represented by the 'norm_weight' column returned
    by ``get_destino_distribution``. That column is sorted in descending
    order, so two origins are compared position by position, i.e. rank
    against rank, not destination against destination.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips table accepted by ``get_destino_distribution``.

    Returns
    -------
    list of list of float
        ``result[i][j]`` is the base-2 Jensen-Shannon distance between origin
        ``i + 1`` and origin ``j + 1`` (codes zero-padded to two digits).
    """
    codes = [str(i).zfill(2) for i in range(1, 53)]

    # Compute every distribution once. Plain arrays make the positional
    # comparison explicit (jensenshannon converts its inputs to arrays anyway).
    profiles = {
        code: get_destino_distribution(df, Origen=code).norm_weight.to_numpy()
        for code in codes
    }

    return [
        [jensenshannon(profiles[code_i], profiles[code_j], base=2) for code_j in codes]
        for code_i in codes
    ]

In [27]:
def plot_distance_matrix(dist_mat):
    """Draw the distance matrix as a heatmap labelled with province names.

    Parameters
    ----------
    dist_mat : 2-D array-like
        Matrix to display; rows and columns are labelled with the names of
        the provinces '01'..'52' taken from ``dict_ine_name``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 14x14 inch figure.
    """
    tick_positions = range(52)
    province_labels = [dict_ine_name['{:02d}'.format(code)] for code in range(1, 53)]

    plt.figure(figsize=(14, 14))
    plt.imshow(dist_mat, cmap='Spectral')
    plt.colorbar()
    # Column labels are rotated so the 52 names do not overlap.
    plt.xticks(tick_positions, province_labels, rotation=90)
    plt.yticks(tick_positions, province_labels)
    plt.title('distance matrix')

In [28]:
def plot_dendrogram(dist_mat):
    """Cluster the provinces hierarchically and save the dendrogram.

    Parameters
    ----------
    dist_mat : array-like
        Pairwise distances between the 52 provinces, either as a square
        52x52 matrix (as returned by ``get_distance_matrix``) or as an
        already condensed 1-D distance vector.

    Returns
    -------
    None
        The dendrogram is drawn on a new 12x6 inch figure and written to
        'dendrogram.pdf' in the current working directory.
    """
    plt.figure(figsize=(12, 6))

    distances = np.asarray(dist_mat, dtype=float)
    if distances.ndim == 2:
        # linkage() interprets a 2-D input as observation vectors and would
        # cluster on Euclidean distances between the rows. Condense the
        # square matrix so the supplied distances are the ones used.
        # checks=False tolerates floating-point noise in symmetry/diagonal.
        distances = squareform(distances, checks=False)

    tree = linkage(distances, 'complete')
    leaf_labels = [dict_ine_name[str(i).zfill(2)] for i in range(1, 53)]
    dendrogram(tree, labels=leaf_labels, leaf_rotation=90, leaf_font_size=14)

    plt.tight_layout()
    plt.savefig('dendrogram.pdf')

In [31]:
# July run: load the file, plot the rank signatures, build the pairwise
# distance matrix, then draw the heatmap and the dendrogram.
# NOTE(review): the output recorded for this cell is a pandas ParserError
# ("Expected 13 fields in line 40542, saw 14"), i.e. read_csv stops on a row
# with one field too many, so nothing after preprocess() ran here.
path = "July/ET_J00.csv"
df = preprocess(path)
plot_signiture(df)
dist_mat = get_distance_matrix(df)
plot_distance_matrix(dist_mat)
plot_dendrogram(dist_mat)

ParserError: Error tokenizing data. C error: Expected 13 fields in line 40542, saw 14


In [None]:
# October run: same pipeline as the July cell, on a different file.
# This rebinds the module-level names `path`, `df` and `dist_mat`, and
# plot_dendrogram() writes 'dendrogram.pdf' again, replacing any earlier one.
path = "October/ET_O05.csv"
df = preprocess(path)
plot_signiture(df)
dist_mat = get_distance_matrix(df)
plot_distance_matrix(dist_mat)
plot_dendrogram(dist_mat)

In [None]:
# Load the province shapes into the module-level GeoDataFrame `gdf`, which
# plot_clustered_cities() reads and filters through its `CPRO` column.
# NOTE(review): presumably CPRO holds the zero-padded two-digit province code
# as text (the groups below are passed as such strings) — confirm.
gdf = gpd.read_file('Map/Provinces.shp', encoding='utf-8') 

In [None]:
def plot_clustered_cities(group):
    """Highlight a set of provinces in red on top of the outline map.

    Parameters
    ----------
    group : list-like
        Values matched against the `CPRO` column of the module-level `gdf`;
        matching rows are filled in red.

    Returns
    -------
    None
        Both layers are drawn on a newly created 12x10 inch axes.
    """
    # Base layer: every shape in white with a black outline.
    base_axes = gdf.plot(facecolor='white', edgecolor='black', figsize=(12, 10))

    # Overlay: only the rows whose CPRO value is in the requested group.
    in_group = gdf.CPRO.isin(group)
    highlighted = gdf[in_group]
    highlighted.plot(ax=base_axes, color='red')
    

In [None]:
# First cluster to highlight on the map.
# `group_name` lists the names that correspond, in order, to the codes in
# `group` (per dict_ine_name); only `group` is passed to the plot call.
group_name = ['Madrid', 'Ciudad Real', 'León', 'Soria', 'Zamora', 'Burgos', 'Navarra']
group = ["28","13", "24","42","49","09","31"]
plot_clustered_cities(group)

In [None]:
# Second cluster to highlight on the map.
# `g2_name` gives the names for the codes in `g2` (per dict_ine_name); only
# `g2` is passed to the plot call.
g2_name = ['Ceuta', 'Balears Illes']
g2 = ["51", "07"]
plot_clustered_cities(g2)

In [None]:
# Third cluster to highlight on the map; per dict_ine_name the codes are
# 50 = Zaragoza, 17 = Girona, 12 = Castellón/Castelló.
g3 = ["50", "17","12"]
plot_clustered_cities(g3)