## Basic EDA for first incoming data

In [439]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [440]:
df_communities = pd.read_csv(
    "../data/raw/population_towns.csv", 
    encoding="ISO-8859-1",  # or "latin1",
    sep=";",
    decimal=",")
df_coordinates = pd.read_csv(
    "../data/raw/coordinates_towns_spain.csv",
    encoding="ISO-8859-1",  # or "latin1"
    sep=";",
    decimal=","
)

In [441]:
df_communities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8132 entries, 0 to 8131
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CPRO       8132 non-null   int64 
 1   PROVINCIA  8132 non-null   object
 2   CMUN       8132 non-null   int64 
 3   NOMBRE     8132 non-null   object
 4   POB24      8132 non-null   int64 
 5   HOMBRES    8132 non-null   int64 
 6   MUJERES    8132 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 444.8+ KB


In [442]:
df_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8112 entries, 0 to 8111
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Comunidad   8112 non-null   object 
 1   Provincia   8112 non-null   object 
 2   Población   8112 non-null   object 
 3   Latitud     8112 non-null   float64
 4   Longitud    8112 non-null   float64
 5   Altitud     8112 non-null   float64
 6   Habitantes  8112 non-null   int64  
 7   Hombres     8112 non-null   int64  
 8   Mujeres     8112 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 570.5+ KB


In [443]:
df_coordinates["Comunidad"].unique()

# We need to change:
# Catalunya -> Cataluña
# Ceuta y Melilla --> Ceuta
#                     Melilla


array(['Andalucía', 'Aragón', 'Asturias', 'Canarias', 'Cantabria',
       'Castilla La Mancha', 'Castilla León', 'Catalunya',
       'Ceuta y Melilla', 'Extremadura', 'Galicia', 'Islas Baleares',
       'La Rioja', 'Madrid', 'Murcia', 'Navarra', 'País Vasco',
       'Valencia'], dtype=object)

In [444]:
df_coordinates["Provincia"].unique()

# We need to change:
# 'Alicante/Alacant' --> Alicante
# 'Castellón/Castelló' -> Castellón
# 'Valencia/València' --> Valencia
# Guipúzcoa --> Gipuzkoa
# Vizcaya --> Bizkaia


array(['Almería', 'Cádiz', 'Córdoba', 'Granada', 'Huelva', 'Jaén',
       'Málaga', 'Sevilla', 'Huesca', 'Teruel', 'Zaragoza', 'Asturias',
       'Las Palmas', 'Santa Cruz de Tenerife', 'Cantabria', 'Albacete',
       'Ciudad Real', 'Cuenca', 'Guadalajara', 'Toledo', 'Ávila',
       'Burgos', 'León', 'Palencia', 'Salamanca', 'Segovia', 'Soria',
       'Valladolid', 'Zamora', 'Barcelona', 'Girona', 'Lleida',
       'Tarragona', 'Ceuta', 'Melilla', 'Badajoz', 'Cáceres', 'A Coruña',
       'Lugo', 'Ourense', 'Pontevedra', 'Illes Balears', 'La Rioja',
       'Madrid', 'Murcia', 'Navarra', 'Álava', 'Guipúzcoa', 'Vizcaya',
       'Alicante/Alacant', 'Castellón/Castelló', 'Valencia/València'],
      dtype=object)

In [445]:
df_communities["PROVINCIA"].unique()


array(['Álava', 'Albacete', 'Alicante', 'Almería', 'Ávila', 'Badajoz',
       'Illes Balears', 'Barcelona', 'Burgos', 'Cáceres', 'Cádiz',
       'Castellón', 'Ciudad Real', 'Córdoba', 'A Coruña', 'Cuenca',
       'Girona', 'Granada', 'Guadalajara', 'Gipuzkoa', 'Huelva', 'Huesca',
       'Jaén', 'León', 'Lleida', 'La Rioja', 'Lugo', 'Madrid', 'Málaga',
       'Murcia', 'Navarra', 'Ourense', 'Asturias', 'Palencia',
       'Palmas, Las', 'Pontevedra', 'Salamanca', 'Santa Cruz de Tenerife',
       'Cantabria', 'Segovia', 'Sevilla', 'Soria', 'Tarragona', 'Teruel',
       'Toledo', 'Valencia', 'Valladolid', 'Bizkaia', 'Zamora',
       'Zaragoza', 'Ceuta', 'Melilla'], dtype=object)

In [446]:
df_communities.head()

Unnamed: 0,CPRO,PROVINCIA,CMUN,NOMBRE,POB24,HOMBRES,MUJERES
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440
1,1,Álava,2,Amurrio,10330,5149,5181
2,1,Álava,3,Aramaio,1381,709,672
3,1,Álava,4,Artziniega,1856,913,943
4,1,Álava,6,Armiñón,247,127,120


In [447]:
columns_communities_english = ['province_code', 'province', 'municipality_code', 'municipality_name', 'population', 'male', 'female']
columns_coordinates_english = ['community', 'province', 'municipality_name', 'latitude', 'longitude', 'altitude',
    'population', 'male', 'female']


def map_cols_es_en(es: list, en: list):
    dict_cols = dict(zip(es, en))
    return dict_cols.values()


df_communities.columns = map_cols_es_en(df_communities.columns.to_list(), columns_communities_english)
df_coordinates.columns = map_cols_es_en(df_coordinates.columns.to_list(), columns_coordinates_english)

In [448]:
df_coordinates.sample(5)

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
6312,Galicia,A Coruña,Mazaricos,42.93893,-8.992199,309.7912,4939,2373,2566
1901,Castilla La Mancha,Ciudad Real,Granátula de Calatrava,38.79531,-3.743112,659.4633,941,457,484
6833,Madrid,Madrid,Ambite,40.3292,-3.181895,667.338,557,292,265
6839,Madrid,Madrid,Batres,40.21029,-3.922307,603.6849,1466,755,711
995,Aragón,Teruel,Andorra,40.97544,-0.443657,721.1627,8403,4366,4037


In [449]:
df_coordinates = replace_with(df_coordinates, 'province', 'Vizcaya', 'Bizkaia')
df_coordinates = replace_with(df_coordinates, 'province', 'Alicante/Alacant', 'Alicante')
df_coordinates = replace_with(df_coordinates, 'province', 'Castellón/Castelló', 'Castellón')
df_coordinates = replace_with(df_coordinates, 'province', 'Valencia/València', 'Valencia')
df_coordinates = replace_with(df_coordinates, 'province', 'Guipúzcoa', 'Gipuzkoa')

In [450]:
cleaner = AccentCleaner([df_communities, df_coordinates], ['municipality_name', 'province'])
cleaner.cleanAccents()

aligner = ColumnAligner(df_communities, df_coordinates, 'municipality_name_clean', re.compile(r"\*,\s"))
aligner.alignColumns()

df_communities = split_at_char(df_communities, 'municipality_name_clean', '/')
df_coordinates = split_at_char(df_coordinates, 'municipality_name_clean', '/')

In [451]:
df_coordinates[df_coordinates['municipality_name_clean'] == "arantzazu"]


Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
7465,País Vasco,Bizkaia,Arantzazu,43.1529,-2.79043,98.17124,305,150,155,arantzazu,bizkaia


In [452]:
df_communities['municipality_name_clean'] = df_communities['municipality_name_clean'].str.strip().str.lower()
df_communities['province_clean'] = df_communities['province_clean'].str.strip().str.lower()
df_coordinates['municipality_name_clean'] = df_coordinates['municipality_name_clean'].str.strip().str.lower()
df_coordinates['province_clean'] = df_coordinates['province_clean'].str.strip().str.lower()

df = pd.merge(df_communities, df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']], 
              on=['municipality_name_clean', 'province_clean'], 
              how='left')

# df = pd.merge(df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']], df_communities, 
#               on=['municipality_name_clean', 'province_clean'], 
#               how='left')

In [453]:
df.head(10)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.6857,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.691,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.8797,-2.566,43.054
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.9808,-3.128209,43.1222
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.5815,-2.872574,42.72305
5,1,Álava,8,Arratzua-Ubarrundia,1047,552,495,arratzua-ubarrundia,alava,,,
6,1,Álava,9,Asparrena,1611,813,798,asparrena,alava,631.652,-2.321,42.89567
7,1,Álava,10,Ayala/Aiara,2942,1491,1451,ayala,alava,300.2789,-3.063056,43.08333
8,1,Álava,11,Baños de Ebro/Mañueta,291,168,123,banos de ebro,alava,428.0221,-2.679144,42.53029
9,1,Álava,13,Barrundia,879,490,389,barrundia,alava,555.0674,-2.491817,42.91669


In [454]:
# Display rows with any NaN values and count them
nan_rows = df[df.isna().any(axis=1)]
df.dropna(inplace=True)

In [455]:
df.head(10)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.6857,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.691,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.8797,-2.566,43.054
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.9808,-3.128209,43.1222
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.5815,-2.872574,42.72305
6,1,Álava,9,Asparrena,1611,813,798,asparrena,alava,631.652,-2.321,42.89567
7,1,Álava,10,Ayala/Aiara,2942,1491,1451,ayala,alava,300.2789,-3.063056,43.08333
8,1,Álava,11,Baños de Ebro/Mañueta,291,168,123,banos de ebro,alava,428.0221,-2.679144,42.53029
9,1,Álava,13,Barrundia,879,490,389,barrundia,alava,555.0674,-2.491817,42.91669
10,1,Álava,14,Berantevilla,459,238,221,berantevilla,alava,468.8318,-2.860401,42.68245


## Problems

Spelling is different in the two dataset, we might have to remove the accents first

In [456]:
df_communities[df_communities['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
768,6,Badajoz,139,Valencia de las Torres,480,256,224,valencia de las torres,badajoz
769,6,Badajoz,140,Valencia del Mombuey,713,369,344,valencia del mombuey,badajoz
770,6,Badajoz,141,Valencia del Ventoso,1885,953,932,valencia del ventoso,badajoz
1745,10,Cáceres,203,Valencia de Alcántara,5196,2562,2634,valencia de alcantara,caceres
3779,24,León,188,Valencia de Don Juan,5094,2470,2624,valencia de don juan,leon
7234,46,Valencia,250,València,825948,391970,433978,valencia,valencia


In [457]:
title = "Communities in Spain"

lats = df.latitude
lons = df.longitude

fig = px.scatter_map(df, 
                     lat=lats, 
                     lon=lons,
                     hover_data=["municipality_name", "altitude"], 
                     size='population',
                     color='population',
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='Population')
)
fig.update_layout(title="Communities in Spain by population size")

fig.show()

In [458]:
df.query("municipality_name_clean == 'villatuelda'")
df.info()

df.sample()

<class 'pandas.core.frame.DataFrame'>
Index: 7952 entries, 0 to 8132
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   province_code            7952 non-null   int64  
 1   province                 7952 non-null   object 
 2   municipality_code        7952 non-null   int64  
 3   municipality_name        7952 non-null   object 
 4   population               7952 non-null   int64  
 5   male                     7952 non-null   int64  
 6   female                   7952 non-null   int64  
 7   municipality_name_clean  7952 non-null   object 
 8   province_clean           7952 non-null   object 
 9   altitude                 7952 non-null   float64
 10  longitude                7952 non-null   float64
 11  latitude                 7952 non-null   float64
dtypes: float64(3), int64(5), object(4)
memory usage: 807.6+ KB


Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
1240,9,Burgos,82,Castildelgado,35,20,15,castildelgado,burgos,768.4348,-3.084511,42.43724


In [459]:
def get_zeros(stringlength, fill_length):
    if stringlength < fill_length:
        return '0'* (fill_length - stringlength)
    return None


df["province_code"] = df["province_code"].astype(str)
df["municipality_code"] = df["municipality_code"].astype(str)

df["province_code"] = df["province_code"].apply(lambda x: '0' + x if len(x) < 2 else x )
df["municipality_code"] = df["municipality_code"].apply(lambda x: get_zeros(len(x), 3) + x if len(x) < 3 else x)

df['cmun'] = df["province_code"] + df["municipality_code"]
df.cmun = df.cmun.astype("int32")

In [460]:
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude,cmun
0,01,Álava,001,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.68570,-2.513507,42.84149,1001
1,01,Álava,002,Amurrio,10330,5149,5181,amurrio,alava,219.69100,-3.001022,43.05265,1002
2,01,Álava,003,Aramaio,1381,709,672,aramaio,alava,381.87970,-2.566000,43.05400,1003
3,01,Álava,004,Artziniega,1856,913,943,artziniega,alava,196.98080,-3.128209,43.12220,1004
4,01,Álava,006,Armiñón,247,127,120,arminon,alava,463.58150,-2.872574,42.72305,1006
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8128,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749,50901
8129,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059,50902
8130,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518,50903
8131,51,Ceuta,001,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829,51001


In [461]:
df = df.drop(columns=['province_code', 'municipality_code'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7952 entries, 0 to 8132
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   province                 7952 non-null   object 
 1   municipality_name        7952 non-null   object 
 2   population               7952 non-null   int64  
 3   male                     7952 non-null   int64  
 4   female                   7952 non-null   int64  
 5   municipality_name_clean  7952 non-null   object 
 6   province_clean           7952 non-null   object 
 7   altitude                 7952 non-null   float64
 8   longitude                7952 non-null   float64
 9   latitude                 7952 non-null   float64
 10  cmun                     7952 non-null   int32  
dtypes: float64(3), int32(1), int64(3), object(4)
memory usage: 714.4+ KB


In [462]:
df.query("cmun <= 20000")
df.query("municipality_name_clean == 'villatuelda'")

Unnamed: 0,province,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude,cmun
1524,Burgos,Villatuelda,53,28,25,villatuelda,burgos,858.7932,-3.882187,41.81512,9464


In [463]:
df.to_csv("../data/processed/filtered_municipalities.csv", index=False)