## Basic EDA for first incoming data

In [19]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [20]:
df_communities = pd.read_csv(
    "../data/raw/population_towns.csv", 
    encoding="ISO-8859-1",  # or "latin1",
    sep=";",
    decimal=",")
df_coordinates = pd.read_csv(
    "../data/raw/coordinates_towns_spain.csv",
    encoding="ISO-8859-1",  # or "latin1"
    sep=";",
    decimal=","
)

In [21]:
columns_communities_english = ['province_code', 'province', 'municipality_code', 'municipality_name', 'population', 'male', 'female']
columns_coordinates_english = ['community', 'province', 'municipality_name', 'latitude', 'longitude', 'altitude',
    'population', 'male', 'female']


def map_cols_es_en(es: list, en: list):
    dict_cols = dict(zip(es, en))
    return dict_cols.values()


df_communities.columns = map_cols_es_en(df_communities.columns.to_list(), columns_communities_english)
df_coordinates.columns = map_cols_es_en(df_coordinates.columns.to_list(), columns_coordinates_english)

In [22]:
df_coordinates.sample(5)

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
5379,Catalunya,Girona,Queralbs,42.34953,2.16335,1212.268,199,115,84
4958,Catalunya,Barcelona,Bellprat,41.51725,1.433458,644.4128,92,50,42
3356,Castilla León,León,Chozas de Abajo,42.50726,-5.686552,882.439,2419,1258,1161
1301,Aragón,Zaragoza,Daroca,41.11533,-1.413934,783.9178,2331,1156,1175
929,Aragón,Huesca,San Miguel del Cinca,41.82222,0.076111,264.1327,853,438,415


In [23]:
df_communities.sample(5)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female
5211,34,Palencia,204,Villacidaler,46,26,20
4074,26,La Rioja,26,Baños de Río Tobía,1605,820,785
6340,42,Soria,195,Valdenebro,94,55,39
6864,45,Toledo,85,Lominchar,2815,1512,1303
155,3,Alicante,18,Altea,23963,11693,12270


In [24]:
df_coordinates = replace_with(df_coordinates, 'province', 'Vizcaya', 'Bizcaia')
df_coordinates[df_coordinates['province'] == "Bizcaia"]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
7458,País Vasco,Bizcaia,Abadiño,43.15000,-2.610278,139.52980,7260,3647,3613
7459,País Vasco,Bizcaia,Abanto y Ciérvana-Abanto Zierbena,43.31688,-3.087678,65.98425,9647,4779,4868
7460,País Vasco,Bizcaia,Ajangiz,43.30100,-2.671000,13.37307,445,216,229
7461,País Vasco,Bizcaia,Alonsotegi,43.24519,-2.988702,23.90280,2835,1390,1445
7462,País Vasco,Bizcaia,Amorebieta-Etxano,43.22050,-2.733236,74.85185,17842,8793,9049
...,...,...,...,...,...,...,...,...,...
7565,País Vasco,Bizcaia,Zaratamo,43.21151,-2.873626,167.18530,1735,877,858
7566,País Vasco,Bizcaia,Zeanuri,43.09966,-2.749615,172.21160,1330,686,644
7567,País Vasco,Bizcaia,Zeberio,43.15278,-2.852778,184.42850,1054,523,531
7568,País Vasco,Bizcaia,Zierbena,43.34766,-3.086152,95.27987,1382,753,629


In [25]:
cleaner = AccentCleaner([df_communities, df_coordinates], ['municipality_name', 'province'])
cleaner.cleanAccents()


aligner = ColumnAligner(df_communities, df_coordinates, 'municipality_name_clean', re.compile(r"\*,\s"))
aligner.alignColumns()

df_communities = split_at_char(df_communities, 'province_clean', '/')
df_communities = split_at_char(df_communities, 'municipality_name_clean', '/')
df_coordinates = split_at_char(df_coordinates, 'province_clean', '/')
df_coordinates = split_at_char(df_coordinates, 'municipality_name_clean', '/')

In [26]:
df_coordinates[df_coordinates['province'] == "Bizcaia"]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
7458,País Vasco,Bizcaia,Abadiño,43.15000,-2.610278,139.52980,7260,3647,3613,abadino,bizcaia
7459,País Vasco,Bizcaia,Abanto y Ciérvana-Abanto Zierbena,43.31688,-3.087678,65.98425,9647,4779,4868,abanto y ciervana-abanto zierbena,bizcaia
7460,País Vasco,Bizcaia,Ajangiz,43.30100,-2.671000,13.37307,445,216,229,ajangiz,bizcaia
7461,País Vasco,Bizcaia,Alonsotegi,43.24519,-2.988702,23.90280,2835,1390,1445,alonsotegi,bizcaia
7462,País Vasco,Bizcaia,Amorebieta-Etxano,43.22050,-2.733236,74.85185,17842,8793,9049,amorebieta-etxano,bizcaia
...,...,...,...,...,...,...,...,...,...,...,...
7565,País Vasco,Bizcaia,Zaratamo,43.21151,-2.873626,167.18530,1735,877,858,zaratamo,bizcaia
7566,País Vasco,Bizcaia,Zeanuri,43.09966,-2.749615,172.21160,1330,686,644,zeanuri,bizcaia
7567,País Vasco,Bizcaia,Zeberio,43.15278,-2.852778,184.42850,1054,523,531,zeberio,bizcaia
7568,País Vasco,Bizcaia,Zierbena,43.34766,-3.086152,95.27987,1382,753,629,zierbena,bizcaia


In [27]:
df_communities
df_communities[df_communities['municipality_name_clean'] == "pedrosas, las"]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
8034,50,Zaragoza,205,"Pedrosas, Las",113,56,57,"pedrosas, las",zaragoza


In [28]:
df_coordinates[df_coordinates['municipality_name'] == "Amurrio"]
df_coordinates[df_coordinates['municipality_name_clean'] == "pedrosas"]
df_coordinates[df_coordinates['municipality_name'].str.contains(r'pedrosas', case=False, na=False)]

df_coordinates

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
0,Andalucía,Almería,Abla,37.14114,-2.780104,871.16840,1504,783,721,abla,almeria
1,Andalucía,Almería,Abrucena,37.13305,-2.797098,976.93870,1341,682,659,abrucena,almeria
2,Andalucía,Almería,Adra,36.74807,-3.022522,10.97898,24373,12338,12035,adra,almeria
3,Andalucía,Almería,Albánchez,37.28710,-2.181163,481.31230,815,422,393,albanchez,almeria
4,Andalucía,Almería,Alboloduy,37.03319,-2.621750,388.43460,674,334,340,alboloduy,almeria
...,...,...,...,...,...,...,...,...,...,...,...
8107,Valencia,Valencia/València,Xeresa,39.00910,-0.217992,34.05847,2221,1124,1097,xeresa,valencia
8108,Valencia,Valencia/València,Xirivella,39.46669,-0.427794,35.01973,30691,15474,15217,xirivella,valencia
8109,Valencia,Valencia/València,Yátova,39.38500,-0.808174,441.81140,2199,1140,1059,yatova,valencia
8110,Valencia,Valencia/València,Yesa (La),39.49916,-0.426125,52.28455,260,141,119,"yesa, la",valencia


In [29]:
df = pd.merge(df_communities, df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']], 
              on=['municipality_name_clean', 'province_clean'], 
              how='left')

# df = pd.merge(df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']], df_communities, 
#               on=['municipality_name_clean', 'province_clean'], 
#               how='left')

In [30]:
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.68570,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.69100,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.87970,-2.566000,43.05400
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.98080,-3.128209,43.12220
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.58150,-2.872574,42.72305
...,...,...,...,...,...,...,...,...,...,...,...,...
8128,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
8129,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
8130,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
8131,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


In [31]:
valencia = df_coordinates[df_coordinates['province_clean'] == 'valencia']
valencia

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
7846,Valencia,Valencia/València,Ademuz,40.06174,-1.286526,753.28660,1286,689,597,ademuz,valencia
7847,Valencia,Valencia/València,Ador,38.91941,-0.224290,99.20180,1474,766,708,ador,valencia
7848,Valencia,Valencia/València,Agullent,38.82132,-0.548935,426.09100,2449,1199,1250,agullent,valencia
7849,Valencia,Valencia/València,Aielo de Malferit,38.87661,-0.592110,282.56170,4679,2339,2340,aielo de malferit,valencia
7850,Valencia,Valencia/València,Aielo de Rugat,38.88166,-0.343009,267.10810,189,104,85,aielo de rugat,valencia
...,...,...,...,...,...,...,...,...,...,...,...
8107,Valencia,Valencia/València,Xeresa,39.00910,-0.217992,34.05847,2221,1124,1097,xeresa,valencia
8108,Valencia,Valencia/València,Xirivella,39.46669,-0.427794,35.01973,30691,15474,15217,xirivella,valencia
8109,Valencia,Valencia/València,Yátova,39.38500,-0.808174,441.81140,2199,1140,1059,yatova,valencia
8110,Valencia,Valencia/València,Yesa (La),39.49916,-0.426125,52.28455,260,141,119,"yesa, la",valencia


In [32]:
valencia = df[df['province_clean'] == 'valencia']
valencia.sample(10)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
7106,46,Valencia,121,Estubeny,109,54,55,estubeny,valencia,167.0915,-0.623651,39.01794
7052,46,Valencia,67,Benissanó,2427,1186,1241,benissano,valencia,,,
7006,46,Valencia,21,Aldaia,34035,17136,16899,aldaia,valencia,49.20547,-0.461358,39.46505
7181,46,Valencia,196,Pinet,148,72,76,pinet,valencia,352.6049,-0.338099,38.98334
7124,46,Valencia,139,Guadassuar,5992,2901,3091,guadassuar,valencia,33.0038,-0.479225,39.18823
7241,46,Valencia,256,Vilamarxant,11041,5729,5312,vilamarxant,valencia,106.5428,-0.622273,39.56751
7159,46,Valencia,174,Montesa,1134,571,563,montesa,valencia,286.9374,-0.651104,38.94968
7071,46,Valencia,86,Carrícola,99,49,50,carricola,valencia,336.8991,-0.471664,38.84059
7053,46,Valencia,68,Benissoda,485,260,225,benissoda,valencia,334.6183,-0.530893,38.8327
7250,46,Valencia,903,San Antonio de Benagéber,10630,5227,5403,san antonio de benageber,valencia,117.6917,-0.500475,39.56189


In [34]:
# Display rows with any NaN values and count them
nan_rows = df[df.isna().any(axis=1)]
nan_rows.sample(15)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
4359,28,Madrid,70,Horcajo de la Sierra-Aoslos,233,113,120,horcajo de la sierra-aoslos,madrid,,,
7498,48,Bizkaia,22,Karrantza Harana/Valle de Carranza,2711,1418,1293,karrantza harana,bizkaia,,,
3160,20,Gipuzkoa,21,Belauntza,288,150,138,belauntza,gipuzkoa,,,
7556,48,Bizkaia,80,Valle de Trápaga-Trapagaran,11898,5804,6094,valle de trapaga-trapagaran,bizkaia,,,
3161,20,Gipuzkoa,22,Berastegi,1111,575,536,berastegi,gipuzkoa,,,
3173,20,Gipuzkoa,34,Eskoriatza,4241,2131,2110,eskoriatza,gipuzkoa,,,
7207,46,Valencia,222,Sant Joanet,526,256,270,sant joanet,valencia,,,
3172,20,Gipuzkoa,33,Elgeta,1130,564,566,elgeta,gipuzkoa,,,
7530,48,Bizkaia,54,Leioa,32683,15979,16704,leioa,bizkaia,,,
1931,12,Castellón,129,Vilafranca/Villafranca del Cid,2139,1088,1051,vilafranca,castellon,,,


In [16]:
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.68570,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.69100,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.87970,-2.566000,43.05400
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.98080,-3.128209,43.12220
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.58150,-2.872574,42.72305
...,...,...,...,...,...,...,...,...,...,...,...,...
7752,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
7753,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
7754,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
7755,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


## Problems

Spelling is different in the two dataset, we might have to remove the accents first

In [None]:
df_communities[df_communities['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)]

In [None]:
df_coordinates[df_coordinates['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)]

In [17]:
title = "Communities in Spain"

lats = df.latitude
lons = df.longitude

fig = px.scatter_map(df, 
                     lat=lats, 
                     lon=lons,
                     hover_data=["municipality_name", "altitude"], 
                     size='population',
                     color='population',
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='Population')
)
fig.update_layout(title="Communities in Spain by population size")

fig.show()

In [None]:
df[df["municipality_name_clean"] == "valencia"]

In [None]:
df_communities[df_communities["municipality_name"] == "València"]

In [None]:
df_coordinates[df_coordinates["municipality_name"] == "Valencia"]