## Basic EDA for first incoming data

In [54]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accents_cleaner import AccentsCleaner

In [55]:
# Population per municipality: semicolon-delimited, read with pandas' default encoding.
df_communities = pd.read_csv("../data/population_towns.csv", sep=";")

# Coordinates per municipality: semicolon-delimited, Latin-1 encoded, and using
# a comma as the decimal mark.
df_coordinates = pd.read_csv(
    "../data/coordinates_towns_spain.csv",
    sep=";",
    decimal=",",
    encoding="ISO-8859-1",  # "latin1" is an equivalent alias
)

In [56]:
# English replacement names for the population frame, in the file's column order.
columns_communities_english = [
    'province_code',
    'province',
    'municipality_code',
    'municipality_name',
    'population',
    'male',
    'female',
]

# English replacement names for the coordinates frame, in the file's column order.
columns_coordinates_english = [
    'community',
    'province',
    'municipality_name',
    'latitude',
    'longitude',
    'altitude',
    'population',
    'male',
    'female',
]


def map_cols_es_en(es: list, en: list) -> list:
    """Return the English column names that positionally replace the Spanish ones.

    Args:
        es: Current (Spanish) column names, in frame order.
        en: English replacement names, in the same order as ``es``.

    Returns:
        A list with exactly one English name per original column, in order.

    Raises:
        ValueError: If ``es`` and ``en`` do not have the same length.
    """
    if len(es) != len(en):
        raise ValueError(
            f"Expected {len(es)} English column names, got {len(en)}"
        )
    # Build the result positionally instead of through a dict keyed on the
    # Spanish names: a dict collapses duplicated source names and would return
    # fewer labels than there are columns.
    return list(en)


# Swap the Spanish headers of both frames for their English counterparts
# (the replacement is positional, so the list order must match the file order).
df_communities.columns = map_cols_es_en(
    list(df_communities.columns),
    columns_communities_english,
)
df_coordinates.columns = map_cols_es_en(
    list(df_coordinates.columns),
    columns_coordinates_english,
)

In [57]:
# Run the project's AccentsCleaner over both frames for the two name columns.
# NOTE(review): judging by the frames displayed afterwards, clean() appears to add
# lower-cased, accent-free `municipality_name_clean` / `province_clean` columns to
# the frames in place — confirm against scripts/accents_cleaner.py.
cleaner = AccentsCleaner([df_communities, df_coordinates], ['municipality_name', 'province'])
cleaner.clean()

# Exploratory look-ups. Only the last bare expression of a cell is rendered, so
# the three expressions before `df_communities` are evaluated and then discarded.
df_coordinates[df_coordinates['municipality_name_clean'] == 'valencia']
df_coordinates[df_coordinates['municipality_name_clean'].duplicated()]
df_coordinates
df_communities

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
0,1,Araba/Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,araba/alava
1,1,Araba/Álava,2,Amurrio,10330,5149,5181,amurrio,araba/alava
2,1,Araba/Álava,3,Aramaio,1381,709,672,aramaio,araba/alava
3,1,Araba/Álava,4,Artziniega,1856,913,943,artziniega,araba/alava
4,1,Araba/Álava,6,Armiñón,247,127,120,arminon,araba/alava
...,...,...,...,...,...,...,...,...,...
8127,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza
8128,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza
8129,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza
8130,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta


In [58]:
# Coordinate rows whose cleaned province is exactly 'araba'.
df_coordinates.loc[df_coordinates['province_clean'].eq('araba')]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean


In [59]:
df_communities  # evaluated but not rendered: only the cell's last expression is shown
# Population row for the municipality whose cleaned name is "pedrosas, las".
df_communities.loc[df_communities['municipality_name_clean'].eq("pedrosas, las")]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
8034,50,Zaragoza,205,"Pedrosas, Las",113,56,57,"pedrosas, las",zaragoza


In [60]:
# Only the final expression is rendered; the first two look-ups are evaluated
# and their results discarded.
df_coordinates.loc[df_coordinates['municipality_name'].eq("Amurrio")]
df_coordinates.loc[df_coordinates['municipality_name_clean'].eq("pedrosas")]
# Case-insensitive substring search on the original (uncleaned) name.
df_coordinates.loc[
    df_coordinates['municipality_name'].str.contains(r'pedrosas', case=False, na=False)
]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
1407,Aragón,Zaragoza,Pedrosas (Las),42.03822,-0.876332,459.5541,120,64,56,pedrosas (las),zaragoza


In [61]:
# Left-join the coordinate columns onto the population frame, keyed on the
# cleaned municipality and province names. Population rows without a match are
# kept and receive NaN coordinates.
df = df_communities.merge(
    df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']],
    how='left',
    on=['municipality_name_clean', 'province_clean'],
)

# Alternative kept for reference: join from the coordinates side instead.
# df = df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']].merge(
#     df_communities,
#     how='left',
#     on=['municipality_name_clean', 'province_clean'],
# )

In [62]:
# Inspect the left-join result; rows with no coordinate match carry NaN in
# altitude / longitude / latitude.
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Araba/Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,araba/alava,,,
1,1,Araba/Álava,2,Amurrio,10330,5149,5181,amurrio,araba/alava,,,
2,1,Araba/Álava,3,Aramaio,1381,709,672,aramaio,araba/alava,,,
3,1,Araba/Álava,4,Artziniega,1856,913,943,artziniega,araba/alava,,,
4,1,Araba/Álava,6,Armiñón,247,127,120,arminon,araba/alava,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
8127,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
8128,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
8129,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
8130,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


In [64]:
# Report how many values are missing per column before discarding them. The
# count has to be printed explicitly: it is not the cell's last expression, so
# a bare `df.isnull().sum()` would be evaluated and thrown away.
print(df.isnull().sum())

# Keep only rows without any missing value; this removes, among others, the
# municipalities that found no coordinate match in the merge.
df = df.dropna()

In [65]:
# Sanity check: list every row that still contains at least one missing value.
nan_rows = df.loc[df.isnull().any(axis="columns")]
nan_rows

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude


In [66]:
# Inspect the merged frame after the rows with missing values were dropped.
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
51,2,Albacete,1,Abengibre,759,374,385,abengibre,albacete,635.97630,-1.544182,39.20999
52,2,Albacete,2,Alatoz,497,267,230,alatoz,albacete,856.58100,-1.361155,39.09464
53,2,Albacete,3,Albacete,174137,85115,89022,albacete,albacete,685.95730,-1.860070,38.99765
54,2,Albacete,4,Albatana,663,342,321,albatana,albacete,583.85090,-1.524207,38.56977
55,2,Albacete,5,Alborea,676,363,313,alborea,albacete,700.43820,-1.395870,39.27951
...,...,...,...,...,...,...,...,...,...,...,...,...
8127,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
8128,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
8129,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
8130,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


## Problems

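Spelling differs between the two datasets, so names have to be normalised (for example by removing accents) before they can be matched. Accents are not the only difference: the same municipality appears as "Pedrosas, Las" in one dataset and "Pedrosas (Las)" in the other, and "València" in one is "Valencia" in the other.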

In [67]:
# Case-insensitive regex search on the original names in the population frame
# (the pattern matches both "Valencia" and "València").
df_communities.loc[
    df_communities['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
768,6,Badajoz,139,Valencia de las Torres,480,256,224,valencia de las torres,badajoz
769,6,Badajoz,140,Valencia del Mombuey,713,369,344,valencia del mombuey,badajoz
770,6,Badajoz,141,Valencia del Ventoso,1885,953,932,valencia del ventoso,badajoz
1745,10,Cáceres,203,Valencia de Alcántara,5196,2562,2634,valencia de alcantara,caceres
3779,24,León,188,Valencia de Don Juan,5094,2470,2624,valencia de don juan,leon
7234,46,Valencia/València,250,València,825948,391970,433978,valencia,valencia/valencia


In [68]:
# Same case-insensitive regex search, this time on the coordinates frame.
df_coordinates.loc[
    df_coordinates['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
3479,Castilla León,León,Valencia de Don Juan,42.29401,-5.519861,767.8869,5083,2505,2578,valencia de don juan,leon
6024,Extremadura,Badajoz,Valencia de las Torres,38.405,-6.003782,514.027,687,358,329,valencia de las torres,badajoz
6025,Extremadura,Badajoz,Valencia del Mombuey,38.24256,-7.119643,295.6457,782,400,382,valencia del mombuey,badajoz
6026,Extremadura,Badajoz,Valencia del Ventoso,38.265,-6.474613,496.0627,2249,1130,1119,valencia del ventoso,badajoz
6250,Extremadura,Cáceres,Valencia de Alcántara,39.41135,-7.246899,462.1811,6178,3070,3108,valencia de alcantara,caceres
8094,Valencia,Valencia/València,Valencia,39.47024,-0.376805,23.3349,814208,392300,421908,valencia,valencia/valencia


In [69]:
# Interactive map of the merged municipalities: marker size and colour both
# encode population.
title = "Communities in Spain by population size"

fig = px.scatter_map(
    df,
    lat="latitude",
    lon="longitude",
    hover_data=["municipality_name", "altitude"],
    size="population",
    color="population",
    color_continuous_scale=px.colors.carto.Aggrnyl,
    zoom=5,
    size_max=50,
    # scatter_map draws on a MapLibre `map` subplot, so the tile style must be
    # given as `map_style`; `mapbox_style` configures a different subplot type
    # and has no effect on this figure.
    map_style="open-street-map",
    title=title,
)

# Override the marker size reference so that small municipalities stay visible;
# decrease the value to make every marker larger.
fig.update_traces(marker=dict(sizeref=1000))

fig.update_layout(
    height=1000,
    width=1000,
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
    coloraxis_colorbar=dict(title="Population"),
)

fig.show()

In [None]:
# Row of the merged frame for the municipality spelled "València".
df.loc[df["municipality_name"].eq("València")]

In [None]:
# Row of the population frame for the municipality spelled "València".
df_communities.loc[df_communities["municipality_name"].eq("València")]

In [None]:
# Row of the coordinates frame for the municipality spelled "Valencia".
df_coordinates.loc[df_coordinates["municipality_name"].eq("Valencia")]