## Basic EDA for first incoming data

In [128]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner

In [129]:
# Both raw files are read with the same options: ISO-8859-1 text ("latin1" is
# an alias), ';' as the field separator and ',' as the decimal mark.
RAW_CSV_OPTIONS = dict(encoding="ISO-8859-1", sep=";", decimal=",")

df_communities = pd.read_csv("../data/raw/population_towns.csv", **RAW_CSV_OPTIONS)
df_coordinates = pd.read_csv("../data/raw/coordinates_towns_spain.csv", **RAW_CSV_OPTIONS)

In [130]:
# English replacements for the column headers of the two raw files, listed in
# the order the columns are expected to appear in each file.
columns_communities_english = [
    'province_code',
    'province',
    'municipality_code',
    'municipality_name',
    'population',
    'male',
    'female',
]
columns_coordinates_english = [
    'community',
    'province',
    'municipality_name',
    'latitude',
    'longitude',
    'altitude',
    'population',
    'male',
    'female',
]


def map_cols_es_en(es: list, en: list) -> list:
    """Return the English column names that replace the Spanish ones.

    The two sequences are paired by position: ``en[i]`` is the new name of
    column ``es[i]``.

    The names are no longer routed through an intermediate ``dict``. A dict
    keyed by the Spanish names collapses duplicated source columns into one
    entry, and ``zip`` silently truncates to the shorter input; both cases
    returned fewer names than there are columns.

    Args:
        es: Current (Spanish) column names, in column order.
        en: English names, in the same order.

    Returns:
        A new list with one English name per source column.

    Raises:
        ValueError: If ``es`` and ``en`` do not have the same length.
    """
    es_names = list(es)
    en_names = list(en)
    if len(es_names) != len(en_names):
        raise ValueError(
            f"Cannot rename {len(es_names)} columns with {len(en_names)} names"
        )
    return en_names


# Swap the original headers of both frames for their English equivalents.
for frame, english_names in (
    (df_communities, columns_communities_english),
    (df_coordinates, columns_coordinates_english),
):
    frame.columns = map_cols_es_en(frame.columns.to_list(), english_names)

In [131]:
# Build the "<column>_clean" helper columns that later serve as join keys.
# NOTE(review): AccentCleaner and ColumnAligner are project classes whose code
# is not visible here; presumably cleanAccents() adds lower-cased, accent-free
# "municipality_name_clean" / "province_clean" columns in place - confirm.
cleaner = AccentCleaner([df_communities, df_coordinates], ['municipality_name', 'province'])
cleaner.cleanAccents()

# NOTE(review): r"\*,\s" matches a literal asterisk followed by a comma and a
# whitespace character. If "anything, then comma + space" was intended, the
# pattern should be r".*,\s" - confirm against ColumnAligner before changing.
aligner = ColumnAligner(df_communities, df_coordinates, 'municipality_name_clean', re.compile(r"\*,\s"))
stuff = aligner.alignColumns()

# Names written with a trailing part after a comma, e.g. "pedrosas, las".
# Only the text before the first comma is kept as the search key.
community_names = df_communities['municipality_name_clean']
commas = community_names.str.contains(r'.*, ', na=False)
extract_list = community_names[commas].str.split(',').str[0].to_list()

for elem in extract_list:
    # Municipality names are data, not patterns: escape them so characters
    # such as '(', ')' or '.' are matched literally instead of being read as
    # regular-expression syntax by str.contains.
    elem_pattern = re.escape(elem)
    # Rows in df_coordinates whose clean name contains the key followed by a space
    mask = df_coordinates['municipality_name_clean'].str.contains(elem_pattern + ' ', case=False, na=False)
    # First full "name, suffix" spelling in df_communities that contains the key
    full_name = community_names[community_names.str.contains(f"{elem_pattern},", case=False, na=False)].iloc[0]
    # Overwrite the coordinate spelling with the df_communities spelling
    df_coordinates.loc[mask, 'municipality_name_clean'] = full_name

# df_coordinates.sample(50)

In [132]:
# Spot-check the aligned names on a random sample. The fixed seed keeps the
# displayed rows the same on every Restart & Run All.
df_coordinates.sample(30, random_state=42)

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
328,Andalucía,Granada,Moclín,37.34158,-3.786422,1061.082,4268,2122,2146,moclin,granada
6432,Galicia,Ourense,Baltar,41.94975,-7.716806,823.1027,1144,568,576,baltar,ourense
3303,Castilla León,Burgos,Zael,42.10968,-3.823925,843.672,117,62,55,zael,burgos
2541,Castilla La Mancha,Toledo,Dosbarrios,39.88408,-3.476729,709.6131,2497,1271,1226,dosbarrios,toledo
1654,Canarias,Santa Cruz de Tenerife,Sauzal (El),28.47941,-16.43661,323.1106,8996,4469,4527,"sauzal, el",santa cruz de tenerife
7124,Navarra,Navarra,Castillonuevo,42.67852,-1.044187,774.6876,18,12,6,castillonuevo,navarra
3261,Castilla León,Burgos,Vid y Barrios (La),41.62967,-3.490485,821.0573,276,159,117,"vid y barrios, la",burgos
537,Andalucía,Jaén,Puerta de Segura (La),38.34861,-2.737147,584.877,2638,1313,1325,"puerta de segura, la",jaen
5436,Catalunya,Girona,Toses,42.32076,2.014152,1426.833,160,98,62,toses,girona
7810,Valencia,Castellón/Castelló,Sot de Ferrer,39.80494,-0.410224,235.5191,460,239,221,sot de ferrer,castellon/castello


In [133]:
# Look up one comma-style name in the population data. The bare
# `df_communities` expression that used to precede this line was never
# displayed (only a cell's last expression is rendered), so it was dropped.
df_communities[df_communities['municipality_name_clean'] == "pedrosas, las"]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
8034,50,Zaragoza,205,"Pedrosas, Las",113,56,57,"pedrosas, las",zaragoza


In [134]:
# Find the same municipality in the coordinates data by its original name.
# The two lookups that used to precede this one ("Amurrio" and the clean name
# "pedrosas") were evaluated and discarded, because only a cell's last
# expression is rendered; they were removed.
df_coordinates[df_coordinates['municipality_name'].str.contains(r'pedrosas', case=False, na=False)]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
1407,Aragón,Zaragoza,Pedrosas (Las),42.03822,-0.876332,459.5541,120,64,56,"pedrosas, las",zaragoza


In [136]:
# Attach altitude and coordinates to every population row, matching on the
# cleaned municipality and province names. The left join keeps all rows of
# df_communities; rows without a match get NaN in the three added columns.
# NOTE(review): the join keys are not checked for uniqueness in
# df_coordinates, so duplicated keys there would repeat population rows -
# confirm that len(df) == len(df_communities).
join_keys = ['municipality_name_clean', 'province_clean']
coordinate_columns = ['altitude', 'longitude', 'latitude'] + join_keys

df = df_communities.merge(
    df_coordinates[coordinate_columns],
    on=join_keys,
    how='left',
)

In [137]:
# Display the merged frame (the notebook repr shows only the first and last rows).
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.68570,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.69100,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.87970,-2.566000,43.05400
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.98080,-3.128209,43.12220
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.58150,-2.872574,42.72305
...,...,...,...,...,...,...,...,...,...,...,...,...
8136,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
8137,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
8138,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
8139,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


In [138]:
# Number of missing values per column of the merged frame.
df.isna().sum()

province_code                 0
province                      0
municipality_code             0
municipality_name             0
population                    0
male                          0
female                        0
municipality_name_clean       0
province_clean                0
altitude                   1203
longitude                  1203
latitude                   1203
dtype: int64

In [139]:
# Keep every merged row that has a missing value in at least one column,
# then display those rows.
nan_rows = df.loc[df.isna().any(axis="columns")]
nan_rows

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
5,1,Álava,8,Arratzua-Ubarrundia,1047,552,495,arratzua-ubarrundia,alava,,,
28,1,Álava,39,Moreda de Álava/Moreda Araba,215,114,101,moreda de alava/moreda araba,alava,,,
34,1,Álava,47,Ribera Baja/Erriberabeitia,1439,767,672,ribera baja/erriberabeitia,alava,,,
36,1,Álava,51,Agurain/Salvatierra,5155,2601,2554,agurain/salvatierra,alava,,,
41,1,Álava,56,Harana/Valle de Arana,214,117,97,harana/valle de arana,alava,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7906,50,Zaragoza,62,"Burgo de Ebro, El",2704,1351,1353,"burgo de ebro, el",zaragoza,,,
7915,50,Zaragoza,71,Campillo de Aragón,112,67,45,campillo de aragon,zaragoza,,,
7965,50,Zaragoza,124,Herrera de los Navarros,504,284,220,herrera de los navarros,zaragoza,,,
7970,50,Zaragoza,130,Jarque de Moncayo,381,201,180,jarque de moncayo,zaragoza,,,


In [140]:
# Display the merged frame again (same frame as shown after the merge).
df

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440,alegria-dulantzi,alava,561.68570,-2.513507,42.84149
1,1,Álava,2,Amurrio,10330,5149,5181,amurrio,alava,219.69100,-3.001022,43.05265
2,1,Álava,3,Aramaio,1381,709,672,aramaio,alava,381.87970,-2.566000,43.05400
3,1,Álava,4,Artziniega,1856,913,943,artziniega,alava,196.98080,-3.128209,43.12220
4,1,Álava,6,Armiñón,247,127,120,arminon,alava,463.58150,-2.872574,42.72305
...,...,...,...,...,...,...,...,...,...,...,...,...
8136,50,Zaragoza,901,Biel,170,104,66,biel,zaragoza,754.24450,-0.936588,42.38749
8137,50,Zaragoza,902,Marracos,85,43,42,marracos,zaragoza,404.73610,-0.776047,42.09059
8138,50,Zaragoza,903,Villamayor de Gállego,2854,1446,1408,villamayor de gallego,zaragoza,231.12340,-0.773315,41.68518
8139,51,Ceuta,1,Ceuta,83229,41980,41249,ceuta,ceuta,13.47725,-5.316195,35.88829


## Problems

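Spelling differs between the two datasets, so we may have to remove the accents first. Province names differ as well: the coordinates file uses bilingual names such as "Valencia/València" where the population file has only "Valencia", so those rows do not match on province even after the accents are removed.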

In [141]:
# Population rows whose original name matches "Val...ncia" (case-insensitive),
# which covers both the "Valencia" and "València" spellings.
df_communities.loc[
    lambda frame: frame['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
768,6,Badajoz,139,Valencia de las Torres,480,256,224,valencia de las torres,badajoz
769,6,Badajoz,140,Valencia del Mombuey,713,369,344,valencia del mombuey,badajoz
770,6,Badajoz,141,Valencia del Ventoso,1885,953,932,valencia del ventoso,badajoz
1745,10,Cáceres,203,Valencia de Alcántara,5196,2562,2634,valencia de alcantara,caceres
3779,24,León,188,Valencia de Don Juan,5094,2470,2624,valencia de don juan,leon
7234,46,Valencia,250,València,825948,391970,433978,valencia,valencia


In [142]:
# Coordinate rows whose original name matches "Val...ncia" (case-insensitive),
# which covers both the "Valencia" and "València" spellings.
df_coordinates.loc[
    lambda frame: frame['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
3479,Castilla León,León,Valencia de Don Juan,42.29401,-5.519861,767.8869,5083,2505,2578,valencia de don juan,leon
6024,Extremadura,Badajoz,Valencia de las Torres,38.405,-6.003782,514.027,687,358,329,valencia de las torres,badajoz
6025,Extremadura,Badajoz,Valencia del Mombuey,38.24256,-7.119643,295.6457,782,400,382,valencia del mombuey,badajoz
6026,Extremadura,Badajoz,Valencia del Ventoso,38.265,-6.474613,496.0627,2249,1130,1119,valencia del ventoso,badajoz
6250,Extremadura,Cáceres,Valencia de Alcántara,39.41135,-7.246899,462.1811,6178,3070,3108,valencia de alcantara,caceres
8094,Valencia,Valencia/València,Valencia,39.47024,-0.376805,23.3349,814208,392300,421908,valencia,valencia/valencia


In [108]:
title = "Communities in Spain"

lats = df.latitude
lons = df.longitude

# One marker per municipality, sized and coloured by population.
# NOTE(review): rows of df without coordinates (NaN latitude/longitude) are
# passed in as well; presumably they are simply not drawn - confirm.
fig = px.scatter_map(df,
                     lat=lats,
                     lon=lons,
                     hover_data=["municipality_name", "altitude"],
                     size='population',
                     color='population',
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50,  # Increase max size of markers
                     # scatter_map draws on the `map` subplot, so the base map
                     # is chosen with map_style; the former
                     # update_layout(mapbox_style=...) targeted the separate
                     # mapbox subplot instead.
                     map_style="open-street-map",
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

# The earlier fig.update_geos(fitbounds="locations") call was removed: it only
# affects geo subplots, and this figure has none.
fig.update_layout(
    title=f"{title} by population size",
    height=1000,
    width=1000,
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
    coloraxis_colorbar=dict(title='Population'),
)

fig.show()

In [125]:
# Merged row(s) whose cleaned municipality name is exactly "valencia".
df.loc[df["municipality_name_clean"].eq("valencia")]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean,altitude,longitude,latitude
7243,46,Valencia,250,València,825948,391970,433978,valencia,valencia,,,


In [110]:
# Population row(s) whose original municipality name is exactly "València".
df_communities.loc[df_communities["municipality_name"].eq("València")]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,municipality_name_clean,province_clean
7234,46,Valencia,250,València,825948,391970,433978,valencia,valencia


In [126]:
# Coordinate row(s) whose original municipality name is exactly "Valencia".
df_coordinates.loc[df_coordinates["municipality_name"].eq("Valencia")]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female,municipality_name_clean,province_clean
8094,Valencia,Valencia/València,Valencia,39.47024,-0.376805,23.3349,814208,392300,421908,valencia,valencia/valencia
