## Basic EDA for first incoming data

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np

In [None]:
# Population table: semicolon-separated, read with the default encoding.
df_communities = pd.read_csv(
    "../data/population_towns.csv",
    sep=";",
)

# Coordinates table: semicolon-separated, Latin-1 encoded ("latin1" is an
# equivalent alias), with a comma as the decimal mark.
df_coordinates = pd.read_csv(
    "../data/coordinates_towns_spain.csv",
    sep=";",
    decimal=",",
    encoding="ISO-8859-1",
)


In [None]:
# English column names, listed in the same order as the columns of each CSV.
columns_communities_english = [
    "province_code",
    "province",
    "municipality_code",
    "municipality_name",
    "population",
    "male",
    "female",
]
columns_coordinates_english = [
    "community",
    "province",
    "municipality_name",
    "latitude",
    "longitude",
    "altitude",
    "population",
    "male",
    "female",
]


def map_cols_es_en(es: list, en: list) -> list:
    """Return the English column names that positionally replace ``es``.

    Args:
        es: Original column names, in DataFrame order.
        en: English column names, in the same order.

    Returns:
        A new list with one English name per original column.

    Raises:
        ValueError: If ``es`` and ``en`` differ in length, since a positional
            rename would then mislabel or drop columns.
    """
    if len(es) != len(en):
        raise ValueError(
            f"Column count mismatch: {len(es)} original vs {len(en)} English names"
        )
    # Pairing the names through a dict would collapse duplicated original
    # names, so the English names are returned directly as a list.
    return list(en)


# Rename the columns of both frames to English, purely by position.
df_communities.columns = map_cols_es_en(
    list(df_communities.columns),
    columns_communities_english,
)
df_coordinates.columns = map_cols_es_en(
    list(df_coordinates.columns),
    columns_coordinates_english,
)

In [None]:
# Spot-check a single municipality in the population table.
df_communities.loc[df_communities["municipality_name"].eq("Galapagar")]

In [None]:
# Spot-check the same municipality in the coordinates table.
df_coordinates.loc[df_coordinates["municipality_name"].eq("Galapagar")]

In [None]:
# Attach coordinates to the population table. The join uses province AND
# municipality name: a name on its own is not guaranteed to be unique across
# provinces, and a collision would duplicate rows or attach another town's
# coordinates.
# NOTE(review): assumes both files spell province names identically — confirm
# on the full data set; mismatches will surface as NaN coordinates.
df = pd.merge(
    df_communities,
    df_coordinates[["province", "municipality_name", "latitude", "longitude", "altitude"]],
    on=["province", "municipality_name"],
    how="left",
)

In [None]:
# Summarise the merged frame (dtypes and non-null counts per column).
df.info()

In [None]:
# Keep every row of the merged frame that has at least one missing value,
# then display it.
nan_rows = df.loc[df.isna().any(axis="columns")]
nan_rows

## Problems

Spelling differs between the two datasets (e.g. "València" vs. "Valencia"), so we may need to strip accents from the municipality names before merging.

In [456]:
# Population-table rows whose name matches the pattern, ignoring case;
# missing names are treated as non-matches.
df_communities.loc[
    df_communities["municipality_name"].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female
768,6,Badajoz,139,Valencia de las Torres,480,256,224
769,6,Badajoz,140,Valencia del Mombuey,713,369,344
770,6,Badajoz,141,Valencia del Ventoso,1885,953,932
1745,10,Cáceres,203,Valencia de Alcántara,5196,2562,2634
3779,24,León,188,Valencia de Don Juan,5094,2470,2624
7234,46,Valencia/València,250,València,825948,391970,433978


In [457]:
# Coordinates-table rows whose name matches the pattern, ignoring case;
# missing names are treated as non-matches.
df_coordinates.loc[
    df_coordinates["municipality_name"].str.contains(r'Val.*ncia', case=False, na=False)
]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
3479,Castilla León,León,Valencia de Don Juan,42.29401,-5.519861,767.8869,5083,2505,2578
6024,Extremadura,Badajoz,Valencia de las Torres,38.405,-6.003782,514.027,687,358,329
6025,Extremadura,Badajoz,Valencia del Mombuey,38.24256,-7.119643,295.6457,782,400,382
6026,Extremadura,Badajoz,Valencia del Ventoso,38.265,-6.474613,496.0627,2249,1130,1119
6250,Extremadura,Cáceres,Valencia de Alcántara,39.41135,-7.246899,462.1811,6178,3070,3108
8094,Valencia,Valencia/València,Valencia,39.47024,-0.376805,23.3349,814208,392300,421908


In [None]:
# Bubble map of the merged frame: marker size and colour both encode population.
title = "Communities in Spain by population size"

fig = px.scatter_map(
    df,
    lat="latitude",
    lon="longitude",
    hover_data=["municipality_name", "altitude"],
    size="population",
    color="population",
    color_continuous_scale=px.colors.carto.Aggrnyl,
    zoom=5,
    size_max=50,  # Increase max size of markers
    # scatter_map draws on a `map` subplot, so the tile style has to be set
    # through `map_style`; `mapbox_style` does not apply to this trace type.
    map_style="open-street-map",
)

# Adjust the size reference to make small points more visible.
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_layout(
    title=title,
    height=1000,
    width=1000,
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
    coloraxis_colorbar=dict(title="Population"),
)

fig.show()

In [None]:
# Look up the accented spelling in the merged frame.
df.loc[df["municipality_name"].eq("València")]

In [None]:
# Look up the accented spelling in the population table.
df_communities.loc[df_communities["municipality_name"].eq("València")]

In [None]:
# Look up the unaccented spelling in the coordinates table.
df_coordinates.loc[df_coordinates["municipality_name"].eq("Valencia")]