## Basic EDA for first incoming data

In [None]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [None]:
# Both raw files are read with the same options: Latin-1 text, ';' as the
# field separator and ',' as the decimal mark.
raw_csv_format = {
    "encoding": "ISO-8859-1",  # equivalent alias: "latin1"
    "sep": ";",
    "decimal": ",",
}

df_communities = pd.read_csv("../data/raw/population_towns.csv", **raw_csv_format)
df_coordinates = pd.read_csv("../data/raw/coordinates_towns_spain.csv", **raw_csv_format)

In [None]:
# Summarise the population data: column dtypes and non-null counts.
df_communities.info()

In [None]:
# Summarise the coordinates data: column dtypes and non-null counts.
df_coordinates.info()

In [None]:
# List the distinct community names found in the coordinates data.
df_coordinates["Comunidad"].unique()

# Renames needed:
#   Catalunya       -> Cataluña
#   Ceuta y Melilla -> two separate entries: Ceuta
#                                            Melilla


In [None]:
# List the distinct province names found in the coordinates data.
df_coordinates["Provincia"].unique()

# Renames needed:
#   'Alicante/Alacant'   -> Alicante
#   'Castellón/Castelló' -> Castellón
#   'Valencia/València'  -> Valencia
#   Guipúzcoa            -> Gipuzkoa
#   Vizcaya              -> Bizkaia


In [None]:
# List the distinct province names found in the population data.
df_communities["PROVINCIA"].unique()


In [None]:
# Preview the first rows of the population data.
df_communities.head()

In [None]:
# English column names for the population data; they are applied to the
# frame positionally.
columns_communities_english = [
    'province_code',
    'province',
    'municipality_code',
    'municipality_name',
    'population',
    'male',
    'female',
]

# English column names for the coordinates data; they are applied to the
# frame positionally.
columns_coordinates_english = [
    'community',
    'province',
    'municipality_name',
    'latitude',
    'longitude',
    'altitude',
    'population',
    'male',
    'female',
]


def map_cols_es_en(es: list, en: list) -> list:
    """Pair the Spanish column names with their English replacements by position.

    Args:
        es: Current (Spanish) column names, in DataFrame column order.
        en: English names, listed in the same order as ``es``.

    Returns:
        A list holding one English name per entry of ``es``. If the inputs
        differ in length, the result is cut to the shorter of the two.
    """
    # Pair positionally rather than through a dict keyed on ``es``: duplicate
    # source names would collapse into a single key and silently drop one of
    # the English names, leaving fewer names than columns.
    return [english for _, english in zip(es, en)]


# Rename the columns of both frames to English. The names are matched to the
# existing columns by position, so each English list must follow the column
# order of its frame.
df_communities.columns = map_cols_es_en(df_communities.columns.to_list(), columns_communities_english)
df_coordinates.columns = map_cols_es_en(df_coordinates.columns.to_list(), columns_coordinates_english)

In [None]:
# Show five random rows of the coordinates data.
df_coordinates.sample(5)

In [None]:
# Province spellings in the coordinates data and their replacements,
# applied one after another in the order listed.
province_renames = (
    ('Vizcaya', 'Bizkaia'),
    ('Alicante/Alacant', 'Alicante'),
    ('Castellón/Castelló', 'Castellón'),
    ('Valencia/València', 'Valencia'),
    ('Guipúzcoa', 'Gipuzkoa'),
)

for old_name, new_name in province_renames:
    df_coordinates = replace_with(df_coordinates, 'province', old_name, new_name)

In [None]:
# Run the accent cleaner over the name columns of both frames.
# NOTE(review): presumably this adds 'municipality_name_clean' and
# 'province_clean' columns in place, since later cells read them — confirm
# in scripts/accent_cleaner.py.
cleaner = AccentCleaner([df_communities, df_coordinates], ['municipality_name', 'province'])
cleaner.cleanAccents()

# Align the cleaned municipality names of the two frames.
# The pattern matches a literal '*' followed by ',' and one whitespace
# character.
# NOTE(review): confirm that a literal asterisk is intended here rather
# than '.*'.
aligner = ColumnAligner(df_communities, df_coordinates, 'municipality_name_clean', re.compile(r"\*,\s"))
aligner.alignColumns()

# Process the cleaned municipality names at the '/' character.
# NOTE(review): presumably this handles bilingual 'name/name' values —
# confirm which part split_at_char keeps.
df_communities = split_at_char(df_communities, 'municipality_name_clean', '/')
df_coordinates = split_at_char(df_coordinates, 'municipality_name_clean', '/')

In [None]:
# Spot-check: coordinate rows whose cleaned municipality name is "arantzazu".
df_coordinates[df_coordinates['municipality_name_clean'] == "arantzazu"]


In [None]:
# Trim surrounding whitespace and lower-case both join keys in each frame,
# so the merge below compares identically formatted strings.
for frame in (df_communities, df_coordinates):
    for key_column in ('municipality_name_clean', 'province_clean'):
        frame[key_column] = frame[key_column].str.strip().str.lower()

# Left join: every population row is kept; the geographic columns stay NaN
# where no coordinates row shares the same municipality and province.
merge_keys = ['municipality_name_clean', 'province_clean']
geo_columns = df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']]
df = pd.merge(df_communities, geo_columns, on=merge_keys, how='left')

# df = pd.merge(df_coordinates[['altitude', 'longitude', 'latitude', 'municipality_name_clean', 'province_clean']], df_communities, 
#               on=['municipality_name_clean', 'province_clean'], 
#               how='left')

In [None]:
# Preview the first 10 rows of the merged frame.
df.head(10)

In [None]:
# Collect the rows with at least one missing value (for example population
# rows that found no coordinates in the left merge), report how many there
# are, then drop them from `df`.
nan_rows = df[df.isna().any(axis=1)]
print(f"Rows with missing values: {len(nan_rows)} of {len(df)}")
df.dropna(inplace=True)

# Last expression of the cell, so the notebook renders the affected rows
# (`dropna(inplace=True)` returns None and would display nothing).
nan_rows

In [None]:
# Preview the first 10 rows of `df` again.
df.head(10)

## Problems

Spelling differs between the two datasets, so we might have to remove the accents first.

In [None]:
# Find municipalities whose name matches 'Val...ncia' (case-insensitive), which
# covers both the 'Valencia' and 'València' spellings; missing names count
# as "no match".
df_communities[df_communities['municipality_name'].str.contains(r'Val.*ncia', case=False, na=False)]

In [None]:
title = "Communities in Spain by population size"

# One marker per municipality; marker size and colour both encode population.
fig = px.scatter_map(
    df,
    lat="latitude",
    lon="longitude",
    hover_data=["municipality_name", "altitude"],
    size="population",
    color="population",
    color_continuous_scale=px.colors.carto.Aggrnyl,
    zoom=5,  # the view is framed through `zoom`; `update_geos` only affects geo subplots
    size_max=50,  # Increase max size of markers
    # scatter_map draws on the `map` subplot, so the base map is chosen with
    # `map_style`; `mapbox_style` only applies to the older scatter_mapbox
    # traces and has no effect on this figure.
    map_style="open-street-map",
    title=title,
    height=1000,
    width=1000,
)

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_layout(
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
    coloraxis_colorbar=dict(title="Population"),
)

fig.show()

In [None]:
# Spot-check one municipality, then summarise the frame and show a random row.
# NOTE(review): a notebook cell renders only its last expression, so the
# result of this query is presumably not shown — confirm whether it should
# be displayed explicitly.
df.query("municipality_name_clean == 'villatuelda'")
df.info()

df.sample()

In [None]:
def get_zeros(stringlength, fill_length):
    """Return the run of '0' characters needed to left-pad a string.

    Args:
        stringlength: Current length of the string to pad.
        fill_length: Target length after padding.

    Returns:
        ``fill_length - stringlength`` zeros, or an empty string when the
        string is already long enough. The result is always a ``str``, so it
        can be concatenated without a guard at the call site.
    """
    # Clamp at zero so "already long enough" yields '' instead of None.
    return '0' * max(fill_length - stringlength, 0)


# Left-pad the codes with zeros: province code to 2 characters, municipality
# code to 3. `str.zfill` does this for the whole column at once instead of a
# Python-level call per row.
df["province_code"] = df["province_code"].astype(str).str.zfill(2)
df["municipality_code"] = df["municipality_code"].astype(str).str.zfill(3)

# Concatenate both padded codes and store the result as an integer.
# Note that the integer conversion drops leading zeros again
# (e.g. "01001" becomes 1001).
df["cmun"] = (df["province_code"] + df["municipality_code"]).astype("int32")

In [None]:
# Render the frame to check the padded codes and the new `cmun` column.
df

In [None]:
# Remove the two separate code columns, then print the resulting schema.
code_columns = ['province_code', 'municipality_code']
df = df.drop(columns=code_columns)
df.info()

In [None]:
# Spot-checks on the final frame: rows with a low `cmun` value and one
# municipality looked up by name.
# NOTE(review): a notebook cell renders only its last expression, so the
# result of the first query is presumably not shown — confirm whether it
# should be displayed explicitly.
df.query("cmun <= 20000")
df.query("municipality_name_clean == 'villatuelda'")

In [None]:
# Write the final frame to the processed-data folder, without the index column.
df.to_csv("../data/processed/filtered_municipalities.csv", index=False)