## Geographical Data and Main Information of Municipalities

### Main notes

* Matching municipality names between the two sources was difficult, because many municipalities are listed under names in more than one language.
* From the original dataset, we lost 97 municipalities that we were not able to match with coordinates.
* We dropped the population data from this dataset, as it is covered in a different notebook (demographics).

In [1]:
import sys
import os
import re
from unidecode import unidecode

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [2]:
# Both raw files are read with the same CSV dialect:
# ISO-8859-1 (a.k.a. "latin1") encoding, ';' as field separator, ',' as decimal mark.
csv_dialect = {"encoding": "ISO-8859-1", "sep": ";", "decimal": ","}

df_municipalities = pd.read_csv("../data/raw/population_towns.csv", **csv_dialect)
df_municipalities_coordinates = pd.read_csv("../data/raw/coordinates_towns_spain.csv", **csv_dialect)

In [3]:
df_municipalities.shape

(8132, 7)

In [4]:
df_municipalities_coordinates.shape

(8112, 9)

In [5]:
# Map the source column headers to the English snake_case names used in the rest of the notebook.
municipalities_column_names = {
    "PROVINCIA": "province",
    "NOMBRE": "municipality",
    "POB24": "population",
    "MUJERES": "female",
    "HOMBRES": "male",
}
df_municipalities = df_municipalities.rename(columns=municipalities_column_names)

In [6]:
# Map the coordinates file's column headers to the same English names used for df_municipalities.
coordinates_column_names = {
    "Comunidad": "autonomous_community",
    "Provincia": "province",
    "Población": "municipality",
    "Habitantes": "population",
    "Hombres": "male",
    "Mujeres": "female",
    "Latitud": "latitude",
    "Longitud": "longitude",
    "Altitud": "altitude",
}
df_municipalities_coordinates = df_municipalities_coordinates.rename(columns=coordinates_column_names)

In [7]:
# Remove the population / sex breakdown columns from both frames.
demographic_columns = ["population", "female", "male"]
df_municipalities = df_municipalities.drop(columns=demographic_columns)
df_municipalities_coordinates = df_municipalities_coordinates.drop(columns=demographic_columns)

In [8]:
# Province spellings to rewrite in the coordinates frame (old value -> new value),
# presumably so they agree with the spelling used in df_municipalities — TODO confirm.
# replace_with is a project helper from scripts.utils; it is applied once per pair, in order.
province_renames = [
    ('Vizcaya', 'Bizkaia'),
    ('Alicante/Alacant', 'Alicante'),
    ('Castellón/Castelló', 'Castellón'),
    ('Valencia/València', 'Valencia'),
    ('Guipúzcoa', 'Gipuzkoa'),
]
for old_name, new_name in province_renames:
    df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', old_name, new_name)

In [9]:
# Rewrite 'Catalunya' as 'Cataluña' in the autonomous_community column of the coordinates frame.
# replace_with is a project helper (scripts.utils); presumably it replaces matching values
# in the named column and returns the frame — verify against its definition.
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'autonomous_community', 'Catalunya', 'Cataluña')

# Split columns and clean

In [10]:
# Bilingual names are written as "name_a/name_b": split on the first '/' only.
name_parts = df_municipalities['municipality'].str.split('/', n=1, expand=True)
df_municipalities['municipality_1'] = name_parts[0]
# Rows without a second name get the placeholder string "none" instead of a missing value.
df_municipalities['municipality_2'] = name_parts[1].fillna("none")


In [11]:
# Same split for the coordinates frame: "name_a/name_b" -> municipality_1 / municipality_2,
# cutting at the first '/' only.
coordinate_name_parts = df_municipalities_coordinates['municipality'].str.split('/', n=1, expand=True)
df_municipalities_coordinates['municipality_1'] = coordinate_name_parts[0]
# Rows without a second name get the placeholder string "none" instead of a missing value.
df_municipalities_coordinates['municipality_2'] = coordinate_name_parts[1].fillna("none")

In [12]:
def clean_column(column):
    """Normalize a Series of names for matching.

    Each non-missing value is converted to ``str``, transliterated to ASCII with
    ``unidecode``, stripped of surrounding whitespace and lower-cased.
    Missing values are returned unchanged.
    """
    def _normalize(value):
        if pd.isna(value):
            return value
        return unidecode(str(value)).strip().lower()

    return column.apply(_normalize)

# Normalize both name variants in both frames (df_municipalities first, then the coordinates frame).
for frame in (df_municipalities, df_municipalities_coordinates):
    for name_column in ("municipality_1", "municipality_2"):
        frame[name_column] = clean_column(frame[name_column])

In [13]:
# Remove separators from both name variants so that spelling differences in spacing
# and punctuation do not prevent a match (e.g. "alegria-dulantzi" -> "alegriadulantzi").
# Replacements are applied in the listed order, one column at a time.
# regex=False states the literal intent explicitly: "(" and ")" are regex metacharacters,
# and the default value of `regex` differs between pandas 1.x and pandas 2.x.
# The final "ñ" -> "n" rule is kept for parity with the previous version of this cell.
literal_replacements = ((" ", ""), (",", ""), ("(", ""), (")", ""), ("-", ""), ("ñ", "n"))
for name_column in ("municipality_1", "municipality_2"):
    for old_text, new_text in literal_replacements:
        df_municipalities[name_column] = df_municipalities[name_column].str.replace(
            old_text, new_text, regex=False
        )

In [14]:
# Remove separators from both name variants of the coordinates frame so that spelling
# differences in spacing and punctuation do not prevent a match.
# Replacements are applied in the listed order, one column at a time.
# regex=False states the literal intent explicitly: "(" and ")" are regex metacharacters,
# and the default value of `regex` differs between pandas 1.x and pandas 2.x.
# The final "ñ" -> "n" rule is kept for parity with the previous version of this cell.
coordinate_literal_replacements = ((" ", ""), (",", ""), ("(", ""), (")", ""), ("-", ""), ("ñ", "n"))
for name_column in ("municipality_1", "municipality_2"):
    for old_text, new_text in coordinate_literal_replacements:
        df_municipalities_coordinates[name_column] = df_municipalities_coordinates[name_column].str.replace(
            old_text, new_text, regex=False
        )

# Merging DFs

In [15]:
# Write the string "language" into a "language" column for rows whose municipality_2 is missing.
# NOTE(review): municipality_2 was filled with the string "none" earlier in the notebook, so this
# mask selects no rows and the column ends up entirely NaN (see the language_x / language_y
# counts in the isna() summaries further down). Confirm whether this cell is still needed.
for frame in (df_municipalities, df_municipalities_coordinates):
    frame.loc[frame["municipality_2"].isna(), "language"] = "language"

In [16]:
# Pass 1: match rows whose first name variant (municipality_1) is identical in both frames.
# how="left" keeps every df_municipalities row; rows without a match get NaN in the coordinate columns.
# NOTE(review): the join key is the normalized name only — province is not part of the key, so
# equally named municipalities in different provinces would be matched to each other. Confirm.
merge_1 = df_municipalities.merge(df_municipalities_coordinates, on = "municipality_1", how="left")

In [17]:
# Keep only the rows that received coordinates in pass 1 (latitude is NaN for unmatched rows).
merge_1 = merge_1.dropna(subset=['latitude'])

In [18]:
# Pass 2: match the first name variant of df_municipalities against the SECOND name variant
# of the coordinates frame. Because the key columns differ, the overlapping columns of both
# frames come back with _x / _y suffixes.
merge_2 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_1", right_on="municipality_2", how="left")

In [19]:
# Keep only the rows that received coordinates in pass 2.
merge_2 = merge_2.dropna(subset=['latitude'])

In [20]:
# Stack the matches of pass 1 and pass 2. The two frames do not have identical columns
# (pass 1 has a single municipality_1 key column, pass 2 has municipality_1_x / _y),
# so concat fills the columns missing on either side with NaN.
merged_df = pd.concat([merge_1, merge_2])

In [21]:
merged_df.shape

(8069, 17)

In [22]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1            28
municipality_2_x           0
language_x              8069
autonomous_community       0
province_y                 0
municipality_y             0
latitude                   0
longitude                  0
altitude                   0
municipality_2_y           0
language_y              8069
municipality_1_x        8041
municipality_1_y        8041
dtype: int64

In [23]:
merged_df.shape

(8069, 17)

In [24]:
# Pass 3: match the SECOND name variant of df_municipalities against the first name variant
# of the coordinates frame.
merge_3 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_2", right_on="municipality_1", how="left")

In [25]:
# Keep only the rows that received coordinates in pass 3.
merge_3 = merge_3.dropna(subset=['latitude'])

In [26]:
# Append the pass-3 matches to the accumulated result (rows may repeat across passes).
merged_df = pd.concat([merged_df, merge_3])

In [27]:
merged_df.shape

(8099, 17)

In [28]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1            58
municipality_2_x           0
language_x              8099
autonomous_community       0
province_y                 0
municipality_y             0
latitude                   0
longitude                  0
altitude                   0
municipality_2_y           0
language_y              8099
municipality_1_x        8041
municipality_1_y        8041
dtype: int64

In [29]:
# From here on only rows that actually have a second name variant are needed:
# drop every row whose municipality_2 is the "none" placeholder, in both frames.
# Note that this overwrites the two source frames.
has_second_name_coordinates = df_municipalities_coordinates["municipality_2"].ne("none")
df_municipalities_coordinates = df_municipalities_coordinates[has_second_name_coordinates]

has_second_name_municipalities = df_municipalities["municipality_2"].ne("none")
df_municipalities = df_municipalities[has_second_name_municipalities]


In [30]:
# Pass 4: match the second name variant of both frames against each other
# (both frames were just restricted to rows that have a real second name).
merge_4 = df_municipalities.merge(
    df_municipalities_coordinates,
    on="municipality_2",
    how="left",
)

In [31]:
merge_4.isna().sum()

CPRO                      0
province_x                0
CMUN                      0
municipality_x            0
municipality_1_x          0
municipality_2            0
language_x              126
autonomous_community     62
province_y               62
municipality_y           62
latitude                 62
longitude                62
altitude                 62
municipality_1_y         62
language_y              126
dtype: int64

In [32]:
# Keep only the rows that received coordinates in pass 4.
merge_4 = merge_4.dropna(subset=['latitude'])

In [33]:
# Append the pass-4 matches to the accumulated result.
# This cell only concatenates; no rows are removed here.
merged_df = pd.concat([merged_df, merge_4])

In [34]:
merged_df.duplicated().sum()

22

In [35]:
# Preview only: the result is displayed but not assigned, so merged_df is left unchanged
# and still contains the fully duplicated rows counted above.
merged_df.drop_duplicates()

Unnamed: 0,CPRO,province_x,CMUN,municipality_x,municipality_1,municipality_2_x,language_x,autonomous_community,province_y,municipality_y,latitude,longitude,altitude,municipality_2_y,language_y,municipality_1_x,municipality_1_y,municipality_2
0,1,Álava,1,Alegría-Dulantzi,alegriadulantzi,none,,País Vasco,Álava,Alegría-Dulantzi,42.84149,-2.513507,561.6857,none,,,,
1,1,Álava,2,Amurrio,amurrio,none,,País Vasco,Álava,Amurrio,43.05265,-3.001022,219.6910,none,,,,
2,1,Álava,3,Aramaio,aramaio,none,,País Vasco,Álava,Aramaio,43.05400,-2.566000,381.8797,none,,,,
3,1,Álava,4,Artziniega,artziniega,none,,País Vasco,Álava,Artziniega,43.12220,-3.128209,196.9808,none,,,,
4,1,Álava,6,Armiñón,arminon,none,,País Vasco,Álava,Armiñón,42.72305,-2.872574,463.5815,none,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,31,Navarra,260,Valle de Yerri/Deierri,,,,Navarra,Navarra,Valle de Yerri/Deierri,42.71667,-2.000000,508.0046,,,valledeyerri,valledeyerri,deierri
116,31,Navarra,907,Zizur Mayor/Zizur Nagusia,,,,Navarra,Navarra,Zizur Mayor/Zizur Nagusia,42.78680,-1.690832,441.8872,,,zizurmayor,zizurmayor,zizurnagusia
119,46,Valencia,175,Montitxelvo/Montichelvo,,,,Valencia,Valencia,Montitxelvo/Montichelvo,38.89122,-0.338918,267.0021,,,montitxelvo,montitxelvo,montichelvo
124,48,Bizkaia,22,Karrantza Harana/Valle de Carranza,,,,País Vasco,Bizkaia,Karrantza Harana/Valle de Carranza,43.22528,-3.359444,173.5333,,,karrantzaharana,karrantzaharana,valledecarranza


In [36]:
# Keep the first row for every (municipality_x, municipality_y) pair of original names,
# i.e. the match coming from the earliest pass in the concatenation order.
# NOTE(review): the key contains names only. If two municipalities in different provinces
# share the same pair of names, only the first one survives — confirm this is intended.
merged_df = merged_df.drop_duplicates(subset=['municipality_x', 'municipality_y'])

In [37]:
merged_df.shape

(8022, 18)

In [38]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1            32
municipality_2_x           0
language_x              8022
autonomous_community       0
province_y                 0
municipality_y             0
latitude                   0
longitude                  0
altitude                   0
municipality_2_y           0
language_y              8022
municipality_1_x        7990
municipality_1_y        7990
municipality_2          8022
dtype: int64

---

## Now clean and match on municipalities SPA

In [39]:
merged_df.columns

Index(['CPRO', 'province_x', 'CMUN', 'municipality_x', 'municipality_1',
       'municipality_2_x', 'language_x', 'autonomous_community', 'province_y',
       'municipality_y', 'latitude', 'longitude', 'altitude',
       'municipality_2_y', 'language_y', 'municipality_1_x',
       'municipality_1_y', 'municipality_2'],
      dtype='object')

In [40]:
# Keep the codes, the original name, the name-matching helpers and the geographic attributes.
# Dropped here: province_y, municipality_y, municipality_2_x, municipality_1_x and the two
# language_* columns.
merged_df = merged_df[[
    'CPRO',
    'province_x',
    'CMUN',
    'municipality_x',
    'municipality_1',
    'municipality_2',
    'municipality_1_y',
    'municipality_2_y',
    'autonomous_community',
    'latitude',
    'longitude',
    'altitude',
]]

In [41]:
# Rows that came from the cross-variant passes have no value in the shared
# municipality_1 / municipality_2 key columns. For those rows, fall back to the
# name variants carried over from the coordinates frame (the *_y columns).
# The mask is computed once, before either column is filled.
mask = merged_df["municipality_1"].isna() & merged_df["municipality_2"].isna()

fallback_columns = (
    ("municipality_1", "municipality_1_y"),
    ("municipality_2", "municipality_2_y"),
)
for target_column, source_column in fallback_columns:
    merged_df.loc[mask, target_column] = merged_df.loc[mask, source_column]


In [42]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1             0
municipality_2          7990
municipality_1_y        7990
municipality_2_y           0
autonomous_community       0
latitude                   0
longitude                  0
altitude                   0
dtype: int64

In [43]:
merged_df.columns

Index(['CPRO', 'province_x', 'CMUN', 'municipality_x', 'municipality_1',
       'municipality_2', 'municipality_1_y', 'municipality_2_y',
       'autonomous_community', 'latitude', 'longitude', 'altitude'],
      dtype='object')

In [44]:
# The *_y helper columns have served their purpose: keep everything except
# municipality_1_y and municipality_2_y.
merged_df = merged_df[[
    'CPRO',
    'province_x',
    'CMUN',
    'municipality_x',
    'municipality_1',
    'municipality_2',
    'autonomous_community',
    'latitude',
    'longitude',
    'altitude',
]]

In [45]:
# Drop the merge suffix from the two columns that came from df_municipalities.
merged_df = merged_df.rename(
    columns={
        "province_x": "province",
        "municipality_x": "municipality",
    }
)

In [46]:
merged_df.shape

(8022, 10)

In [47]:
def get_zeros(stringlength, fill_length):
    """Return the '0' characters needed to left-pad a string to ``fill_length``.

    Parameters
    ----------
    stringlength : int
        Length of the string that is going to be padded.
    fill_length : int
        Desired total length after padding.

    Returns
    -------
    str
        ``fill_length - stringlength`` zeros, or an empty string when the string
        is already long enough. Always a ``str``, so the result can be
        concatenated without first checking the length.
    """
    return '0' * max(fill_length - stringlength, 0)


# Left-pad the province code (CPRO) to 2 characters and the municipality code (CMUN)
# to 3 characters with zeros, using the vectorized str.zfill instead of per-row lambdas.
# assign() returns a new frame, so nothing is written into a frame obtained by slicing.
merged_df = merged_df.assign(
    CPRO=merged_df["CPRO"].astype(str).str.zfill(2),
    CMUN=merged_df["CMUN"].astype(str).str.zfill(3),
)

# Combined code: padded CPRO followed by padded CMUN, stored as int32
# (the integer conversion drops any leading zero, e.g. "01" + "001" -> 1001).
merged_df["cmun"] = (merged_df["CPRO"] + merged_df["CMUN"]).astype("int32")

In [48]:
merged_df

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_1,municipality_2,autonomous_community,latitude,longitude,altitude,cmun
0,01,Álava,001,Alegría-Dulantzi,alegriadulantzi,,País Vasco,42.84149,-2.513507,561.68570,1001
1,01,Álava,002,Amurrio,amurrio,,País Vasco,43.05265,-3.001022,219.69100,1002
2,01,Álava,003,Aramaio,aramaio,,País Vasco,43.05400,-2.566000,381.87970,1003
3,01,Álava,004,Artziniega,artziniega,,País Vasco,43.12220,-3.128209,196.98080,1004
4,01,Álava,006,Armiñón,arminon,,País Vasco,42.72305,-2.872574,463.58150,1006
...,...,...,...,...,...,...,...,...,...,...,...
1875,12,Castellón,072,Llucena/Lucena del Cid,lucenadelcid,none,Valencia,40.13850,-0.279997,564.10640,12072
1945,12,Castellón,901,"Alqueries, les/Alquerías del Niño Perdido",alqueriasdelninoperdido,none,Valencia,39.89653,-0.113446,25.46689,12901
6997,46,Valencia,013,Alboraia/Alboraya,alboraya,none,Valencia,39.49935,-0.349783,10.95999,46013
7160,46,Valencia,176,Montroi/Montroy,montroy,none,Valencia,39.34037,-0.614546,142.12200,46176


In [50]:
# Final column layout: the combined cmun code first, then names, administrative
# units and coordinates. The separate CPRO and CMUN columns are dropped.
merged_df = merged_df[[
    'cmun',
    'municipality',
    'municipality_1',
    'municipality_2',
    'province',
    'autonomous_community',
    'latitude',
    'longitude',
    'altitude',
]]

In [51]:
merged_df.isna().sum()

cmun                       0
municipality               0
municipality_1             0
municipality_2          7990
province                   0
autonomous_community       0
latitude                   0
longitude                  0
altitude                   0
dtype: int64

In [52]:
# Spot check: exact-match lookup on the original (unsplit) municipality name.
# The recorded output below is an empty frame, i.e. no row is named exactly "Donostia".
merged_df[merged_df["municipality"] == "Donostia"]

Unnamed: 0,cmun,municipality,municipality_1,municipality_2,province,autonomous_community,latitude,longitude,altitude


In [53]:
# Persist the matched municipalities; index=False leaves the DataFrame index out of the file.
merged_df.to_csv("../data/processed/filtered_files/filtered_municipalities.csv", index=False)