## Geographical Data and Main Information of Municipalities

### Main notes

* We have had a lot of problems when matching different languages
* From the original dataset, we have lost 97 municipalities that we weren't able to match with coordinates. 
* We have dropped Population data from this dataset, as it will be covered in a different notebook (demographics)

In [155]:
import sys
import os
import re
from unidecode import unidecode

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [156]:
df_municipalities = pd.read_csv(
    "../data/raw/population_towns.csv", 
    encoding="ISO-8859-1",  # or "latin1",
    sep=";",
    decimal=",")
df_municipalities_coordinates = pd.read_csv(
    "../data/raw/coordinates_towns_spain.csv",
    encoding="ISO-8859-1",  # or "latin1"
    sep=";",
    decimal=","
)

In [157]:
df_municipalities.shape

(8132, 7)

In [158]:
df_municipalities_coordinates.shape

(8112, 9)

In [159]:
df_municipalities.rename(columns={
                          "PROVINCIA": "province",
                          "NOMBRE": "municipality",
                          "POB24": "population",
                          "MUJERES": "female",
                          "HOMBRES": "male" }, inplace=True)

In [160]:
df_municipalities_coordinates.rename(columns={"Comunidad": "autonomous_community",
                          "Provincia": "province",
                            "Población": "municipality",
                          "Habitantes": "population",
                            "Hombres": "male",
                            "Mujeres": "female",
                          "Latitud": "latitude",
                          "Longitud": "longitude",
                          "Altitud": "altitude" }, inplace=True)

In [161]:
df_municipalities = df_municipalities.drop(["population", "female", "male"], axis = 1)
df_municipalities_coordinates = df_municipalities_coordinates.drop(["population", "female", "male"], axis = 1)

In [162]:
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Vizcaya', 'Bizkaia')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Alicante/Alacant', 'Alicante')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Castellón/Castelló', 'Castellón')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Valencia/València', 'Valencia')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Guipúzcoa', 'Gipuzkoa')

In [163]:
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'autonomous_community', 'Catalunya', 'Cataluña')

# Split columns and clean

In [164]:
# Split municipality names into two columns
df_municipalities[['municipality_1', 'municipality_2']] = df_municipalities['municipality'].str.split('/', n=1, expand=True)

# Fill missing values in municipality_2 with "none" (as a string)
df_municipalities["municipality_2"] = df_municipalities["municipality_2"].fillna("none")


In [165]:
#split municipality names into two columns
df_municipalities_coordinates[['municipality_1', 'municipality_2']] = df_municipalities_coordinates['municipality'].str.split('/', n=1, expand=True)

# Fill missing values in municipality_2 with "none" (as a string)
df_municipalities_coordinates["municipality_2"] = df_municipalities_coordinates["municipality_2"].fillna("none")

In [166]:
# Function to clean a column (remove accents, normalize case)
def clean_column(column):
    return column.apply(lambda x: unidecode(str(x)).strip().lower() if pd.notna(x) else x)

# Apply cleaning to df_municipalities
df_municipalities["municipality_1"] = clean_column(df_municipalities["municipality_1"])
df_municipalities["municipality_2"] = clean_column(df_municipalities["municipality_2"])

# Apply cleaning to df_municipalities_coordinates
df_municipalities_coordinates["municipality_1"] = clean_column(df_municipalities_coordinates["municipality_1"])
df_municipalities_coordinates["municipality_2"] = clean_column(df_municipalities_coordinates["municipality_2"])

In [167]:
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace(" ","")
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace(",","")
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace("(","")
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace(")","")
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace("-","")
df_municipalities['municipality_1'] = df_municipalities['municipality_1'].str.replace("ñ","n")

df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace(" ","")
df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace(",","")
df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace("(","")
df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace(")","")
df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace("-","")
df_municipalities['municipality_2'] = df_municipalities['municipality_2'].str.replace("ñ","n")

In [168]:
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace(" ","")
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace(",","")
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace("(","")
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace(")","")
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace("-","")
df_municipalities_coordinates['municipality_1'] = df_municipalities_coordinates['municipality_1'].str.replace("ñ","n")

df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace(" ","")
df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace(",","")
df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace("(","")
df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace(")","")
df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace("-","")
df_municipalities_coordinates['municipality_2'] = df_municipalities_coordinates['municipality_2'].str.replace("ñ","n")

# Merging DFs

In [169]:
df_municipalities.loc[df_municipalities["municipality_2"].isna(), "language"] = "language"
df_municipalities_coordinates.loc[df_municipalities_coordinates["municipality_2"].isna(), "language"] = "language"

In [170]:
merge_1 = df_municipalities.merge(df_municipalities_coordinates, on = "municipality_1", how="left")

In [171]:
merge_1 = merge_1.dropna(subset=['latitude'])

In [172]:
merge_2 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_1", right_on="municipality_2", how="left")

In [173]:
merge_2 = merge_2.dropna(subset=['latitude'])

In [174]:
merged_df = pd.concat([merge_1, merge_2])

In [176]:
merged_df.shape

(8069, 17)

In [177]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1            28
municipality_2_x           0
language_x              8069
autonomous_community       0
province_y                 0
municipality_y             0
latitude                   0
longitude                  0
altitude                   0
municipality_2_y           0
language_y              8069
municipality_1_x        8041
municipality_1_y        8041
dtype: int64

In [141]:
merged_df.shape

(8069, 17)

In [178]:
merge_3 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_2", right_on="municipality_1", how="left")

In [179]:
merge_3 = merge_3.dropna(subset=['latitude'])

In [180]:
merged_df = pd.concat([merged_df, merge_3])

In [181]:
merged_df.shape

(8099, 17)

In [182]:
merged_df.isna().sum()

CPRO                       0
province_x                 0
CMUN                       0
municipality_x             0
municipality_1            58
municipality_2_x           0
language_x              8099
autonomous_community       0
province_y                 0
municipality_y             0
latitude                   0
longitude                  0
altitude                   0
municipality_2_y           0
language_y              8099
municipality_1_x        8041
municipality_1_y        8041
dtype: int64

In [185]:
df_municipalities_coordinates = df_municipalities_coordinates[df_municipalities_coordinates["municipality_2"] != "none"]
df_municipalities = df_municipalities[df_municipalities["municipality_2"] != "none"]


In [109]:
# Perform merges on all combinations
merge_4 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_2", right_on="municipality_2", how="left")

KeyboardInterrupt: 

In [None]:
merge_4 = merge_4.dropna(subset=['latitude'])

In [None]:
# Concatenate results and drop duplicates
merged_df = pd.concat([merge_1, merge_2, merge_3, merge_4])

In [None]:
merged_df.shape

(32042, 16)

In [452]:
merged_df.drop_duplicates().shape

(24073, 16)

In [438]:
merged_df.shape

(32664, 16)

In [None]:
# Perform merges on all combinations
# merge_1 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_1_clean", right_on="municipality_1_clean", how="left")

In [324]:
# merge_2 = merge_1.merge(df_municipalities_coordinates, left_on="municipality_2_clean_y", right_on="municipality_2_clean", how="left")

In [325]:
# merge_3 = merge_2.merge(df_municipalities_coordinates, left_on="municipality_1_clean_x", right_on="municipality_2_clean", how="left")

In [326]:
# merge_4 = df_municipalities.merge(df_municipalities_coordinates, left_on="municipality_2", right_on="municipality_2", how="outer")

In [327]:
# merged_df = pd.concat([merge_1, merge_2, merge_3, merge_4])

In [328]:
# merged_df = merged_df.drop_duplicates(subset=['municipality_1_clean'], keep='first')

In [329]:
# # Show all rows and columns
# pd.set_option('display.max_rows', None)  # Show all rows
# pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.expand_frame_repr', False)  # Prevent column wrapping


# merged_df[merged_df["latitude"].isna()]

In [330]:
# df_municipalities_coordinates = df_municipalities_coordinates.drop_duplicates(subset=['municipality_1'], keep='first')

In [331]:

# Concatenate results and drop duplicates
# merged_df = pd.concat([merge_1, merge_2, merge_3, merge_4]).drop_duplicates().reset_index(drop=True)

# One by one

In [386]:
language_provinces = ['Navarra', 'Álava', 'Gipuzkoa', 'Bizkaia', 'Alicante', 'Castellón', 'Valencia']

In [339]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Navarra")]

In [340]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Álava")]

In [341]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Gipuzkoa")]

In [342]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Bizkaia")]

In [343]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Alicante")]

In [387]:
df_municipalities_coordinates.loc[(df_municipalities_coordinates["province"] == "Castellón")]

Unnamed: 0,autonomous_community,province,municipality,latitude,longitude,altitude,municipality_1,municipality_2
7711,Valencia,Castellón,Aín,39.90086,-0.3408,497.9523,ain,
7712,Valencia,Castellón,Albocàsser,40.35626,0.025362,534.062,albocasser,
7713,Valencia,Castellón,Alcalà de Xivert,40.30459,0.226305,159.6516,alcaladexivert,
7714,Valencia,Castellón,Alcora (l'),40.07278,-0.213024,274.3336,alcoral',
7715,Valencia,Castellón,Alcudia de Veo,39.91713,-0.355633,476.4656,alcudiadeveo,
7716,Valencia,Castellón,Alfondeguilla,39.83725,-0.26879,217.8936,alfondeguilla,
7717,Valencia,Castellón,Algimia de Almonacid,39.91433,-0.443457,496.8173,algimiadealmonacid,
7718,Valencia,Castellón,Almazora/Almassora,39.94497,-0.063318,37.78706,almazora,almassora
7719,Valencia,Castellón,Almedíjar,39.87085,-0.410406,400.144,almedijar,
7720,Valencia,Castellón,Almenara,39.7544,-0.222755,39.3216,almenara,


In [None]:
municipalities_to_change = [
    
    # Álava
        "Iruña Oka", "Erriberagoitia", 
    # Gipuzkoa
        "Arrasate", "Soraluze", 
    # Bizkaia
        "Karrantza Harana", "Urduña"
    # Alicante
        "Fondó de les Neus (el)", "Pinós (el)", 
    #
        "Montitxelvo", "Peníscola", "Borriana", "Sant Jordi","Harana", 
     # Navarra
        'Abaurregaina', 'Abaurrepea', 'Altsasu', 
       'Auritz', 'Bera', 
       'Doneztebe', 'Hiriberri',
       'Luzaide', 'Olazti',
        'Orreaga' ]

In [21]:
# df_municipalities_coordinates.loc[(df_municipalities_coordinates["language"]) & (df_municipalities_coordinates["province"] == "Valencia")]

In [22]:
municipalities_to_change = ["Iruña Oka", "Harana", "Erriberagoitia", "Fondó de les Neus (el)", "Pinós (el)", "Montitxelvo", "Peníscola", "Borriana", "Sant Jordi", "Arrasate", "Soraluze", "Karrantza Harana", "Urduña"
                            'Abaurregaina', 'Abaurrepea', 'Aibar', 'Altsasu', 'Aoiz', 'Arce',
       'Auritz', 'Bera', 'Burgui',
       'Doneztebe', 'Esparza de Salazar', 'Hiriberri',
       'Luzaide', 'Olazti',
        'Orreaga' ]

# Assign "change" to the new column where language=True and municipality_1 is in the list
df_municipalities_coordinates.loc[
    (df_municipalities_coordinates["language"]) & 
    (df_municipalities_coordinates["municipality_spa"].isin(municipalities_to_change)), 
    "change"
] = "change"


In [23]:
df_municipalities_coordinates.loc[df_municipalities_coordinates["change"] == "change", ["municipality_spa", "municipality_2"]] = df_municipalities_coordinates.loc[df_municipalities_coordinates["change"] == "change", ["municipality_2", "municipality_spa"]].values


After all of this changes --> municipality_1 is in spanish / municipality_2 is in the province language

___

Let's do the same with the towns db

In [24]:
df_municipalities.head()

Unnamed: 0,CPRO,province,CMUN,municipality
0,1,Álava,1,Alegría-Dulantzi
1,1,Álava,2,Amurrio
2,1,Álava,3,Aramaio
3,1,Álava,4,Artziniega
4,1,Álava,6,Armiñón


In [25]:
df_municipalities[['municipality_spa', 'municipality_2']] = df_municipalities['municipality'].str.split('/', n=1, expand=True)
df_municipalities.head()

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_spa,municipality_2
0,1,Álava,1,Alegría-Dulantzi,Alegría-Dulantzi,
1,1,Álava,2,Amurrio,Amurrio,
2,1,Álava,3,Aramaio,Aramaio,
3,1,Álava,4,Artziniega,Artziniega,
4,1,Álava,6,Armiñón,Armiñón,


In [26]:
df_municipalities["language"] = df_municipalities["municipality_2"].notna()
df_municipalities.head()

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_spa,municipality_2,language
0,1,Álava,1,Alegría-Dulantzi,Alegría-Dulantzi,,False
1,1,Álava,2,Amurrio,Amurrio,,False
2,1,Álava,3,Aramaio,Aramaio,,False
3,1,Álava,4,Artziniega,Artziniega,,False
4,1,Álava,6,Armiñón,Armiñón,,False


In [27]:
df_municipalities[df_municipalities["municipality_2"].notna()]["province"].unique()

array(['Álava', 'Alicante', 'Castellón', 'Gipuzkoa', 'Navarra',
       'Valencia', 'Bizkaia'], dtype=object)

In [28]:
municipalities_to_change = [
    
#álava
"Iruña Oka", "Harana",
     
#alicante
'Alcoi', 'Alacant', 'Poble Nou de Benitatxell, el',
       'Camp de Mirra, el', 'Elx', 'Fondó de les Neus, el', 'Xàbia',
       'Xixona', "Orxa, l'", 'Monòver', 'Pinós, el',
       'Sant Vicent del Raspeig', 'Torre de les Maçanes, la',
       'Vila Joiosa, la', 
#castellón
       'Benicàssim', 'Borriana',  'Xodos', 'Llucena', 'Orpesa',
       'Peníscola', 'Sant Jordi', 'Suera', 'Useres, les', 'Vilafranca',
       'Alqueries, les',
#gipuzkoa
       "Arrasate", "Donostia"

#navarra
       'Abaurregaina', 'Abaurrepea', 'Altsasu', 'Auritz', 'Olazti', 'Orreaga',  'Sangüesa', 'Doneztebe',

#valencia
'Alboraia', 'Moixent', 'Montitxelvo', 'Montroi', 'Nàquera',
       'Sagunt', 'Vilallonga'

#Bizkaia
"Karrantza Harana", "Urduña"]


In [29]:
# Assign "change" to the new column where language=True and municipality_1 is in the list
df_municipalities.loc[
    (df_municipalities["language"]) & 
    (df_municipalities["municipality_spa"].isin(municipalities_to_change)), 
    "change"
] = "change"

In [30]:
df_municipalities.loc[df_municipalities["change"] == "change", ["municipality_spa", "municipality_2"]] = df_municipalities.loc[df_municipalities["change"] == "change", ["municipality_2", "municipality_spa"]].values


---

## Now clean and match on municipalities SPA

In [31]:
cleaner = AccentCleaner([df_municipalities, df_municipalities_coordinates], ['municipality_spa'])
cleaner.cleanAccents()

In [32]:
# aligner = ColumnAligner(df_municipalities, df_municipalities_coordinates, 'municipality_clean', re.compile(r"\*,\s"))
# aligner.alignColumns()


In [33]:
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace(" ","")
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace(",","")
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace("(","")
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace(")","")
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace("-","")
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace("ñ","n")

In [34]:
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace(" ","")
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace(",","")
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace("(","")
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace(")","")
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace("-","")
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace("ñ","n")

In [35]:
df_municipalities.head()

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_spa,municipality_2,language,change,municipality_spa_clean
0,1,Álava,1,Alegría-Dulantzi,Alegría-Dulantzi,,False,,alegriadulantzi
1,1,Álava,2,Amurrio,Amurrio,,False,,amurrio
2,1,Álava,3,Aramaio,Aramaio,,False,,aramaio
3,1,Álava,4,Artziniega,Artziniega,,False,,artziniega
4,1,Álava,6,Armiñón,Armiñón,,False,,arminon


In [36]:
df_municipalities_coordinates.head()

Unnamed: 0,autonomous_community,province,municipality,latitude,longitude,altitude,municipality_spa,municipality_2,language,change,municipality_spa_clean
0,Andalucía,Almería,Abla,37.14114,-2.780104,871.1684,Abla,,False,,abla
1,Andalucía,Almería,Abrucena,37.13305,-2.797098,976.9387,Abrucena,,False,,abrucena
2,Andalucía,Almería,Adra,36.74807,-3.022522,10.97898,Adra,,False,,adra
3,Andalucía,Almería,Albánchez,37.2871,-2.181163,481.3123,Albánchez,,False,,albanchez
4,Andalucía,Almería,Alboloduy,37.03319,-2.62175,388.4346,Alboloduy,,False,,alboloduy


In [37]:
df_municipalities_coordinates['municipality_spa_clean'] = df_municipalities_coordinates['municipality_spa_clean'].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
df_municipalities['municipality_spa_clean'] = df_municipalities['municipality_spa_clean'].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces


In [38]:
df = df_municipalities.merge(df_municipalities_coordinates, on = "municipality_spa_clean", how='left')

In [39]:
df.isna().sum()

CPRO                         0
province_x                   0
CMUN                         0
municipality_x               0
municipality_spa_x           0
municipality_2_x          8040
language_x                   0
change_x                  8124
municipality_spa_clean       0
autonomous_community       117
province_y                 117
municipality_y             117
latitude                   117
longitude                  117
altitude                   117
municipality_spa_y         117
municipality_2_y          8093
language_y                 117
change_y                  8151
dtype: int64

In [40]:
df.duplicated().sum()

0

In [41]:
df.shape

(8166, 19)

In [42]:
df.columns

Index(['CPRO', 'province_x', 'CMUN', 'municipality_x', 'municipality_spa_x',
       'municipality_2_x', 'language_x', 'change_x', 'municipality_spa_clean',
       'autonomous_community', 'province_y', 'municipality_y', 'latitude',
       'longitude', 'altitude', 'municipality_spa_y', 'municipality_2_y',
       'language_y', 'change_y'],
      dtype='object')

In [43]:
df = df[['CPRO', 'province_x', 'CMUN', 'municipality_x', 'municipality_spa_x',
       'autonomous_community',   'latitude',
       'longitude', 'altitude']]

In [44]:
df.rename(columns={      "province_x": "province",
                          "municipality_x": "municipality", 
                          'municipality_spa_x': "municipality_spa"}, inplace=True)

In [45]:
df.columns

Index(['CPRO', 'province', 'CMUN', 'municipality', 'municipality_spa',
       'autonomous_community', 'latitude', 'longitude', 'altitude'],
      dtype='object')

In [46]:
df.shape

(8166, 9)

In [47]:
def get_zeros(stringlength, fill_length):
    if stringlength < fill_length:
        return '0'* (fill_length - stringlength)
    return None


df["CPRO"] = df["CPRO"].astype(str)
df["CMUN"] = df["CMUN"].astype(str)

df["CPRO"] = df["CPRO"].apply(lambda x: '0' + x if len(x) < 2 else x )
df["CMUN"] = df["CMUN"].apply(lambda x: get_zeros(len(x), 3) + x if len(x) < 3 else x)

df['cmun'] = df["CPRO"] + df["CMUN"]
df.cmun = df.cmun.astype("int32")

In [48]:
df

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_spa,autonomous_community,latitude,longitude,altitude,cmun
0,01,Álava,001,Alegría-Dulantzi,Alegría-Dulantzi,País Vasco,42.84149,-2.513507,561.68570,1001
1,01,Álava,002,Amurrio,Amurrio,País Vasco,43.05265,-3.001022,219.69100,1002
2,01,Álava,003,Aramaio,Aramaio,País Vasco,43.05400,-2.566000,381.87970,1003
3,01,Álava,004,Artziniega,Artziniega,País Vasco,43.12220,-3.128209,196.98080,1004
4,01,Álava,006,Armiñón,Armiñón,País Vasco,42.72305,-2.872574,463.58150,1006
...,...,...,...,...,...,...,...,...,...,...
8161,50,Zaragoza,901,Biel,Biel,Aragón,42.38749,-0.936588,754.24450,50901
8162,50,Zaragoza,902,Marracos,Marracos,Aragón,42.09059,-0.776047,404.73610,50902
8163,50,Zaragoza,903,Villamayor de Gállego,Villamayor de Gállego,Aragón,41.68518,-0.773315,231.12340,50903
8164,51,Ceuta,001,Ceuta,Ceuta,Ceuta y Melilla,35.88829,-5.316195,13.47725,51001


In [49]:
df.columns

Index(['CPRO', 'province', 'CMUN', 'municipality', 'municipality_spa',
       'autonomous_community', 'latitude', 'longitude', 'altitude', 'cmun'],
      dtype='object')

In [50]:
df = df[['cmun', 'municipality_spa', 'province',  
       'autonomous_community', 'latitude', 'longitude', 'altitude', 'municipality']]

In [51]:
df.isna().sum()

cmun                      0
municipality_spa          0
province                  0
autonomous_community    117
latitude                117
longitude               117
altitude                117
municipality              0
dtype: int64

In [52]:
df[df["latitude"].isna()]["province"].unique()

array(['Álava', 'Alicante', 'Almería', 'Ávila', 'Badajoz',
       'Illes Balears', 'Barcelona', 'Burgos', 'Cáceres', 'Cádiz',
       'Castellón', 'Córdoba', 'A Coruña', 'Cuenca', 'Girona', 'Granada',
       'Gipuzkoa', 'Huelva', 'Huesca', 'León', 'La Rioja', 'Madrid',
       'Málaga', 'Navarra', 'Pontevedra', 'Santa Cruz de Tenerife',
       'Segovia', 'Sevilla', 'Tarragona', 'Toledo', 'Valencia', 'Bizkaia',
       'Zamora', 'Zaragoza'], dtype=object)

In [53]:
df = df.drop(df[df["latitude"].isna() & df["province"].isin([
    'Almería', 'Ávila', 'Badajoz', 'Illes Balears', 'Barcelona', 'Burgos',
    'Cáceres', 'Cádiz', 'Córdoba', 'A Coruña', 'Cuenca', 'Girona', 'Granada',
    'Huelva', 'Huesca', 'León', 'La Rioja', 'Madrid', 'Málaga', 'Pontevedra',
    'Santa Cruz de Tenerife', 'Segovia', 'Sevilla', 'Tarragona', 'Toledo',
    'Zamora', 'Zaragoza'
])].index)



In [54]:
df.isna().sum()

cmun                     0
municipality_spa         0
province                 0
autonomous_community    63
latitude                63
longitude               63
altitude                63
municipality             0
dtype: int64

In [55]:
df.dropna(subset=["latitude"], inplace=True)


In [62]:
df[df["municipality"] == "Donostia"]

Unnamed: 0,cmun,municipality_spa,province,autonomous_community,latitude,longitude,altitude,municipality


In [156]:
df.to_csv("../data/processed/filtered_municipalities.csv", index=False)