## Geographical Data and Main Information of Municipalities

### Main notes

* We have had a lot of problems when matching different languages
* From the original dataset, we have lost 97 municipalities that we weren't able to match with coordinates. 
* We have dropped Population data from this dataset, as it will be covered in a different notebook (demographics)

In [360]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [361]:
df_municipalities = pd.read_csv(
    "../data/raw/population_towns.csv", 
    encoding="ISO-8859-1",  # or "latin1",
    sep=";",
    decimal=",")
df_municipalities_coordinates = pd.read_csv(
    "../data/raw/coordinates_towns_spain.csv",
    encoding="ISO-8859-1",  # or "latin1"
    sep=";",
    decimal=","
)

In [362]:
df_municipalities.head()

Unnamed: 0,CPRO,PROVINCIA,CMUN,NOMBRE,POB24,HOMBRES,MUJERES
0,1,Álava,1,Alegría-Dulantzi,2971,1531,1440
1,1,Álava,2,Amurrio,10330,5149,5181
2,1,Álava,3,Aramaio,1381,709,672
3,1,Álava,4,Artziniega,1856,913,943
4,1,Álava,6,Armiñón,247,127,120


In [363]:
df_municipalities_coordinates.head()

Unnamed: 0,Comunidad,Provincia,Población,Latitud,Longitud,Altitud,Habitantes,Hombres,Mujeres
0,Andalucía,Almería,Abla,37.14114,-2.780104,871.1684,1504,783,721
1,Andalucía,Almería,Abrucena,37.13305,-2.797098,976.9387,1341,682,659
2,Andalucía,Almería,Adra,36.74807,-3.022522,10.97898,24373,12338,12035
3,Andalucía,Almería,Albánchez,37.2871,-2.181163,481.3123,815,422,393
4,Andalucía,Almería,Alboloduy,37.03319,-2.62175,388.4346,674,334,340


In [364]:
df_municipalities.rename(columns={
                          "PROVINCIA": "province",
                          "NOMBRE": "municipality",
                          "POB24": "population",
                          "MUJERES": "female",
                          "HOMBRES": "male" }, inplace=True)

In [365]:
df_municipalities_coordinates.rename(columns={"Comunidad": "autonomous_community",
                          "Provincia": "province",
                            "Población": "municipality",
                          "Habitantes": "population",
                            "Hombres": "male",
                            "Mujeres": "female",
                          "Latitud": "latitude",
                          "Longitud": "longitude",
                          "Altitud": "altitude" }, inplace=True)

In [366]:
df_municipalities = df_municipalities.drop(["population", "female", "male"], axis = 1)
df_municipalities_coordinates = df_municipalities_coordinates.drop(["population", "female", "male"], axis = 1)

In [367]:
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Vizcaya', 'Bizkaia')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Alicante/Alacant', 'Alicante')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Castellón/Castelló', 'Castellón')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Valencia/València', 'Valencia')
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'province', 'Guipúzcoa', 'Gipuzkoa')

In [368]:
df_municipalities_coordinates = replace_with(df_municipalities_coordinates, 'autonomous_community', 'Catalunya', 'Cataluña')

In [369]:
cleaner = AccentCleaner([df_municipalities, df_municipalities_coordinates], ['municipality', 'province'])
cleaner.cleanAccents()

In [370]:
# aligner = ColumnAligner(df_municipalities, df_municipalities_coordinates, 'municipality_clean', re.compile(r"\*,\s"))
# aligner.alignColumns()


In [371]:
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace(" ","")
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace(",","")
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace("(","")
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace(")","")
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace("-","")
df_municipalities['municipality_clean'] = df_municipalities['municipality_clean'].str.replace("ñ","n")

In [372]:
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace(" ","")
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace(",","")
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace("(","")
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace(")","")
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace("-","")
df_municipalities_coordinates['municipality_clean'] = df_municipalities_coordinates['municipality_clean'].str.replace("ñ","n")

In [373]:
df_municipalities_coordinates["municipality_clean"] = df_municipalities_coordinates["municipality_clean"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
df_municipalities["municipality_clean"] = df_municipalities["municipality_clean"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces


In [374]:
df_municipalities[df_municipalities["municipality_clean"].str.contains("/", na=False)]


Unnamed: 0,CPRO,province,CMUN,municipality,municipality_clean,province_clean
7,1,Álava,10,Ayala/Aiara,ayala/aiara,alava
8,1,Álava,11,Baños de Ebro/Mañueta,banosdeebro/manueta,alava
12,1,Álava,17,Campezo/Kanpezu,campezo/kanpezu,alava
16,1,Álava,21,Elburgo/Burgelu,elburgo/burgelu,alava
18,1,Álava,23,Elvillar/Bilar,elvillar/bilar,alava
...,...,...,...,...,...,...
7162,46,Valencia,178,Nàquera/Náquera,nàquera/naquera,valencia
7204,46,Valencia,220,Sagunt/Sagunto,sagunt/sagunto,valencia
7239,46,Valencia,255,Vilallonga/Villalonga,vilallonga/villalonga,valencia
7497,48,Bizkaia,22,Karrantza Harana/Valle de Carranza,karrantzaharana/valledecarranza,bizkaia


In [375]:
#split municipality names into two columns
df_municipalities_coordinates[['mun_1', 'mun_2']] = df_municipalities_coordinates['municipality_clean'].str.split('/', n=1, expand=True)
df_municipalities[['mun_1', 'mun_2']] = df_municipalities['municipality_clean'].str.split('/', n=1, expand=True)

In [376]:
df_1 = df_municipalities.merge(df_municipalities_coordinates, on='mun_1', how='inner')

In [377]:
df_2 = df_municipalities.merge(df_municipalities_coordinates, left_on='mun_1', right_on='mun_2', how='inner')

In [378]:
df_3 = df_municipalities.merge(df_municipalities_coordinates, left_on='mun_2', right_on='mun_1', how='inner')

In [379]:
df = pd.concat([df_1, df_2, df_3]).drop_duplicates()

In [380]:
df.columns

Index(['CPRO', 'province_x', 'CMUN', 'municipality_x', 'municipality_clean_x',
       'province_clean_x', 'mun_1', 'mun_2_x', 'autonomous_community',
       'province_y', 'municipality_y', 'latitude', 'longitude', 'altitude',
       'municipality_clean_y', 'province_clean_y', 'mun_2_y', 'mun_1_x',
       'mun_1_y'],
      dtype='object')

In [381]:
df = df[['CMUN','CPRO', 'province_x',  'municipality_x',
       'province_clean_x', 'autonomous_community',
        'latitude', 'longitude', 'altitude']]

In [382]:
df.rename(columns={
                          "province_x": "province",
                          "municipality_x": "municipality", }, inplace=True)

In [383]:
df.columns

Index(['CMUN', 'CPRO', 'province', 'municipality', 'province_clean_x',
       'autonomous_community', 'latitude', 'longitude', 'altitude'],
      dtype='object')

In [384]:
df_unmatched = df_municipalities[~df_municipalities['municipality'].isin(df['municipality'])]
df_unmatched

Unnamed: 0,CPRO,province,CMUN,municipality,municipality_clean,province_clean,mun_1,mun_2
5,1,Álava,8,Arratzua-Ubarrundia,arratzuaubarrundia,alava,arratzuaubarrundia,
43,1,Álava,58,Legutio,legutio,alava,legutio,
138,3,Alicante,1,"Atzúbia, l'",atzubial',alicante,atzubial',
144,3,Alicante,7,Alcosser,alcosser,alicante,alcosser,
164,3,Alicante,27,Beniardà,beniardà,alicante,beniardà,
...,...,...,...,...,...,...,...,...
7484,48,Bizkaia,9,Arrankudiaga-Zollo,arrankudiagazollo,bizkaia,arrankudiagazollo,
7560,48,Bizkaia,85,Sopela,sopela,bizkaia,sopela,
7588,48,Bizkaia,916,Usansolo,usansolo,bizkaia,usansolo,
7638,49,Zamora,54,Corrales del Vino,corralesdelvino,zamora,corralesdelvino,


In [385]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8072 entries, 0 to 29
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CMUN                  8072 non-null   int64  
 1   CPRO                  8072 non-null   int64  
 2   province              8072 non-null   object 
 3   municipality          8072 non-null   object 
 4   province_clean_x      8072 non-null   object 
 5   autonomous_community  8072 non-null   object 
 6   latitude              8072 non-null   float64
 7   longitude             8072 non-null   float64
 8   altitude              8072 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 630.6+ KB


In [386]:
df.isna().sum()

CMUN                    0
CPRO                    0
province                0
municipality            0
province_clean_x        0
autonomous_community    0
latitude                0
longitude               0
altitude                0
dtype: int64

In [387]:
def get_zeros(stringlength, fill_length):
    if stringlength < fill_length:
        return '0'* (fill_length - stringlength)
    return None


df["CPRO"] = df["CPRO"].astype(str)
df["CMUN"] = df["CMUN"].astype(str)

df["CPRO"] = df["CPRO"].apply(lambda x: '0' + x if len(x) < 2 else x )
df["CMUN"] = df["CMUN"].apply(lambda x: get_zeros(len(x), 3) + x if len(x) < 3 else x)

df['cmun'] = df["CPRO"] + df["CMUN"]
df.cmun = df.cmun.astype("int32")

In [388]:
df

Unnamed: 0,CMUN,CPRO,province,municipality,province_clean_x,autonomous_community,latitude,longitude,altitude,cmun
0,001,01,Álava,Alegría-Dulantzi,alava,País Vasco,42.84149,-2.513507,561.68570,1001
1,002,01,Álava,Amurrio,alava,País Vasco,43.05265,-3.001022,219.69100,1002
2,003,01,Álava,Aramaio,alava,País Vasco,43.05400,-2.566000,381.87970,1003
3,004,01,Álava,Artziniega,alava,País Vasco,43.12220,-3.128209,196.98080,1004
4,006,01,Álava,Armiñón,alava,País Vasco,42.72305,-2.872574,463.58150,1006
...,...,...,...,...,...,...,...,...,...,...
23,901,31,Navarra,Barañáin/Barañain,navarra,Navarra,42.80492,-1.685519,436.32000,31901
24,013,46,Valencia,Alboraia/Alboraya,valencia,Valencia,39.49935,-0.349783,10.95999,46013
26,176,46,Valencia,Montroi/Montroy,valencia,Valencia,39.34037,-0.614546,142.12200,46176
27,178,46,Valencia,Nàquera/Náquera,valencia,Valencia,39.65884,-0.425707,228.70290,46178


In [389]:
df = df.drop(columns=['CPRO', 'CMUN', 'province_clean_x'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8072 entries, 0 to 29
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   province              8072 non-null   object 
 1   municipality          8072 non-null   object 
 2   autonomous_community  8072 non-null   object 
 3   latitude              8072 non-null   float64
 4   longitude             8072 non-null   float64
 5   altitude              8072 non-null   float64
 6   cmun                  8072 non-null   int32  
dtypes: float64(3), int32(1), object(3)
memory usage: 473.0+ KB


In [390]:
df = df[['cmun', 'municipality', 'province',  'autonomous_community','latitude', 'longitude', 'altitude']]

In [391]:
df

Unnamed: 0,cmun,municipality,province,autonomous_community,latitude,longitude,altitude
0,1001,Alegría-Dulantzi,Álava,País Vasco,42.84149,-2.513507,561.68570
1,1002,Amurrio,Álava,País Vasco,43.05265,-3.001022,219.69100
2,1003,Aramaio,Álava,País Vasco,43.05400,-2.566000,381.87970
3,1004,Artziniega,Álava,País Vasco,43.12220,-3.128209,196.98080
4,1006,Armiñón,Álava,País Vasco,42.72305,-2.872574,463.58150
...,...,...,...,...,...,...,...
23,31901,Barañáin/Barañain,Navarra,Navarra,42.80492,-1.685519,436.32000
24,46013,Alboraia/Alboraya,Valencia,Valencia,39.49935,-0.349783,10.95999
26,46176,Montroi/Montroy,Valencia,Valencia,39.34037,-0.614546,142.12200
27,46178,Nàquera/Náquera,Valencia,Valencia,39.65884,-0.425707,228.70290


In [393]:
df.to_csv("../data/processed/filtered_municipalities.csv", index=False)