In [243]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [244]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [245]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [246]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [247]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
1054694,National Total,"07 Balears, Illes","07059 Salines, Ses",Total,2 years old,2022,40
7075107,National Total,38 Santa Cruz de Tenerife,38035 San Miguel de Abona,Males,95 years old,2021,1
9718529,National Total,50 Zaragoza,50053 Boquiñeni,Total,93 years old,2023,2
5913084,National Total,31 Navarra,31182 Nazar,Total,86 years old,2024,1
9814438,National Total,50 Zaragoza,50136 Lécera,Males,100 years or more,2022,2
1056186,National Total,"07 Balears, Illes",07060 Sineu,Total,69 years old,2022,45
7710010,National Total,42 Soria,42088 Fuentelmonge,Males,7 years old,2022,0
8224388,National Total,44 Teruel,44148 Mezquita de Jarque,Males,82 years old,2024,0
5015849,National Total,"26 Rioja, La",26022 Azofra,Total,75 years old,2023,4
5962866,National Total,31 Navarra,31224 Sesma,Females,87 years old,2022,5


In [248]:
df_raw.drop(columns=['National Total'], inplace=True)

In [249]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [250]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [251]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4799189,25 Lleida,25086 Esterri d'Àneu,Total,72 years old,2023,10
4745293,25 Lleida,25036 Aspa,Total,62 years old,2023,2
4769566,25 Lleida,"25057 Bòrdes, Es",Total,10 years old,2022,1
4707036,25 Lleida,25002 Àger,Females,86 years old,2024,0
4770702,25 Lleida,"25058 Borges Blanques, Les",Females,90 years old,2022,16


In [252]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [253]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [254]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348867,48 Bizkaia,48916 NA,Total,89 years old,2021,
9347794,48 Bizkaia,48916 NA,Males,25 years old,2022,
9347707,48 Bizkaia,48916 NA,Males,3 years old,2021,
9347897,48 Bizkaia,48916 NA,Males,51 years old,2023,
9348126,48 Bizkaia,48916 NA,Females,6 years old,2022,
9347975,48 Bizkaia,48916 NA,Males,70 years old,2021,
9348211,48 Bizkaia,48916 NA,Females,27 years old,2021,
9348526,48 Bizkaia,48916 NA,Total,4 years old,2022,
9348614,48 Bizkaia,48916 NA,Total,26 years old,2022,
9348702,48 Bizkaia,48916 NA,Total,48 years old,2022,


In [255]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1782859,09 Burgos,09362 Sarracín,Females,75 years old,2021,1.0
1021396,"07 Balears, Illes",07032 Maó,Females,42 years old,2024,247.0
6440152,34 Palencia,34223 Villamoronta,Females,69 years old,2024,1.0
8754572,46 Valencia/València,46121 Estubeny,Females,28 years old,2024,0.0
2116100,10 Cáceres,10175 Serradilla,Total,52 years old,2024,21.0
4389143,23 Jaén,23056 Lopera,Total,70 years old,2021,39.0
2492837,13 Ciudad Real,13076 Santa Cruz de los Cáñamos,Females,90 years old,2023,2.0
4530631,24 León,24076 Garrafe de Torío,Females,48 years old,2021,13.0
4575092,24 León,"24114 Pola de Gordón, La",Total,46 years old,2024,30.0
3555473,19 Guadalajara,19041 Armuña de Tajuña,Total,39 years old,2023,1.0


In [256]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [257]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [258]:
df_2024.reset_index(drop=True, inplace=True)

In [259]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [260]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [261]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [262]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [263]:
from scripts.utils import split_column_at

df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)

In [264]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [265]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [266]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
2110082,Toledo,Noez,Total,7,2024,10,45,45116,noez,toledo
175293,Ávila,San,Total,56,2024,9,5,5211,san,avila
1475053,Navarra,Piedramillera,Females,30,2024,0,31,31204,piedramillera,navarra
931924,Guadalajara,Puebla,Females,51,2024,0,19,19229,puebla,guadalajara
796887,Girona,Sant,Males,62,2024,144,17,17160,sant,girona
1176747,Lleida,Arres,Females,74,2024,0,25,25031,arres,lleida
281420,Barcelona,Castellet,Total,1,2024,19,8,8058,castellet,barcelona
100067,Almería,Huércal-Overa,Males,4,2024,80,4,4053,huercal-overa,almeria
1398464,Málaga,Montecorto,Males,43,2024,5,29,29903,montecorto,malaga
1355808,Madrid,Torrejón,Total,23,2024,1470,28,28148,torrejon,madrid


In [267]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [268]:
df_2024[df_2024['cmun'] == 1902]

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean


In [269]:
df_2024

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,01,01001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,01,01001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,01,01001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,01,01001,alegria-dulantzi,araba/alava
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,01,01001,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,52,52001,melilla,melilla
2488388,Melilla,Melilla,Total,97,2024,13,52,52001,melilla,melilla
2488389,Melilla,Melilla,Total,98,2024,11,52,52001,melilla,melilla
2488390,Melilla,Melilla,Total,99,2024,3,52,52001,melilla,melilla


In [270]:
df_2024["cmun"] = df_2024.apply(lambda row: str(row["cmun"])[len(str(row["cprov"])):], axis=1)


In [271]:
df_2024

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,01,001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,01,001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,01,001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,01,001,alegria-dulantzi,araba/alava
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,01,001,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,52,001,melilla,melilla
2488388,Melilla,Melilla,Total,97,2024,13,52,001,melilla,melilla
2488389,Melilla,Melilla,Total,98,2024,11,52,001,melilla,melilla
2488390,Melilla,Melilla,Total,99,2024,3,52,001,melilla,melilla


In [272]:
df_2024["cprov"] = pd.to_numeric(df_2024["cprov"].str.replace(".", "", regex=False)).astype("int32")
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [286]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
2190388,Valencia/València,Montesa,Males,39,2024,6,46,174,montesa,valencia/valencia
1842742,Segovia,San,Males,9,2024,1,40,184,san,segovia
978827,Gipuzkoa,Oñati,Total,34,2024,105,20,59,onati,gipuzkoa
967072,Gipuzkoa,Belauntza,Females,9,2024,2,20,21,belauntza,gipuzkoa
698799,Cuenca,Castillo-Albaráñez,Females,98,2024,0,16,71,castillo-albaranez,cuenca
418236,Burgos,Pampliega,Total,35,2024,1,9,250,pampliega,burgos
969434,Gipuzkoa,Deba,Males,25,2024,29,20,29,deba,gipuzkoa
1734241,Salamanca,Vallejera,Females,36,2024,0,37,343,vallejera,salamanca
1083364,Jaén,Fuensanta,Females,21,2024,13,23,34,fuensanta,jaen
312456,Barcelona,Perafita,Males,29,2024,2,8,160,perafita,barcelona


In [287]:
df_2024.to_csv("../data/processed/filtered_age.csv", index=False)