In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [54]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [55]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [56]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [57]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
4812033,National Total,25 Lleida,25100 Gósol,Females,19 years old,2023,1
2776890,National Total,16 Cuenca,16035 Beteta,Total,9 years old,2022,1
4696908,National Total,24 León,24227 Villaturiel,Females,2 years old,2024,5
4872587,National Total,25 Lleida,25156 Os de Balaguer,Total,61 years old,2021,12
6753084,National Total,37 Salamanca,37148 Garcihernández,Males,68 years old,2024,1
4503824,National Total,24 León,24052 Cebanico,Females,79 years old,2024,1
8700766,National Total,46 Valencia/València,46077 Buñol,Females,40 years old,2022,74
1878767,National Total,09 Burgos,09464 Villatuelda,Total,82 years old,2021,1
8024302,National Total,43 Tarragona,"43152 Torre de l'Espanyol, La",Total,40 years old,2022,5
2881154,National Total,16 Cuenca,16133 Mota del Cuervo,Total,65 years old,2022,70


In [58]:
df_raw.drop(columns=['National Total'], inplace=True)

In [59]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [60]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [61]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4960954,25 Lleida,25247 Vilamòs,Males,19 years old,2022,0
4705637,25 Lleida,25001 Abella de la Conca,Females,42 years old,2023,1
4718550,25 Lleida,25012 Alcoletge,Males,6 years old,2022,28
4736426,25 Lleida,25029 Arbeca,Females,89 years old,2022,9
4794606,25 Lleida,"25081 Espluga Calba, L'",Males,48 years old,2022,0


In [62]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [63]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [64]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348822,48 Bizkaia,48916 NA,Total,78 years old,2022,
9347991,48 Bizkaia,48916 NA,Males,74 years old,2021,
9348726,48 Bizkaia,48916 NA,Total,54 years old,2022,
9348185,48 Bizkaia,48916 NA,Females,21 years old,2023,
9348887,48 Bizkaia,48916 NA,Total,94 years old,2021,
9348269,48 Bizkaia,48916 NA,Females,42 years old,2023,
9348331,48 Bizkaia,48916 NA,Females,57 years old,2021,
9347909,48 Bizkaia,48916 NA,Males,54 years old,2023,
9348457,48 Bizkaia,48916 NA,Females,89 years old,2023,
9348815,48 Bizkaia,48916 NA,Total,76 years old,2021,


In [65]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
6316427,34 Palencia,34084 Herrera de Valdecañas,Females,43 years old,2021,1.0
9648041,49 Zamora,49270 Villavendimio,Females,15 years old,2023,1.0
707244,05 Ávila,05210 San Juan de la Encinilla,Total,44 years old,2024,1.0
9244345,48 Bizkaia,48028 Ea,Females,71 years old,2023,5.0
8191593,44 Teruel,"44118 Ginebrosa, La",Females,43 years old,2023,1.0
9365189,49 Zamora,49014 Arquillinos,Males,90 years old,2023,0.0
9203674,47 Valladolid,47228 Villaverde de Medina,Females,1 year old,2022,5.0
9156315,47 Valladolid,47187 Vega de Ruiponce,Females,95 years old,2021,0.0
2788240,16 Cuenca,16045 Canalejas del Arroyo,Total,93 years old,2024,4.0
3341801,18 Granada,18039 Caniles,Males,69 years old,2023,30.0


In [66]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [67]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [68]:
df_2024.reset_index(drop=True, inplace=True)

In [69]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [70]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [71]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [72]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [73]:
df_2024.head()

Unnamed: 0,province,municipality,sex,age,year,total
1,01 Araba/Álava,01001 Alegría-Dulantzi,Males,0 years old,2024,11
2,01 Araba/Álava,01001 Alegría-Dulantzi,Males,1 year old,2024,9
3,01 Araba/Álava,01001 Alegría-Dulantzi,Males,2 years old,2024,15
4,01 Araba/Álava,01001 Alegría-Dulantzi,Males,3 years old,2024,12
5,01 Araba/Álava,01001 Alegría-Dulantzi,Males,4 years old,2024,9


In [74]:
from scripts.utils import split_column_at
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)

In [75]:
df_2024

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,Araba/Álava,01001
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,Araba/Álava,01001
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,Araba/Álava,01001
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,Araba/Álava,01001
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,Araba/Álava,01001
...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,Melilla,52001
2488388,Melilla,Melilla,Total,97,2024,13,Melilla,52001
2488389,Melilla,Melilla,Total,98,2024,11,Melilla,52001
2488390,Melilla,Melilla,Total,99,2024,3,Melilla,52001


In [77]:
df_2024.drop("cprov", axis = 1, inplace=True)

In [78]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [79]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [80]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
669717,"Coruña,",Padrón,Females,86,2024,36,15065,padron,"coruna,"
600499,Ciudad,Almuradiel,Females,24,2024,1,13016,almuradiel,ciudad
913079,Guadalajara,Luzaga,Total,76,2024,0,19162,luzaga,guadalajara
2154519,Valencia/València,Beniatjar,Total,74,2024,3,46056,beniatjar,valencia/valencia
551646,Cádiz,Torre,Total,29,2024,6,11036,torre,cadiz
2401843,Zaragoza,Alcalá,Males,48,2024,0,50013,alcala,zaragoza
141452,Ávila,Herreros,Males,79,2024,1,5094,herreros,avila
1409191,Murcia,Ricote,Males,60,2024,11,30034,ricote,murcia
18844,Albacete,Ayna,Females,75,2024,9,2011,ayna,albacete
1911905,Soria,Espeja,Males,16,2024,1,42080,espeja,soria


In [25]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [82]:
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [83]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
1077232,Jaén,Baños,Females,9,2024,13,23011,banos,jaen
656762,"Coruña,",Carral,Males,85,2024,17,15021,carral,"coruna,"
1071595,Huesca,Valle,Total,84,2024,5,22901,valle,huesca
1672305,Salamanca,Fresno,Males,14,2024,0,37134,fresno,salamanca
1306561,Lugo,"Pontenova,",Total,42,2024,21,27048,"pontenova,",lugo
1063453,Huesca,Sopeira,Females,0,2024,0,22223,sopeira,huesca
156938,Ávila,Narros,Total,61,2024,0,5148,narros,avila
1229222,Lleida,Torres,Males,19,2024,4,25232,torres,lleida
1010734,Huelva,Villanueva,Males,15,2024,21,21076,villanueva,huelva
1652985,Salamanca,Buenavista,Total,74,2024,1,37060,buenavista,salamanca


In [84]:
df_2024.to_csv("../data/processed/filtered_age.csv", index=False)