In [38]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [39]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [40]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [41]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [42]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
8843165,National Total,46 Valencia/València,46193 Picanya,Total,42 years old,2023,177
5596441,National Total,29 Málaga,29074 Montejaque,Males,77 years old,2023,6
9617003,National Total,49 Zamora,49243 Villaferrueña,Males,7 years old,2021,0
8460555,National Total,45 Toledo,45087 Madridejos,Males,65 years old,2021,65
7296579,National Total,40 Segovia,40065 Chañe,Males,77 years old,2021,5
6928433,National Total,37 Salamanca,37296 Santiago de la Puebla,Females,45 years old,2023,0
137814,National Total,02 Albacete,02059 Peñascosa,Females,78 years old,2022,0
565806,National Total,05 Ávila,05087 Gotarrendura,Males,78 years old,2022,1
139554,National Total,02 Albacete,02061 Pétrola,Males,3 years old,2022,0
9035082,National Total,47 Valladolid,47084 Mayorga,Females,81 years old,2022,8


In [43]:
df_raw.drop(columns=['National Total'], inplace=True)

In [44]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [45]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [46]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4920924,25 Lleida,"25206 Soleràs, El",Females,8 years old,2024,3
4837470,25 Lleida,25126 Llavorsí,Males,54 years old,2022,5
4934366,25 Lleida,25219 Tarroja de Segarra,Females,2 years old,2022,0
4876549,25 Lleida,"25163 Coma i la Pedra, La",Males,32 years old,2023,1
4880512,25 Lleida,25166 Pinell de Solsonès,Females,3 years old,2024,1


In [47]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [48]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [49]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348551,48 Bizkaia,48916 NA,Total,10 years old,2021,
9348693,48 Bizkaia,48916 NA,Total,46 years old,2023,
9348506,48 Bizkaia,48916 NA,Total,All ages,2022,
9348455,48 Bizkaia,48916 NA,Females,88 years old,2021,
9348790,48 Bizkaia,48916 NA,Total,70 years old,2022,
9348626,48 Bizkaia,48916 NA,Total,29 years old,2022,
9348511,48 Bizkaia,48916 NA,Total,0 years old,2021,
9347773,48 Bizkaia,48916 NA,Males,20 years old,2023,
9348871,48 Bizkaia,48916 NA,Total,90 years old,2021,
9348431,48 Bizkaia,48916 NA,Females,82 years old,2021,


In [50]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
3750243,19 Guadalajara,19227 Prados Redondos,Total,77 years old,2021,1
2579983,14 Córdoba,14044 Monturque,Total,48 years old,2021,30
5035838,"26 Rioja, La",26039 Canillas de Río Tuerto,Males,74 years old,2022,1
8260704,44 Teruel,44180 Peracense,Total,83 years old,2024,2
700305,05 Ávila,05205 Sanchorreja,Males,43 years old,2023,0
3680810,19 Guadalajara,19167 Malaguilla,Males,59 years old,2022,0
108152,02 Albacete,"02035 Gineta, La",Females,7 years old,2024,20
3129156,17 Girona,17087 Juià,Females,50 years old,2024,6
5480904,28 Madrid,28166 Valverde de Alcalá,Total,59 years old,2024,9
2309835,12 Castellón/Castelló,12071 Jérica,Males,35 years old,2021,5


In [51]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [52]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [53]:
df_2024.reset_index(drop=True, inplace=True)

In [54]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [55]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [56]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [57]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [58]:
from scripts.utils import split_column_at

df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)

In [59]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [60]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [69]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
865444,Granada,Vélez,Males,75,2024,21,18,184,velez,granada
1519816,Ourense,Sarreaus,Total,15,2024,3,32,78,sarreaus,ourense
1239380,"Rioja,",Agoncillo,Males,79,2024,5,26,2,agoncillo,"rioja,"
1721244,Salamanca,Santibáñez,Total,95,2024,0,37,298,santibanez,salamanca
2252630,Valladolid,Peñafiel,Females,61,2024,50,47,114,penafiel,valladolid
792055,Girona,Rabós,Females,24,2024,0,17,143,rabos,girona
947935,Guadalajara,Tórtola,Total,48,2024,24,19,286,tortola,guadalajara
2221460,Valladolid,Amusquillo,Total,1,2024,1,47,9,amusquillo,valladolid
1508245,Ourense,Leiro,Total,72,2024,36,32,40,leiro,ourense
413449,Burgos,Navas,Males,42,2024,0,9,230,navas,burgos


In [62]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [63]:
df_2024[df_2024['cmun'] == 1902]

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean


In [None]:
df_2024['mu']

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,01,01001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,01,01001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,01,01001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,01,01001,alegria-dulantzi,araba/alava
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,01,01001,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,52,52001,melilla,melilla
2488388,Melilla,Melilla,Total,97,2024,13,52,52001,melilla,melilla
2488389,Melilla,Melilla,Total,98,2024,11,52,52001,melilla,melilla
2488390,Melilla,Melilla,Total,99,2024,3,52,52001,melilla,melilla


In [65]:
df_2024["cmun"] = df_2024.apply(lambda row: str(row["cmun"])[len(str(row["cprov"])):], axis=1)


In [None]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,01,001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,01,001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,01,001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,01,001,alegria-dulantzi,araba/alava
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,01,001,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,52,001,melilla,melilla
2488388,Melilla,Melilla,Total,97,2024,13,52,001,melilla,melilla
2488389,Melilla,Melilla,Total,98,2024,11,52,001,melilla,melilla
2488390,Melilla,Melilla,Total,99,2024,3,52,001,melilla,melilla


In [34]:
df_2024["cprov"] = pd.to_numeric(df_2024["cprov"].str.replace(".", "", regex=False)).astype("int32")
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [35]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1099243,Jaén,Torres,Males,90,2024,4,23,91,torres,jaen
451045,Burgos,Valdeande,Males,0,2024,1,9,403,valdeande,burgos
393434,Burgos,Fuentenebro,Total,19,2024,0,9,140,fuentenebro,burgos
1213462,Lleida,Bellaguarda,Females,69,2024,2,25,170,bellaguarda,lleida
848966,Granada,Lanteira,Females,19,2024,3,18,117,lanteira,granada
1805981,Segovia,Carrascal,Total,70,2024,3,40,44,carrascal,segovia
2080617,Toledo,Bargas,Females,20,2024,53,45,19,bargas,toledo
2369332,Zamora,Quintanilla,Total,75,2024,3,49,168,quintanilla,zamora
4886,Araba/Álava,Kuartango,Total,91,2024,0,1,20,kuartango,araba/alava
1592318,Palencia,Valbuena,Females,97,2024,0,34,186,valbuena,palencia


In [None]:
df_2024.to_csv("../data/processed/filtered_age.csv", index=False)