In [1]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [2]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [3]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [5]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
5505388,National Total,28 Madrid,28903 Tres Cantos,Total,60 years old,2024,827
2401910,National Total,13 Ciudad Real,13002 Agudo,Females,2 years old,2022,6
1086210,National Total,08 Barcelona,08017 Balenyà,Females,27 years old,2022,26
2501810,National Total,13 Ciudad Real,13083 Torralba de Calatrava,Total,89 years old,2022,15
3338887,National Total,18 Granada,18036 Cájar,Total,54 years old,2021,96
3402367,National Total,18 Granada,18100 Huétor Tájar,Total,12 years old,2021,123
2960523,National Total,16 Cuenca,16215 Tragacete,Total,17 years old,2021,1
7905103,National Total,43 Tarragona,"43054 Espluga de Francolí, L'",Females,24 years old,2021,19
6990703,National Total,37 Salamanca,"37349 Vídola, La",Females,6 years old,2021,0
9789650,National Total,50 Zaragoza,50115 Fuentes de Ebro,Males,23 years old,2022,28


In [6]:
df_raw.drop(columns=['National Total'], inplace=True)

In [7]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [8]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [9]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4803365,25 Lleida,"25092 Floresta, La",Males,96 years old,2023,0
4767624,25 Lleida,25056 Bovera,Males,35 years old,2024,2
4964600,25 Lleida,25250 Vilanova de Meià,Males,13 years old,2024,1
4898817,25 Lleida,25182 Puigverd de Lleida,Males,91 years old,2023,2
4861248,25 Lleida,25146 Navès,Females,83 years old,2024,0


In [10]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [11]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [12]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348101,48 Bizkaia,48916 NA,Females,0 years old,2023,
9348626,48 Bizkaia,48916 NA,Total,29 years old,2022,
9348211,48 Bizkaia,48916 NA,Females,27 years old,2021,
9347821,48 Bizkaia,48916 NA,Males,32 years old,2023,
9347930,48 Bizkaia,48916 NA,Males,59 years old,2022,
9348383,48 Bizkaia,48916 NA,Females,70 years old,2021,
9348334,48 Bizkaia,48916 NA,Females,58 years old,2022,
9348087,48 Bizkaia,48916 NA,Males,98 years old,2021,
9348091,48 Bizkaia,48916 NA,Males,99 years old,2021,
9348826,48 Bizkaia,48916 NA,Total,79 years old,2022,


In [13]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
7399155,40 Segovia,40163 Puebla de Pedraza,Males,17 years old,2021,0
6636229,37 Salamanca,37039 Bañobárez,Total,26 years old,2023,1
9634188,49 Zamora,49259 Villanueva de las Peras,Males,20 years old,2024,0
4533133,24 León,24078 Gordoncillo,Females,62 years old,2023,2
888915,06 Badajoz,"06090 Nava de Santiago, La",Males,71 years old,2021,5
533001,05 Ávila,05058 Cepeda la Mora,Females,37 years old,2023,1
563001,05 Ávila,05084 Gilbuena,Total,91 years old,2023,1
9458585,49 Zamora,49099 Losacio,Females,81 years old,2023,0
8156525,44 Teruel,44085 Cosa,Total,48 years old,2023,1
2862722,16 Cuenca,16116 Lagunaseca,Total,47 years old,2022,0


In [14]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [15]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [16]:
df_2024.reset_index(drop=True, inplace=True)

In [17]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [18]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [19]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [20]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [21]:
from scripts.utils import split_column_at

df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)

In [22]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [23]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [24]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1853412,Segovia,Veganzones,Total,71,2024,0,40,40222,veganzones,segovia
346098,Barcelona,Sora,Males,11,2024,2,8,8272,sora,barcelona
534515,Cáceres,Valverde,Total,34,2024,4,10,10204,valverde,caceres
280993,Barcelona,Castell,Males,84,2024,0,8,8057,castell,barcelona
270976,Barcelona,Borredà,Females,63,2024,3,8,8024,borredà,barcelona
399382,Burgos,Huérmeces,Males,51,2024,1,9,9172,huermeces,burgos
271229,Barcelona,"Bruc,",Females,10,2024,13,8,8025,"bruc,",barcelona
730864,Cuenca,Santa,Females,33,2024,2,16,16196,santa,cuenca
1161122,León,Villademor,Females,55,2024,1,24,24207,villademor,leon
2467133,Zaragoza,San,Females,58,2024,1,50,50234,san,zaragoza


In [25]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [26]:
df_2024[df_2024['cmun'] == 1902]

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean


In [27]:
df_2024["cmun"] = df_2024.apply(lambda row: str(row["cmun"])[len(str(row["cprov"])):], axis=1)


In [28]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
2289504,Bizkaia,Munitibar-Arbatzegi,Males,11,2024,1,48,7,munitibar-arbatzegi,bizkaia
1368974,Málaga,Algarrobo,Total,31,2024,86,29,5,algarrobo,malaga
499140,Cáceres,Guijo,Males,53,2024,13,10,89,guijo,caceres
59244,Alicante/Alacant,Cocentaina,Females,83,2024,55,3,56,cocentaina,alicante/alacant
1810086,Segovia,Collado,Males,95,2024,0,40,59,collado,segovia
351095,Barcelona,Torrelles,Females,10,2024,10,8,288,torrelles,barcelona
816600,Girona,Vilamalla,Females,89,2024,3,17,226,vilamalla,girona
1460838,Navarra,Lodosa,Total,95,2024,4,31,157,lodosa,navarra
868453,Granada,Nevada,Males,24,2024,4,18,903,nevada,granada
1711348,Salamanca,Rágama,Females,93,2024,0,37,265,ragama,salamanca


In [29]:
df_2024["cprov"] = pd.to_numeric(df_2024["cprov"].str.replace(".", "", regex=False)).astype("int32")
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [30]:
df_2024.sample(20)

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun,municipality_clean,province_clean
1772570,Cantabria,Hermandad,Total,13,2024,6,39,32,hermandad,cantabria
2053571,Teruel,Pozuel,Males,4,2024,0,44,190,pozuel,teruel
1193211,Lleida,Fondarella,Females,14,2024,5,25,93,fondarella,lleida
2033477,Teruel,Gargallo,Females,4,2024,0,44,116,gargallo,teruel
899788,Guadalajara,Escariche,Females,45,2024,0,19,111,escariche,guadalajara
2191075,Valencia/València,Montroi/Montroy,Females,12,2024,14,46,176,montroi/montroy,valencia/valencia
1325209,Madrid,Cobeña,Total,24,2024,98,28,41,cobena,madrid
2436111,Zaragoza,Jarque,Males,44,2024,5,50,130,jarque,zaragoza
322363,Barcelona,Sant,Females,42,2024,293,8,194,sant,barcelona
165997,Ávila,Papatrigo,Females,42,2024,3,5,179,papatrigo,avila


In [31]:
df_2024.to_csv("../data/processed/filtered_age.csv", index=False)