In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

# First Part: Data Cleanup

In [2]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [3]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [4]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [5]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
4505925,National Total,24 León,24054 Cimanes de la Vega,Males,94 years old,2023,0
6541284,National Total,36 Pontevedra,36026 Marín,Males,56 years old,2024,208
2713877,National Total,"15 Coruña, A",15078 Santiago de Compostela,Males,66 years old,2023,549
8568230,National Total,45 Toledo,45175 Turleque,Males,56 years old,2022,5
7429195,National Total,40 Segovia,40191 Santo Tomé del Puerto,Females,81 years old,2021,1
7089957,National Total,38 Santa Cruz de Tenerife,38047 Tijarafe,Females,34 years old,2023,9
7450742,National Total,40 Segovia,40211 Valdeprados,Males,62 years old,2022,1
6100458,National Total,32 Ourense,"32063 Pobra de Trives, A",Males,9 years old,2022,8
9057200,National Total,47 Valladolid,47102 Nueva Villa de las Torres,Total,1 year old,2024,0
3378730,National Total,18 Granada,18076 Fonelas,Females,19 years old,2022,2


In [6]:
df_raw.drop(columns=['National Total'], inplace=True)

In [7]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [8]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [9]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4969498,25 Lleida,25254 Vilanova de la Barca,Males,13 years old,2022,4
4925898,25 Lleida,25210 Soses,Females,27 years old,2022,3
4784474,25 Lleida,25071 Cava,Total,65 years old,2022,3
4971487,25 Lleida,25255 Vinaixa,Total,0 years old,2021,3
4847437,25 Lleida,25134 Menàrguens,Males,98 years old,2023,0


In [10]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [11]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [12]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348473,48 Bizkaia,48916 NA,Females,93 years old,2023,
9347914,48 Bizkaia,48916 NA,Males,55 years old,2022,
9348282,48 Bizkaia,48916 NA,Females,45 years old,2022,
9348351,48 Bizkaia,48916 NA,Females,62 years old,2021,
9347758,48 Bizkaia,48916 NA,Males,16 years old,2022,
9348706,48 Bizkaia,48916 NA,Total,49 years old,2022,
9348743,48 Bizkaia,48916 NA,Total,58 years old,2021,
9348431,48 Bizkaia,48916 NA,Females,82 years old,2021,
9347722,48 Bizkaia,48916 NA,Males,7 years old,2022,
9348862,48 Bizkaia,48916 NA,Total,88 years old,2022,


In [13]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
812280,06 Badajoz,06027 Calzadilla de los Barros,Females,89 years old,2024,2.0
2600943,14 Córdoba,14061 Santa Eufemia,Total,86 years old,2021,8.0
4496983,24 León,24046 Castrocalbón,Males,0 years old,2021,1.0
6852284,37 Salamanca,37233 Pastores,Males,82 years old,2024,1.0
9442140,49 Zamora,49085 Galende,Males,50 years old,2024,7.0
8394508,45 Toledo,45032 Camarenilla,Males,78 years old,2024,6.0
6761486,37 Salamanca,37156 Guijuelo,Males,26 years old,2022,37.0
6691261,37 Salamanca,37092 Castellanos de Moriscos,Total,14 years old,2023,34.0
1184224,08 Barcelona,08097 Gualba,Females,51 years old,2024,14.0
889284,06 Badajoz,"06090 Nava de Santiago, La",Females,62 years old,2024,14.0


In [14]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [15]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [16]:
df_2024.reset_index(drop=True, inplace=True)

In [17]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [18]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [19]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [20]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [21]:
df_2024.head()

Unnamed: 0,province,municipality,sex,age,year,total
1,01 Araba/Álava,01001 Alegría-Dulantzi,Males,0 years old,2024,11
2,01 Araba/Álava,01001 Alegría-Dulantzi,Males,1 year old,2024,9
3,01 Araba/Álava,01001 Alegría-Dulantzi,Males,2 years old,2024,15
4,01 Araba/Álava,01001 Alegría-Dulantzi,Males,3 years old,2024,12
5,01 Araba/Álava,01001 Alegría-Dulantzi,Males,4 years old,2024,9


In [22]:
from scripts.utils import split_column_at
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)

In [23]:
df_2024

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,Araba/Álava,01001
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,Araba/Álava,01001
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,Araba/Álava,01001
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,Araba/Álava,01001
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,Araba/Álava,01001
...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,Melilla,52001
2488388,Melilla,Melilla,Total,97,2024,13,Melilla,52001
2488389,Melilla,Melilla,Total,98,2024,11,Melilla,52001
2488390,Melilla,Melilla,Total,99,2024,3,Melilla,52001


In [24]:
df_2024.drop("cprov", axis = 1, inplace=True)

In [25]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [26]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [27]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
587094,Castellón/Castelló,Torralba,Females,83,2024,0,12116,torralba,castellon/castello
1200730,Lleida,Lladorre,Total,87,2024,1,25123,lladorre,lleida
326960,Barcelona,Sant,Females,49,2024,63,8209,sant,barcelona
1162577,León,Villamartín,Males,82,2024,0,24213,villamartin,leon
1197336,Lleida,Guissona,Total,59,2024,73,25110,guissona,lleida
774054,Girona,Ger,Females,77,2024,3,17078,ger,girona
578902,Castellón/Castelló,Peníscola/Peñíscola,Total,51,2024,127,12089,peniscola/peniscola,castellon/castello
2311481,Bizkaia,Portugalete,Total,58,2024,738,48078,portugalete,bizkaia
2235704,Valladolid,Corrales,Males,67,2024,2,47056,corrales,valladolid
941772,Guadalajara,Taragudo,Total,5,2024,0,19263,taragudo,guadalajara


In [28]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [29]:
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [30]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
888025,Guadalajara,Campillo,Males,12,2024,0,19060,campillo,guadalajara
393155,Burgos,Fuentemolinos,Total,46,2024,0,9139,fuentemolinos,burgos
1652707,Salamanca,Buenavista,Males,0,2024,1,37060,buenavista,salamanca
138454,Ávila,Gemuño,Females,39,2024,0,5083,gemuno,avila
1670002,Salamanca,Espadaña,Females,57,2024,0,37126,espadana,salamanca
40467,Albacete,Villatoya,Males,74,2024,0,2082,villatoya,albacete
1988342,Tarragona,Santa,Total,55,2024,59,43138,santa,tarragona
1442537,Navarra,Esteribar,Males,52,2024,34,31098,esteribar,navarra
1151380,León,Torre,Total,3,2024,6,24170,torre,leon
260488,"Balears,",Santanyí,Males,81,2024,27,7057,santanyi,"balears,"


In [31]:
df_2024.to_csv("../data/large_files/filtered_age.csv", index=False)

# Second Part: Binning and Pivoting


In [32]:
df_ages_bined = pd.read_csv("../data/large_files/filtered_age.csv")

df_ages_bined.shape
df_ages_bined.head()

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
0,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,1001,alegria-dulantzi,araba/alava
1,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,1001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,1001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,1001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,1001,alegria-dulantzi,araba/alava


In [33]:
df_demographics = df_ages_bined.query("sex != 'Total'")

In [34]:
df_demographics.isna().sum()

province                0
municipality          202
sex                     0
age                     0
year                    0
total                   0
cmun                    0
municipality_clean      0
province_clean          0
dtype: int64

In [35]:
def assign_age_group(age):
    if age <= 17:
        return '0-17'
    elif age <= 24:
        return '18-24'
    elif age <= 34:
        return '25-34'
    elif age <= 54:
        return '35-54'
    else:
        return '55+'

df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)


In [36]:
df_demographics["age_group"].unique()

array(['0-17', '18-24', '25-34', '35-54', '55+'], dtype=object)

In [37]:
# 2. Pivot the table to have one column per age group
#    We group by 'cmun' (or whichever municipality identifier you prefer)
pivot_df = df_demographics.pivot_table(
    index='cmun',
    columns='age_group',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()


In [38]:
age_group_order = ['0-17', '18-24', '25-34', '35-54', '55+']
pivot_df = pivot_df[['cmun'] + [grp for grp in age_group_order if grp in pivot_df.columns]]
pivot_df["total_population"] = pivot_df["0-17"] + pivot_df["18-24"] + pivot_df["25-34"] + pivot_df["35-54"] + pivot_df["55+"]

pivot_df

age_group,cmun,0-17,18-24,25-34,35-54,55+,total_population
0,1001,615,250,268,1005,827,2965
1,1002,1826,588,862,2961,4075,10312
2,1003,218,113,108,349,592,1380
3,1004,352,127,146,576,655,1856
4,1006,42,20,18,96,70,246
...,...,...,...,...,...,...,...
8127,50901,6,8,16,37,103,170
8128,50902,4,2,4,33,42,85
8129,50903,405,224,320,789,1114,2852
8130,51001,18151,8386,10785,23963,21894,83179


In [39]:
pivot_df_sex = df_demographics.pivot_table(
    index='cmun',
    columns='sex',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()

In [40]:
df_demographics["sex"].unique()

array(['Males', 'Females'], dtype=object)

In [41]:
sex_group_order = ['Males', 'Females']

pivot_df_sex = pivot_df_sex[['cmun'] + [grp for grp in sex_group_order if grp in pivot_df_sex.columns]]
pivot_df_sex.rename(columns={"Males": "male", "Females": "female"}, inplace=True)

pivot_df_sex["total_sex"] = pivot_df_sex["male"] + pivot_df_sex["female"]
pivot_df_sex

sex,cmun,male,female,total_sex
0,1001,1525,1440,2965
1,1002,5134,5178,10312
2,1003,709,671,1380
3,1004,914,942,1856
4,1006,127,119,246
...,...,...,...,...
8127,50901,104,66,170
8128,50902,43,42,85
8129,50903,1444,1408,2852
8130,51001,41957,41222,83179


In [42]:
df_demographics_combined = pivot_df.merge(df_demographics, on='cmun', how='left')
df_demographics_combined = pivot_df_sex.merge(df_demographics_combined, on='cmun', how='left')

df_demographics_combined


Unnamed: 0,cmun,male,female,total_sex,0-17,18-24,25-34,35-54,55+,total_population,province,municipality,sex,age,year,total,municipality_clean,province_clean,age_group
0,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,alegria-dulantzi,araba/alava,0-17
1,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,alegria-dulantzi,araba/alava,0-17
2,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,alegria-dulantzi,araba/alava,0-17
3,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,alegria-dulantzi,araba/alava,0-17
4,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,alegria-dulantzi,araba/alava,0-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,96,2024,10,melilla,melilla,55+
1642660,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,97,2024,11,melilla,melilla,55+
1642661,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,98,2024,9,melilla,melilla,55+
1642662,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,99,2024,1,melilla,melilla,55+


In [43]:
df_demographics_combined = df_demographics_combined.drop(["year", "municipality", "province", "total_sex"], axis = 1)



In [44]:
df_demographics_combined

Unnamed: 0,cmun,male,female,0-17,18-24,25-34,35-54,55+,total_population,sex,age,total,municipality_clean,province_clean,age_group
0,1001,1525,1440,615,250,268,1005,827,2965,Males,0,11,alegria-dulantzi,araba/alava,0-17
1,1001,1525,1440,615,250,268,1005,827,2965,Males,1,9,alegria-dulantzi,araba/alava,0-17
2,1001,1525,1440,615,250,268,1005,827,2965,Males,2,15,alegria-dulantzi,araba/alava,0-17
3,1001,1525,1440,615,250,268,1005,827,2965,Males,3,12,alegria-dulantzi,araba/alava,0-17
4,1001,1525,1440,615,250,268,1005,827,2965,Males,4,9,alegria-dulantzi,araba/alava,0-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,96,10,melilla,melilla,55+
1642660,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,97,11,melilla,melilla,55+
1642661,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,98,9,melilla,melilla,55+
1642662,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,99,1,melilla,melilla,55+


In [45]:
df_demographics_combined = df_demographics_combined[["cmun", "municipality_clean", "province_clean", "0-17", "18-24", "25-34", "35-54","55+", "male", "female","total_population"]]

In [46]:
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
1,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
2,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
3,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
4,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642660,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642661,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642662,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985


In [47]:
df_demographics_combined = df_demographics_combined.drop_duplicates()
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
202,1002,amurrio,araba/alava,1826,588,862,2961,4075,5134,5178,10312
404,1003,aramaio,araba/alava,218,113,108,349,592,709,671,1380
606,1004,artziniega,araba/alava,352,127,146,576,655,914,942,1856
808,1006,arminon,araba/alava,42,20,18,96,70,127,119,246
...,...,...,...,...,...,...,...,...,...,...,...
1641654,50901,biel,zaragoza,6,8,16,37,103,104,66,170
1641856,50902,marracos,zaragoza,4,2,4,33,42,43,42,85
1642058,50903,villamayor,zaragoza,405,224,320,789,1114,1444,1408,2852
1642260,51001,ceuta,ceuta,18151,8386,10785,23963,21894,41957,41222,83179


In [48]:
df_demographics_combined.to_csv("../data/processed/filtered_files/filtered_demographics.csv", index=False)