In [3]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [113]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

In [114]:
df_raw.head()

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
0,National Total,,,Males,All ages,2024,23.826.871
1,National Total,,,Males,All ages,2023,23.565.593
2,National Total,,,Males,All ages,2022,23.288.747
3,National Total,,,Males,All ages,2021,23.248.611
4,National Total,,,Males,0 years old,2024,164.763


In [115]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10018440 entries, 0 to 10018439
Data columns (total 7 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   National Total  object
 1   Provinces       object
 2   Municipalities  object
 3   Sex             object
 4   Age             object
 5   Periodo         int64 
 6   Total           object
dtypes: int64(1), object(6)
memory usage: 535.0+ MB


In [116]:
df_raw.sample(15)

Unnamed: 0,National Total,Provinces,Municipalities,Sex,Age,Periodo,Total
9483284,National Total,49 Zamora,49120 Molezuelas de la Carballeda,Total,34 years old,2024,0.0
9241808,National Total,48 Bizkaia,48026 Dima,Females,49 years old,2024,8.0
7638194,National Total,42 Soria,42022 Almenar de Soria,Females,5 years old,2022,0.0
1306208,National Total,08 Barcelona,08199 Sant Bartomeu del Grau,Males,49 years old,2024,13.0
7777727,National Total,42 Soria,42161 Salduero,Females,4 years old,2021,0.0
215452,National Total,03 Alicante/Alacant,03035 Benilloba,Males,6 years old,2024,2.0
7025675,National Total,37 Salamanca,37377 Yecla de Yeltes,Total,79 years old,2021,3.0
1019650,National Total,"07 Balears, Illes",07031 Llucmajor,Males,13 years old,2022,259.0
2323783,National Total,12 Castellón/Castelló,12082 Nules,Females,54 years old,2021,83.0
6287001,National Total,34 Palencia,34056 Cervera de Pisuerga,Females,31 years old,2023,5.0


In [117]:
df_raw.drop(columns=['National Total'], inplace=True)

In [118]:
df_raw.isnull().sum()

Provinces          1224
Municipalities    64872
Sex                   0
Age                   0
Periodo               0
Total               918
dtype: int64

In [119]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
1064880,08 Barcelona,,Males,All ages,2024,2.870.721
1064881,08 Barcelona,,Males,All ages,2023,2.830.260
1064882,08 Barcelona,,Males,All ages,2022,2.783.698
1064883,08 Barcelona,,Males,All ages,2021,2.785.890
1064884,08 Barcelona,,Males,0 years old,2024,20.185
...,...,...,...,...,...,...
1066099,08 Barcelona,,Total,99 years old,2021,1.096
1066100,08 Barcelona,,Total,100 years or more,2024,1.982
1066101,08 Barcelona,,Total,100 years or more,2023,1.807
1066102,08 Barcelona,,Total,100 years or more,2022,1.707


### We have information about provinces without the municipalities, we should drop them

In [120]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
4869691,25 Lleida,"25154 Omells de na Gaia, Els",Females,51 years old,2021,3
4798328,25 Lleida,25086 Esterri d'Àneu,Males,61 years old,2024,6
4826240,25 Lleida,25115 Isona i Conca Dellà,Males,1 year old,2024,4
4953894,25 Lleida,"25239 Valls de Valira, Les",Males,90 years old,2022,0
4833806,25 Lleida,25123 Lladorre,Males,56 years old,2022,1


In [121]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [122]:

prov_null[(prov_null["Provinces"].isnull())]

Unnamed: 0,Provinces,Municipalities
0,,
1,,
2,,
3,,
4,,
...,...,...
1219,,
1220,,
1221,,
1222,,


In [123]:
df_raw[df_raw['Total'].isnull()].sample(30)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
9348363,48 Bizkaia,48916 NA,Females,65 years old,2021,
9348491,48 Bizkaia,48916 NA,Females,97 years old,2021,
9348339,48 Bizkaia,48916 NA,Females,59 years old,2021,
9348895,48 Bizkaia,48916 NA,Total,96 years old,2021,
9348353,48 Bizkaia,48916 NA,Females,63 years old,2023,
9348497,48 Bizkaia,48916 NA,Females,99 years old,2023,
9348035,48 Bizkaia,48916 NA,Males,85 years old,2021,
9347777,48 Bizkaia,48916 NA,Males,21 years old,2023,
9348830,48 Bizkaia,48916 NA,Total,80 years old,2022,
9347994,48 Bizkaia,48916 NA,Males,75 years old,2022,


In [124]:
df_raw.sample(20)

Unnamed: 0,Provinces,Municipalities,Sex,Age,Periodo,Total
7929752,43 Tarragona,43074 Llorenç del Penedès,Females,67 years old,2024,14
5478027,28 Madrid,28164 Valdetorres de Jarama,Females,53 years old,2021,43
5236424,27 Lugo,27028 Lugo,Males,37 years old,2024,526
1086440,08 Barcelona,08017 Balenyà,Females,85 years old,2024,6
2044382,10 Cáceres,10117 Marchagaz,Males,74 years old,2022,2
2836799,16 Cuenca,16093 Garaballa,Females,94 years old,2021,0
7256283,40 Segovia,40028 Bercial,Females,All ages,2021,48
5563498,29 Málaga,29047 Cuevas Bajas,Females,1 year old,2022,8
2362407,12 Castellón/Castelló,"12115 Toro, El",Males,20 years old,2021,0
2874347,16 Cuenca,16128 Monreal del Llano,Males,97 years old,2021,0


In [125]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

KeyboardInterrupt: 

In [None]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [None]:
df_2024.reset_index(drop=True, inplace=True)

In [None]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [None]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [None]:
(df_2024["age"].str.contains(r"^\d"))

1          True
2          True
3          True
4          True
5          True
           ... 
2488387    True
2488388    True
2488389    True
2488390    True
2488391    True
Name: age, Length: 2463996, dtype: bool

In [None]:
df_2024.head()

Unnamed: 0,province,municipality,sex,age,year,total
1,01 Araba/Álava,01001 Alegría-Dulantzi,Males,0 years old,2024,11
2,01 Araba/Álava,01001 Alegría-Dulantzi,Males,1 year old,2024,9
3,01 Araba/Álava,01001 Alegría-Dulantzi,Males,2 years old,2024,15
4,01 Araba/Álava,01001 Alegría-Dulantzi,Males,3 years old,2024,12
5,01 Araba/Álava,01001 Alegría-Dulantzi,Males,4 years old,2024,9


In [None]:
from scripts.utils import split_column_at
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)

In [None]:
df_2024

Unnamed: 0,province,municipality,sex,age,year,total,cprov,cmun
1,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,Araba/Álava,01001
2,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,Araba/Álava,01001
3,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,Araba/Álava,01001
4,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,Araba/Álava,01001
5,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,Araba/Álava,01001
...,...,...,...,...,...,...,...,...
2488387,Melilla,Melilla,Total,96,2024,19,Melilla,52001
2488388,Melilla,Melilla,Total,97,2024,13,Melilla,52001
2488389,Melilla,Melilla,Total,98,2024,11,Melilla,52001
2488390,Melilla,Melilla,Total,99,2024,3,Melilla,52001


In [None]:
df_2024.drop("cprov", axis = 1, inplace=True)

In [None]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [None]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
725780,Cuenca,Valle,Total,49,2024,2,16173,valle,cuenca
2369008,Zamora,Pueblica,Total,57,2024,1,49167,pueblica,zamora
1303350,Lugo,Ourol,Males,95,2024,3,27038,ourol,lugo
1969408,Tarragona,Margalef,Total,93,2024,2,43075,margalef,tarragona
1123127,León,"Ercina,",Females,4,2024,0,24068,"ercina,",leon
542569,Cádiz,Barbate,Males,30,2024,131,11007,barbate,cadiz
1965611,Tarragona,"Galera,",Females,70,2024,8,43063,"galera,",tarragona
489524,Cáceres,Casas,Total,25,2024,3,10056,casas,caceres
2039052,Teruel,Libros,Females,71,2024,0,44135,libros,teruel
1575715,Palencia,Olmos,Females,18,2024,1,34114,olmos,palencia


In [None]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [None]:
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
df_2024.sample(10)

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
1166710,León,Villaturiel,Total,33,2024,17,24227,villaturiel,leon
2294260,Bizkaia,Karrantza,Females,75,2024,22,48022,karrantza,bizkaia
184056,Ávila,"Tiemblo,",Females,47,2024,23,5241,"tiemblo,",avila
347791,Barcelona,Talamanca,Females,72,2024,3,8277,talamanca,barcelona
1917660,Soria,Hinojosa,Total,59,2024,1,42100,hinojosa,soria
1685783,Salamanca,Mancera,Males,28,2024,0,37179,mancera,salamanca
304214,Barcelona,Montmaneu,Males,49,2024,1,8133,montmaneu,barcelona
1283621,"Rioja,",Torrecilla,Total,52,2024,0,26152,torrecilla,"rioja,"
362573,Burgos,Anguix,Total,64,2024,1,9017,anguix,burgos
1032061,Huesca,Canfranc,Total,24,2024,10,22078,canfranc,huesca


## Grouping


In [107]:
df_ages_bined = pd.read_csv("../data/large_files/filtered_age.csv")

In [109]:
df_demographics = df_ages_bined.query("sex != 'Total'")

In [110]:
df_demographics.isna().sum()

province                0
municipality          202
sex                     0
age                     0
year                    0
total                   0
cmun                    0
municipality_clean      0
province_clean          0
dtype: int64

In [111]:
def assign_age_group(age):
    if age <= 17:
        return '0-17'
    elif age <= 24:
        return '18-24'
    elif age <= 34:
        return '25-34'
    elif age <= 54:
        return '35-54'
    else:
        return '55+'

df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)


In [112]:
df_demographics["age_group"].unique()

array(['0-17', '18-24', '25-34', '35-54', '55+'], dtype=object)

In [113]:
# 2. Pivot the table to have one column per age group
#    We group by 'cmun' (or whichever municipality identifier you prefer)
pivot_df = df_demographics.pivot_table(
    index='cmun',
    columns='age_group',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()


In [114]:
age_group_order = ['0-17', '18-24', '25-34', '35-54', '55+']
pivot_df = pivot_df[['cmun'] + [grp for grp in age_group_order if grp in pivot_df.columns]]
pivot_df["total_population"] = pivot_df["0-17"] + pivot_df["18-24"] + pivot_df["25-34"] + pivot_df["35-54"] + pivot_df["55+"]

pivot_df

age_group,cmun,0-17,18-24,25-34,35-54,55+,total_population
0,1001,615,250,268,1005,827,2965
1,1002,1826,588,862,2961,4075,10312
2,1003,218,113,108,349,592,1380
3,1004,352,127,146,576,655,1856
4,1006,42,20,18,96,70,246
...,...,...,...,...,...,...,...
8127,50901,6,8,16,37,103,170
8128,50902,4,2,4,33,42,85
8129,50903,405,224,320,789,1114,2852
8130,51001,18151,8386,10785,23963,21894,83179


In [115]:
pivot_df_sex = df_demographics.pivot_table(
    index='cmun',
    columns='sex',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()

In [116]:
df_demographics["sex"].unique()

array(['Males', 'Females'], dtype=object)

In [117]:
sex_group_order = ['Males', 'Females']

pivot_df_sex = pivot_df_sex[['cmun'] + [grp for grp in sex_group_order if grp in pivot_df_sex.columns]]
pivot_df_sex.rename(columns={"Males": "male", "Females": "female"}, inplace=True)

pivot_df_sex["total_sex"] = pivot_df_sex["male"] + pivot_df_sex["female"]
pivot_df_sex

sex,cmun,male,female,total_sex
0,1001,1525,1440,2965
1,1002,5134,5178,10312
2,1003,709,671,1380
3,1004,914,942,1856
4,1006,127,119,246
...,...,...,...,...
8127,50901,104,66,170
8128,50902,43,42,85
8129,50903,1444,1408,2852
8130,51001,41957,41222,83179


In [120]:
df_demographics_combined = pivot_df.merge(df_demographics, on='cmun', how='left')
df_demographics_combined = pivot_df_sex.merge(df_demographics_combined, on='cmun', how='left')

df_demographics_combined


Unnamed: 0,cmun,male,female,total_sex,0-17,18-24,25-34,35-54,55+,total_population,province,municipality,year,municipality_clean,province_clean
0,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,2024,alegria-dulantzi,araba/alava
1,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,2024,alegria-dulantzi,araba/alava
2,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,2024,alegria-dulantzi,araba/alava
3,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,2024,alegria-dulantzi,araba/alava
4,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,2024,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,2024,melilla,melilla
1642660,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,2024,melilla,melilla
1642661,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,2024,melilla,melilla
1642662,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,2024,melilla,melilla


In [None]:
df_demographics_combined = df_demographics_combined.drop(["year", "municipality", "province", "total_sex"], axis = 1)



In [124]:
df_demographics_combined

Unnamed: 0,cmun,male,female,0-17,18-24,25-34,35-54,55+,total_population,municipality_clean,province_clean
0,1001,1525,1440,615,250,268,1005,827,2965,alegria-dulantzi,araba/alava
1,1001,1525,1440,615,250,268,1005,827,2965,alegria-dulantzi,araba/alava
2,1001,1525,1440,615,250,268,1005,827,2965,alegria-dulantzi,araba/alava
3,1001,1525,1440,615,250,268,1005,827,2965,alegria-dulantzi,araba/alava
4,1001,1525,1440,615,250,268,1005,827,2965,alegria-dulantzi,araba/alava
...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,21086,8706,11696,23407,21090,85985,melilla,melilla
1642660,52001,43252,42733,21086,8706,11696,23407,21090,85985,melilla,melilla
1642661,52001,43252,42733,21086,8706,11696,23407,21090,85985,melilla,melilla
1642662,52001,43252,42733,21086,8706,11696,23407,21090,85985,melilla,melilla


In [125]:
df_demographics_combined = df_demographics_combined[["cmun", "municipality_clean", "province_clean", "0-17", "18-24", "25-34", "35-54","55+", "male", "female","total_population"]]

In [126]:
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
1,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
2,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
3,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
4,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642660,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642661,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642662,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985


In [127]:
df_demographics_combined = df_demographics_combined.drop_duplicates()
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
202,1002,amurrio,araba/alava,1826,588,862,2961,4075,5134,5178,10312
404,1003,aramaio,araba/alava,218,113,108,349,592,709,671,1380
606,1004,artziniega,araba/alava,352,127,146,576,655,914,942,1856
808,1006,arminon,araba/alava,42,20,18,96,70,127,119,246
...,...,...,...,...,...,...,...,...,...,...,...
1641654,50901,biel,zaragoza,6,8,16,37,103,104,66,170
1641856,50902,marracos,zaragoza,4,2,4,33,42,43,42,85
1642058,50903,villamayor,zaragoza,405,224,320,789,1114,1444,1408,2852
1642260,51001,ceuta,ceuta,18151,8386,10785,23963,21894,41957,41222,83179


In [128]:
df_demographics_combined.to_csv("../data/processed/filtered_demographics.csv", index=False)