In [22]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

# First Part: Data Cleanup

In [23]:
df_raw = pd.read_csv("../data/raw/68542.csv", sep=";", decimal=",")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df_raw.head()

In [None]:
df_raw.info()

In [None]:
df_raw.sample(15)

In [None]:
df_raw.drop(columns=['National Total'], inplace=True)

In [None]:
df_raw.isnull().sum()

In [None]:
df_municipalities_null = df_raw[df_raw['Municipalities'].isnull()]
df_municipalities_null[df_municipalities_null["Provinces"] == "08 Barcelona"]

### We have information about provinces without the municipalities, we should drop them

In [None]:
df_raw[df_raw['Provinces'] == "25 Lleida"].sample(5)

In [None]:

prov_null = df_raw[["Provinces", "Municipalities"]]

prov_null[(prov_null["Provinces"].isnull()) & (prov_null["Municipalities"].isnull())]

In [None]:

prov_null[(prov_null["Provinces"].isnull())]

In [None]:
df_raw[df_raw['Total'].isnull()].sample(30)

In [None]:
df_raw.sample(20)

In [None]:
df_raw = df_raw.dropna(subset=["Provinces", "Municipalities", "Total"], how="any")

In [None]:
df_2024 = df_raw[df_raw['Periodo'] == 2024].copy()

In [None]:
df_2024.reset_index(drop=True, inplace=True)

In [None]:
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

df_2024.rename(columns=dict(zip(idx, eng)), inplace=True)

### Get the correct types

In [None]:
df_2024["total"] = pd.to_numeric(df_2024["total"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
df_2024 = df_2024[df_2024["age"] != "All ages"]

In [None]:
(df_2024["age"].str.contains(r"^\d"))

In [None]:
df_2024.head()

In [None]:
from scripts.utils import split_column_at
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)

In [None]:
df_2024

In [None]:
df_2024.drop("cprov", axis = 1, inplace=True)

In [None]:
df_2024["age"] = pd.to_numeric(df_2024["age"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [None]:
df_2024.sample(10)

In [None]:
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [None]:
df_2024["cmun"] = pd.to_numeric(df_2024["cmun"].str.replace(".", "", regex=False)).astype("int32")

In [None]:
df_2024.sample(10)

In [None]:
df_2024.to_csv("../data/large_files/filtered_age.csv", index=False)

# Second Part: Binning and Pivoting


In [None]:
df_ages_bined = pd.read_csv("../data/large_files/filtered_age.csv")

df_ages_bined.shape
df_ages_bined.head()

Unnamed: 0,province,municipality,sex,age,year,total,cmun,municipality_clean,province_clean
0,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,1001,alegria-dulantzi,araba/alava
1,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,1001,alegria-dulantzi,araba/alava
2,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,1001,alegria-dulantzi,araba/alava
3,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,1001,alegria-dulantzi,araba/alava
4,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,1001,alegria-dulantzi,araba/alava


In [None]:
df_demographics = df_ages_bined.query("sex != 'Total'")

In [None]:
df_demographics.isna().sum()

province                0
municipality          202
sex                     0
age                     0
year                    0
total                   0
cmun                    0
municipality_clean      0
province_clean          0
dtype: int64

In [None]:
def assign_age_group(age):
    if age <= 17:
        return '0-17'
    elif age <= 24:
        return '18-24'
    elif age <= 34:
        return '25-34'
    elif age <= 54:
        return '35-54'
    else:
        return '55+'

df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_demographics['age_group'] = df_demographics['age'].apply(assign_age_group)


In [None]:
df_demographics["age_group"].unique()

array(['0-17', '18-24', '25-34', '35-54', '55+'], dtype=object)

In [None]:
# 2. Pivot the table to have one column per age group
#    We group by 'cmun' (or whichever municipality identifier you prefer)
pivot_df = df_demographics.pivot_table(
    index='cmun',
    columns='age_group',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()


In [None]:
age_group_order = ['0-17', '18-24', '25-34', '35-54', '55+']
pivot_df = pivot_df[['cmun'] + [grp for grp in age_group_order if grp in pivot_df.columns]]
pivot_df["total_population"] = pivot_df["0-17"] + pivot_df["18-24"] + pivot_df["25-34"] + pivot_df["35-54"] + pivot_df["55+"]

pivot_df

age_group,cmun,0-17,18-24,25-34,35-54,55+,total_population
0,1001,615,250,268,1005,827,2965
1,1002,1826,588,862,2961,4075,10312
2,1003,218,113,108,349,592,1380
3,1004,352,127,146,576,655,1856
4,1006,42,20,18,96,70,246
...,...,...,...,...,...,...,...
8127,50901,6,8,16,37,103,170
8128,50902,4,2,4,33,42,85
8129,50903,405,224,320,789,1114,2852
8130,51001,18151,8386,10785,23963,21894,83179


In [None]:
pivot_df_sex = df_demographics.pivot_table(
    index='cmun',
    columns='sex',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()

In [None]:
df_demographics["sex"].unique()

array(['Males', 'Females'], dtype=object)

In [None]:
sex_group_order = ['Males', 'Females']

pivot_df_sex = pivot_df_sex[['cmun'] + [grp for grp in sex_group_order if grp in pivot_df_sex.columns]]
pivot_df_sex.rename(columns={"Males": "male", "Females": "female"}, inplace=True)

pivot_df_sex["total_sex"] = pivot_df_sex["male"] + pivot_df_sex["female"]
pivot_df_sex

sex,cmun,male,female,total_sex
0,1001,1525,1440,2965
1,1002,5134,5178,10312
2,1003,709,671,1380
3,1004,914,942,1856
4,1006,127,119,246
...,...,...,...,...
8127,50901,104,66,170
8128,50902,43,42,85
8129,50903,1444,1408,2852
8130,51001,41957,41222,83179


In [None]:
df_demographics_combined = pivot_df.merge(df_demographics, on='cmun', how='left')
df_demographics_combined = pivot_df_sex.merge(df_demographics_combined, on='cmun', how='left')

df_demographics_combined


Unnamed: 0,cmun,male,female,total_sex,0-17,18-24,25-34,35-54,55+,total_population,province,municipality,sex,age,year,total,municipality_clean,province_clean,age_group
0,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,0,2024,11,alegria-dulantzi,araba/alava,0-17
1,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,1,2024,9,alegria-dulantzi,araba/alava,0-17
2,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,2,2024,15,alegria-dulantzi,araba/alava,0-17
3,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,3,2024,12,alegria-dulantzi,araba/alava,0-17
4,1001,1525,1440,2965,615,250,268,1005,827,2965,Araba/Álava,Alegría-Dulantzi,Males,4,2024,9,alegria-dulantzi,araba/alava,0-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,96,2024,10,melilla,melilla,55+
1642660,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,97,2024,11,melilla,melilla,55+
1642661,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,98,2024,9,melilla,melilla,55+
1642662,52001,43252,42733,85985,21086,8706,11696,23407,21090,85985,Melilla,Melilla,Females,99,2024,1,melilla,melilla,55+


In [None]:
df_demographics_combined = df_demographics_combined.drop(["year", "municipality", "province", "total_sex"], axis = 1)



In [None]:
df_demographics_combined

Unnamed: 0,cmun,male,female,0-17,18-24,25-34,35-54,55+,total_population,sex,age,total,municipality_clean,province_clean,age_group
0,1001,1525,1440,615,250,268,1005,827,2965,Males,0,11,alegria-dulantzi,araba/alava,0-17
1,1001,1525,1440,615,250,268,1005,827,2965,Males,1,9,alegria-dulantzi,araba/alava,0-17
2,1001,1525,1440,615,250,268,1005,827,2965,Males,2,15,alegria-dulantzi,araba/alava,0-17
3,1001,1525,1440,615,250,268,1005,827,2965,Males,3,12,alegria-dulantzi,araba/alava,0-17
4,1001,1525,1440,615,250,268,1005,827,2965,Males,4,9,alegria-dulantzi,araba/alava,0-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,96,10,melilla,melilla,55+
1642660,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,97,11,melilla,melilla,55+
1642661,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,98,9,melilla,melilla,55+
1642662,52001,43252,42733,21086,8706,11696,23407,21090,85985,Females,99,1,melilla,melilla,55+


In [None]:
df_demographics_combined = df_demographics_combined[["cmun", "municipality_clean", "province_clean", "0-17", "18-24", "25-34", "35-54","55+", "male", "female","total_population"]]

In [None]:
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
1,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
2,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
3,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
4,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
...,...,...,...,...,...,...,...,...,...,...,...
1642659,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642660,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642661,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985
1642662,52001,melilla,melilla,21086,8706,11696,23407,21090,43252,42733,85985


In [None]:
df_demographics_combined = df_demographics_combined.drop_duplicates()
df_demographics_combined

Unnamed: 0,cmun,municipality_clean,province_clean,0-17,18-24,25-34,35-54,55+,male,female,total_population
0,1001,alegria-dulantzi,araba/alava,615,250,268,1005,827,1525,1440,2965
202,1002,amurrio,araba/alava,1826,588,862,2961,4075,5134,5178,10312
404,1003,aramaio,araba/alava,218,113,108,349,592,709,671,1380
606,1004,artziniega,araba/alava,352,127,146,576,655,914,942,1856
808,1006,arminon,araba/alava,42,20,18,96,70,127,119,246
...,...,...,...,...,...,...,...,...,...,...,...
1641654,50901,biel,zaragoza,6,8,16,37,103,104,66,170
1641856,50902,marracos,zaragoza,4,2,4,33,42,43,42,85
1642058,50903,villamayor,zaragoza,405,224,320,789,1114,1444,1408,2852
1642260,51001,ceuta,ceuta,18151,8386,10785,23963,21894,41957,41222,83179


In [None]:
df_demographics_combined.to_csv("../data/processed/filtered_demographics.csv", index=False)