In [None]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import numpy as np
import pandas as pd
import plotly.express as px

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with, split_column_at

In [None]:
# Load the source CSV: fields are ';'-separated and ',' is the decimal mark.
df_raw = pd.read_csv(
    "../data/raw/68542.csv",
    sep=";",
    decimal=",",
)

In [None]:
# Preview the first five rows of the raw frame.
df_raw.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df_raw.info()

In [None]:
# Random draw of 15 rows (unseeded, so it differs between runs).
df_raw.sample(n=15)

In [None]:
# Remove the 'National Total' column. The frame is rebound rather than mutated
# in place, and errors="ignore" lets the cell be run again once the column is
# already gone (the in-place drop raised KeyError on a second run).
df_raw = df_raw.drop(columns=['National Total'], errors="ignore")

In [None]:
# Number of missing values in each column.
df_raw.isna().sum()

In [None]:
# Rows with no municipality, then the subset of those in province "08 Barcelona".
municipality_missing = df_raw['Municipalities'].isna()
df_municipalities_null = df_raw.loc[municipality_missing]
df_municipalities_null.loc[df_municipalities_null["Provinces"].eq("08 Barcelona")]

### Some rows report province-level figures with no municipality; these rows should be dropped

In [None]:
# Random draw of 5 rows from province "25 Lleida".
df_raw.loc[df_raw['Provinces'].eq("25 Lleida")].sample(n=5)

In [None]:

# Keep only the two location columns, then show rows where both are missing.
prov_null = df_raw.loc[:, ["Provinces", "Municipalities"]]

both_missing = prov_null["Provinces"].isna() & prov_null["Municipalities"].isna()
prov_null[both_missing]

In [None]:

# Rows whose province is missing, whatever the municipality value.
prov_null.loc[prov_null["Provinces"].isna()]

In [None]:
# Inspect rows whose 'Total' is missing. The sample size is capped at the
# number of such rows: DataFrame.sample raises ValueError when asked for more
# rows than exist (sampling without replacement).
total_missing = df_raw[df_raw['Total'].isnull()]
total_missing.sample(min(30, len(total_missing)))

In [None]:
# Random draw of 20 rows (unseeded, so it differs between runs).
df_raw.sample(n=20)

In [None]:
# Discard every row that lacks a province, a municipality or a total.
required_columns = ["Provinces", "Municipalities", "Total"]
df_raw = df_raw.dropna(subset=required_columns, how="any")

In [None]:
# Independent copy of the rows whose 'Periodo' equals 2024.
is_2024 = df_raw['Periodo'].eq(2024)
df_2024 = df_raw.loc[is_2024].copy()

In [None]:
# Renumber the rows from 0; the old index is discarded.
df_2024 = df_2024.reset_index(drop=True)

In [None]:
# Source column names and, position by position, their lower-case replacements.
idx = pd.Index(["Provinces", "Municipalities", "Sex", "Age", "Periodo", "Total"])
eng = pd.Index(["province", "municipality", "sex", "age", "year", "total"])

column_renames = {old_name: new_name for old_name, new_name in zip(idx, eng)}
df_2024 = df_2024.rename(columns=column_renames)

### Get the correct types

In [None]:
# Strip every literal '.' from the text values, then parse them as int32.
# NOTE(review): presumably '.' is a thousands separator in the source — confirm.
total_digits = df_2024["total"].str.replace(".", "", regex=False)
df_2024["total"] = pd.to_numeric(total_digits).astype("int32")

In [None]:
# Discard the rows whose age is "All ages".
# .copy() makes the result an independent frame: later cells assign columns on
# df_2024, which on a boolean-filtered slice emits SettingWithCopyWarning.
df_2024 = df_2024[df_2024["age"] != "All ages"].copy()

In [None]:
# Boolean mask, True where the age label begins with a digit. It is only
# displayed here and not assigned to anything.
(df_2024["age"].str.contains(r"^\d"))

In [None]:
# Preview the first five rows of the 2024 frame.
df_2024.head(n=5)

In [None]:
# Split the space-separated columns into their pieces.
# NOTE(review): split_column_at comes from scripts.utils (imported in the first
# cell); from its use here it presumably splits each value on the separator and
# returns the piece at `index` — confirm against the helper.
df_2024["age"] = split_column_at(df_2024, "age", " ", index=0)

# The index-0 pieces (cprov, cmun) are taken BEFORE their source column is
# overwritten with its index-1 piece. Previously cprov was read from the
# already-overwritten province column, so it never saw the original value.
df_2024["cprov"] = split_column_at(df_2024, "province", " ", index=0)
df_2024["province"] = split_column_at(df_2024, "province", " ", index=1)
df_2024["cmun"] = split_column_at(df_2024, "municipality", " ", index=0)
df_2024["municipality"] = split_column_at(df_2024, "municipality", " ", index=1)

In [None]:
# Display the frame after the column splits above.
df_2024

In [None]:
# Remove the 'cprov' column. Rebinding with errors="ignore" keeps the cell
# re-runnable; the in-place drop raised KeyError once the column was gone.
df_2024 = df_2024.drop(columns="cprov", errors="ignore")

In [None]:
# Strip any literal '.' from the age text, then parse it as int32.
age_digits = df_2024["age"].str.replace(".", "", regex=False)
df_2024["age"] = pd.to_numeric(age_digits).astype("int32")

In [None]:
# Run the project's AccentCleaner over the 'municipality' and 'province'
# columns of df_2024.
# NOTE(review): later cells read 'municipality_clean' and 'province_clean', so
# cleanAccents() presumably adds those columns to df_2024 in place — confirm in
# scripts/accent_cleaner.
cleaner = AccentCleaner([df_2024], ['municipality', 'province'])
cleaner.cleanAccents()

In [None]:
# Random draw of 10 rows to eyeball the accent cleaning (unseeded).
df_2024.sample(n=10)

In [None]:
# Replace 'province' with the index-0 piece of its current value.
# NOTE(review): 'province' was already reduced to its index-1 piece in an
# earlier cell; taking index 0 of that result may truncate multi-word names —
# confirm this is intended.
df_2024["province"] = split_column_at(df_2024, "province", " ", index=0)

In [None]:
# Strip any literal '.' from the municipality code text and parse it as int32.
# As an integer the code no longer carries any leading zeros.
cmun_digits = df_2024["cmun"].str.replace(".", "", regex=False)
df_2024["cmun"] = pd.to_numeric(cmun_digits).astype("int32")

In [None]:
# Random draw of 10 rows to check the final column types (unseeded).
df_2024.sample(n=10)

In [None]:
# Persist the cleaned 2024 frame without the row index.
filtered_age_path = "../data/large_files/filtered_age.csv"
df_2024.to_csv(filtered_age_path, index=False)

## Grouping


In [None]:
# Reload the cleaned per-age data written by the previous section.
df_ages_bined = pd.read_csv("../data/large_files/filtered_age.csv")

# Only a cell's last expression is rendered, so the shape is printed explicitly;
# as a bare expression in the middle of the cell it was silently discarded.
print(df_ages_bined.shape)
df_ages_bined.head()

In [None]:
# Keep only the rows whose sex is not 'Total'.
# .copy() detaches the result from df_ages_bined so that adding columns to it
# later does not trigger SettingWithCopyWarning.
df_demographics = df_ages_bined.query("sex != 'Total'").copy()

In [None]:
# Number of missing values in each column of the demographics frame.
df_demographics.isna().sum(axis=0)

In [None]:
def assign_age_group(age):
    """Return the age-bracket label for a single age value.

    The brackets are '0-17', '18-24', '25-34', '35-54' and '55+', and each
    upper bound is inclusive. Any value that is not <= 54 falls into '55+';
    that includes NaN, for which every comparison is false.
    """
    inclusive_upper_bounds = (
        (17, '0-17'),
        (24, '18-24'),
        (34, '25-34'),
        (54, '35-54'),
    )
    # Bounds are ascending, so the first one the age fits under is its bracket.
    for upper, label in inclusive_upper_bounds:
        if age <= upper:
            return label
    return '55+'

# Label every row with its age bracket. assign() returns a new frame, so no
# column is written onto a filtered frame (which emits SettingWithCopyWarning).
df_demographics = df_demographics.assign(
    age_group=df_demographics['age'].apply(assign_age_group)
)

In [None]:
# Distinct age-bracket labels present in the data.
df_demographics.loc[:, "age_group"].unique()

In [None]:
# One row per municipality code ('cmun') and one column per age bracket, each
# cell holding the summed 'total'; combinations with no rows become 0.
age_totals_wide = pd.pivot_table(
    df_demographics,
    values='total',
    index='cmun',
    columns='age_group',
    aggfunc='sum',
    fill_value=0,
)
pivot_df = age_totals_wide.reset_index()


In [None]:
age_group_order = ['0-17', '18-24', '25-34', '35-54', '55+']

# Put the bracket columns in a fixed order. reindex also creates any bracket
# that has no rows at all (filled with 0); the previous guard skipped such a
# bracket and the total below then failed with KeyError.
pivot_df = pivot_df.reindex(columns=['cmun'] + age_group_order, fill_value=0)

# assign() returns a new frame, so nothing is written onto a column-selected
# slice, and the cell gives the same result when run again.
pivot_df = pivot_df.assign(total_population=pivot_df[age_group_order].sum(axis=1))

pivot_df

In [None]:
# One row per municipality code ('cmun') and one column per sex label, each
# cell holding the summed 'total'; combinations with no rows become 0.
sex_totals_wide = pd.pivot_table(
    df_demographics,
    values='total',
    index='cmun',
    columns='sex',
    aggfunc='sum',
    fill_value=0,
)
pivot_df_sex = sex_totals_wide.reset_index()

In [None]:
# Distinct sex labels present in the data.
df_demographics.loc[:, "sex"].unique()

In [None]:
sex_group_order = ['Males', 'Females']
sex_column_names = {"Males": "male", "Females": "female"}

# Rename first, then select by the new names. rename() ignores labels that are
# absent, so running the cell a second time (columns already renamed) no longer
# ends in KeyError. A missing sex column still raises KeyError in the selection.
pivot_df_sex = pivot_df_sex.rename(columns=sex_column_names)
pivot_df_sex = pivot_df_sex[['cmun'] + [sex_column_names[grp] for grp in sex_group_order]]

# assign() returns a new frame instead of writing onto a column-selected slice.
pivot_df_sex = pivot_df_sex.assign(total_sex=pivot_df_sex["male"] + pivot_df_sex["female"])
pivot_df_sex

In [None]:
# df_demographics holds many rows per 'cmun' (it carries 'sex' and 'age'
# columns), so merging it as-is repeats each municipality's totals once per
# such row. Only its remaining descriptive columns are needed from here on, so
# the per-row measures are dropped and the rest de-duplicated before the join.
municipality_info = (
    df_demographics
    .drop(columns=["sex", "age", "total", "age_group"])
    .drop_duplicates()
)

# Column order is unchanged: sex totals, then age-bracket totals, then the
# descriptive columns.
df_demographics_combined = pivot_df.merge(municipality_info, on='cmun', how='left')
df_demographics_combined = pivot_df_sex.merge(df_demographics_combined, on='cmun', how='left')

df_demographics_combined


In [None]:
# Drop the columns that are not part of the output table.
unneeded_columns = ["year", "municipality", "province", "total_sex"]
df_demographics_combined = df_demographics_combined.drop(columns=unneeded_columns)



In [None]:
# Display the combined frame after the column drop above.
df_demographics_combined

In [None]:
# Keep exactly the output columns, in their final order.
output_columns = [
    "cmun",
    "municipality_clean",
    "province_clean",
    "0-17",
    "18-24",
    "25-34",
    "35-54",
    "55+",
    "male",
    "female",
    "total_population",
]
df_demographics_combined = df_demographics_combined.loc[:, output_columns]

In [None]:
# Display the combined frame restricted to the output columns.
df_demographics_combined

In [None]:
# Collapse fully identical rows, keeping the first occurrence of each.
df_demographics_combined = df_demographics_combined.drop_duplicates(keep="first")
df_demographics_combined

In [None]:
# Persist the per-municipality demographics table without the row index.
filtered_demographics_path = "../data/processed/filtered_demographics.csv"
df_demographics_combined.to_csv(filtered_demographics_path, index=False)