In [None]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [None]:
# Load the raw industry extract; the file is semicolon-delimited.
raw_industry_path = "../data/raw/industry.csv"
df_industry = pd.read_csv(raw_industry_path, delimiter=";")

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df_industry.info()

In [None]:
# Count missing values per column in the raw data.
df_industry.isna().sum()

In [None]:
# Replace the Spanish source headers with English snake_case names.
column_names = {
    "Totales Territoriales": "territory_total",
    "Provincias": "province",
    "Municipios": "municipality",
    "Grupos CNAE": "sector",
    "Periodo": "period",
    "Total": "total",
}
df_industry = df_industry.rename(columns=column_names)

In [None]:
# Display the frame after renaming the columns.
df_industry

In [None]:
# Keep only the rows for the 2024 period.
is_2024 = df_industry["period"] == 2024
df_industry = df_industry[is_2024]

In [None]:
# Re-check dtypes and non-null counts after filtering to 2024.
df_industry.info()

In [None]:
# Count missing values per column after the period filter.
df_industry.isna().sum()

In [None]:
# List the distinct values of territory_total (the column is dropped in the next cell).
df_industry["territory_total"].unique()

In [None]:
# territory_total and period are no longer needed once the data is
# restricted to a single period.
unused_columns = ["territory_total", "period"]
df_industry = df_industry.drop(columns=unused_columns)

In [None]:
# List the distinct sector labels present in the data.
df_industry["sector"].unique()

In [None]:
# Drop the 'Total servicios' rows.
is_services_total = df_industry["sector"] == "Total servicios"
df_industry = df_industry[~is_services_total]

In [None]:
# Drop the 'Total' rows.
is_grand_total = df_industry["sector"] == "Total"
df_industry = df_industry[~is_grand_total]

In [None]:
# Lookup from the raw Spanish sector label to the English n_* name that the
# sector column is mapped to in the following cell.
# The 'n_eduation_health_social' spelling is kept as-is because
# industry_group_order later refers to the column by the same spelling.
# NOTE(review): three labels carry no CNAE letter prefix, unlike the rest.
# Confirm they match df_industry["sector"].unique() exactly; a label that is
# not a key here maps to NaN.
translations = {
    "B_E Industria": "n_industry",
    "F Construcción": "n_construction",
    "Comercio, transporte y hostelería": "n_trade_transport_hospitality",
    "J Información y comunicaciones": "n_info_communications",
    "K Actividades financieras y de seguros": "n_financial_insurance",
    "L Actividades inmobiliarias": "n_real_estate",
    "Actividades profesionales y técnicas": "n_professional_technical",
    "Educación, sanidad y servicios sociales": "n_eduation_health_social",
    "Otros servicios personales": "n_other",
}


In [None]:
# Translate the Spanish sector labels into the n_* names; labels that are not
# keys of `translations` become NaN.
# .assign builds a new frame instead of writing a column into the result of
# the earlier .query() filters, which avoids pandas' SettingWithCopyWarning.
df_industry = df_industry.assign(sector=df_industry["sector"].map(translations))

In [None]:
# Count missing values per column after translating the sector labels
# (a non-zero 'sector' count means some labels were not in `translations`).
df_industry.isna().sum()

In [None]:
# Re-check dtypes and non-null counts after the sector translation.
df_industry.info()

In [None]:
# Display the frame after the sector translation.
df_industry

In [None]:
# Keep only rows that carry a municipality value.
has_municipality = df_industry["municipality"].notna()
df_industry = df_industry[has_municipality]

In [None]:
# Preview only: shows the frame without the rows whose 'total' is NaN.
# The result is not assigned, so df_industry itself is unchanged by this cell.
# NOTE(review): if these rows were meant to be removed, assign the result back.
df_industry.dropna(subset=['total'])

In [None]:
# Split 'province' at the first space: the part before it becomes cod_prov,
# the remainder replaces province.
# .assign builds a new frame rather than writing columns into the frame
# returned by dropna(), which avoids pandas' SettingWithCopyWarning.
province_parts = df_industry["province"].str.split(" ", n=1, expand=True)
df_industry = df_industry.assign(
    cod_prov=province_parts[0],
    province=province_parts[1],
)
df_industry

In [None]:
# Split 'municipality' at the first space: the part before it becomes cmun,
# the remainder replaces municipality.
# .assign builds a new frame instead of setting columns on a possibly
# filtered slice (avoids pandas' SettingWithCopyWarning).
municipality_parts = df_industry["municipality"].str.split(" ", n=1, expand=True)
df_industry = df_industry.assign(
    cmun=municipality_parts[0],
    municipality=municipality_parts[1],
)

In [None]:
# Keep only the columns used from here on, in this order.
kept_columns = ["cmun", "municipality", "province", "sector", "total"]
df_industry = df_industry[kept_columns]

In [None]:
# Turn 'total' into integer counts before it is summed in the pivot below.
# Previously the column stayed as strings (with '.' rewritten to '0'), so the
# later aggfunc='sum' operated on text instead of adding numbers.
# A lone '.' is the placeholder for "no figure" and becomes 0, as before.
# Missing totals (NaN) also become 0, which is the value the pivot's sum
# would have produced for them anyway.
# NOTE(review): dots inside a figure are removed on the assumption that they
# are thousands separators (e.g. '1.234' -> 1234) and that totals are whole
# numbers. Confirm against the raw file.
# The bare query that used to open this cell was never displayed (it was not
# the last expression), so it has been dropped.
total_text = df_industry["total"].map(
    lambda value: (value.replace(".", "") or "0") if isinstance(value, str) else value
)
total_counts = pd.to_numeric(total_text).fillna(0).astype("int64")
df_industry = df_industry.assign(total=total_counts)


In [None]:
# List the distinct sector values that will become pivot columns.
df_industry["sector"].unique()

In [None]:
# Reshape to one row per cmun with one column per sector, summing 'total'.
# (cmun, sector) combinations absent from the data are filled with 0.
pivot_df = pd.pivot_table(
    df_industry,
    values="total",
    index="cmun",
    columns="sector",
    aggfunc="sum",
    fill_value=0,
)
# Move cmun out of the index into a regular column.
pivot_df = pivot_df.reset_index()


In [None]:
# Desired left-to-right order of the sector columns in the output.
# Each name must match a value of `translations` exactly: the reorder step
# skips any name that is not a pivot_df column. The previous
# 'N_trade_transport_hospitality' (capital N) never matched, so that sector
# was silently left out of the result.
industry_group_order = [
    'n_industry',
    'n_construction',
    'n_trade_transport_hospitality',
    'n_info_communications',
    'n_financial_insurance',
    'n_real_estate',
    'n_professional_technical',
    'n_eduation_health_social',
    'n_other',
]
# Put cmun first, followed by the sector columns in the configured order.
# Names from industry_group_order that are not columns of pivot_df are skipped.
present_groups = [name for name in industry_group_order if name in pivot_df.columns]
pivot_df = pivot_df[["cmun", *present_groups]]


In [None]:
# Display the reordered pivot table.
pivot_df

In [None]:
# Count missing values per pivot column.
pivot_df.isna().sum()

In [None]:
# The per-sector figures now live in pivot_df, so drop the long-format columns.
df_industry = df_industry.drop(columns=["sector", "total"])

In [None]:
# Collapse the repeated rows left after removing the sector columns,
# keeping the first occurrence of each.
is_repeat = df_industry.duplicated()
df_industry = df_industry[~is_repeat]

In [None]:
# Display the de-duplicated frame.
df_industry

In [None]:
# Attach the per-sector columns to each row. cmun is the only column the two
# frames share, so it is the join key.
df_industry = pd.merge(df_industry, pivot_df, how="left", on="cmun")
df_industry.head(20)

In [None]:
# Drop the name columns so the export holds cmun plus the sector columns.
df_industry = df_industry.drop(columns=["municipality", "province"])

In [None]:
# Write the processed table to disk without the index column.
processed_industry_path = "../data/processed/filtered_industry.csv"
df_industry.to_csv(processed_industry_path, index=False)