In [424]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import plotly.express as px
import numpy as np

from scripts.accent_cleaner import AccentCleaner
from scripts.column_aligner import ColumnAligner
from scripts.utils import split_at_char, replace_with

In [425]:
df_industry = pd.read_csv("../data/large_files/industry.csv", sep=";")

In [426]:
df_industry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171027 entries, 0 to 1171026
Data columns (total 6 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   Totales Territoriales  1171027 non-null  object
 1   Provincias             1170884 non-null  object
 2   Municipios             1163448 non-null  object
 3   Grupos CNAE            1171027 non-null  object
 4   Periodo                1171027 non-null  int64 
 5   Total                  380174 non-null   object
dtypes: int64(1), object(5)
memory usage: 53.6+ MB


In [427]:
df_industry.isna().sum()

Totales Territoriales         0
Provincias                  143
Municipios                 7579
Grupos CNAE                   0
Periodo                       0
Total                    790853
dtype: int64

In [428]:
df_industry.rename(columns={"Totales Territoriales": "territory_total", 
                   "Provincias": "province",
                   "Municipios": "municipality", 
                   "Grupos CNAE": "sector",
                   "Periodo": "period", 
                   "Total": "total"},
                    inplace=True)

In [429]:
df_industry

Unnamed: 0,territory_total,province,municipality,sector,period,total
0,Total Nacional,,,Total,2024,3.255.276
1,Total Nacional,,,Total,2023,3.207.580
2,Total Nacional,,,Total,2022,3.430.663
3,Total Nacional,,,Total,2021,3.366.570
4,Total Nacional,,,Total,2020,3.404.428
...,...,...,...,...,...,...
1171022,Total Nacional,52 Melilla,52001 Melilla,Total servicios,2016,1.414
1171023,Total Nacional,52 Melilla,52001 Melilla,Total servicios,2015,1.339
1171024,Total Nacional,52 Melilla,52001 Melilla,Total servicios,2014,1.226
1171025,Total Nacional,52 Melilla,52001 Melilla,Total servicios,2013,1.244


In [430]:
df_industry = df_industry.query("period == 2024")

In [431]:
df_industry.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90079 entries, 0 to 1171014
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   territory_total  90079 non-null  object
 1   province         90068 non-null  object
 2   municipality     89496 non-null  object
 3   sector           90079 non-null  object
 4   period           90079 non-null  int64 
 5   total            29195 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.8+ MB


In [432]:
df_industry.isna().sum()

territory_total        0
province              11
municipality         583
sector                 0
period                 0
total              60884
dtype: int64

In [433]:
df_industry["territory_total"].unique()

array(['Total Nacional'], dtype=object)

In [434]:
df_industry = df_industry.drop(["territory_total", "period"], axis=1)

In [435]:
df_industry["sector"].unique()

array(['Total', 'B_E Industria', 'F Construcción',
       'Comercio, transporte y hostelería',
       'J Información y comunicaciones',
       'K Actividades financieras y de seguros',
       'L Actividades inmobiliarias',
       'Actividades profesionales y técnicas',
       'Educación, sanidad y servicios sociales',
       'Otros servicios personales', 'Total servicios'], dtype=object)

In [436]:
df_industry = df_industry.query("sector != 'Total servicios'")

In [437]:
df_industry = df_industry.query("sector != 'Total'")

In [438]:
translations = {
    'B_E Industria': 'n_industry',
    'F Construcción': 'n_construction',
    'Comercio, transporte y hostelería': 'n_trade_transport_hospitality',
    'J Información y comunicaciones': 'n_info_communications',
    'K Actividades financieras y de seguros': 'n_financial_insurance',
    'L Actividades inmobiliarias': 'n_real_estate',
    'Actividades profesionales y técnicas': 'n_professional_technical',
    'Educación, sanidad y servicios sociales': 'n_eduation_health_social',
    'Otros servicios personales': 'n_other',
}


In [439]:
df_industry['sector'] = df_industry['sector'].map(translations)

In [440]:
df_industry.isna().sum()

province            9
municipality      477
sector              0
total           55890
dtype: int64

In [441]:
df_industry.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73701 entries, 13 to 1171001
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   province      73692 non-null  object
 1   municipality  73224 non-null  object
 2   sector        73701 non-null  object
 3   total         17811 non-null  object
dtypes: object(4)
memory usage: 2.8+ MB


In [442]:
df_industry

Unnamed: 0,province,municipality,sector,total
13,,,n_industry,175.806
26,,,n_construction,383.622
39,,,n_trade_transport_hospitality,1.104.814
52,,,n_info_communications,71.979
65,,,n_financial_insurance,76.243
...,...,...,...,...
1170949,52 Melilla,52001 Melilla,n_financial_insurance,46
1170962,52 Melilla,52001 Melilla,n_real_estate,100
1170975,52 Melilla,52001 Melilla,n_professional_technical,616
1170988,52 Melilla,52001 Melilla,n_eduation_health_social,389


In [443]:
df_industry = df_industry.dropna(subset=['municipality'])

In [444]:
df_industry.dropna(subset=['total'])

Unnamed: 0,province,municipality,sector,total
299,01 Araba/Álava,01001 Alegría-Dulantzi,n_industry,18
312,01 Araba/Álava,01001 Alegría-Dulantzi,n_construction,30
325,01 Araba/Álava,01001 Alegría-Dulantzi,n_trade_transport_hospitality,51
442,01 Araba/Álava,01002 Amurrio,n_industry,63
455,01 Araba/Álava,01002 Amurrio,n_construction,89
...,...,...,...,...
1170949,52 Melilla,52001 Melilla,n_financial_insurance,46
1170962,52 Melilla,52001 Melilla,n_real_estate,100
1170975,52 Melilla,52001 Melilla,n_professional_technical,616
1170988,52 Melilla,52001 Melilla,n_eduation_health_social,389


In [445]:
df_industry[['cod_prov', 'province']] = df_industry['province'].str.split(' ', n=1, expand=True)
df_industry

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_industry[['cod_prov', 'province']] = df_industry['province'].str.split(' ', n=1, expand=True)


Unnamed: 0,province,municipality,sector,total,cod_prov
299,Araba/Álava,01001 Alegría-Dulantzi,n_industry,18,01
312,Araba/Álava,01001 Alegría-Dulantzi,n_construction,30,01
325,Araba/Álava,01001 Alegría-Dulantzi,n_trade_transport_hospitality,51,01
338,Araba/Álava,01001 Alegría-Dulantzi,n_info_communications,,01
351,Araba/Álava,01001 Alegría-Dulantzi,n_financial_insurance,,01
...,...,...,...,...,...
1170949,Melilla,52001 Melilla,n_financial_insurance,46,52
1170962,Melilla,52001 Melilla,n_real_estate,100,52
1170975,Melilla,52001 Melilla,n_professional_technical,616,52
1170988,Melilla,52001 Melilla,n_eduation_health_social,389,52


In [446]:
df_industry[['cmun', 'municipality']] = df_industry['municipality'].str.split(' ', n=1, expand=True)

df_industry

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_industry[['cmun', 'municipality']] = df_industry['municipality'].str.split(' ', n=1, expand=True)


Unnamed: 0,province,municipality,sector,total,cod_prov,cmun
299,Araba/Álava,Alegría-Dulantzi,n_industry,18,01,01001
312,Araba/Álava,Alegría-Dulantzi,n_construction,30,01,01001
325,Araba/Álava,Alegría-Dulantzi,n_trade_transport_hospitality,51,01,01001
338,Araba/Álava,Alegría-Dulantzi,n_info_communications,,01,01001
351,Araba/Álava,Alegría-Dulantzi,n_financial_insurance,,01,01001
...,...,...,...,...,...,...
1170949,Melilla,Melilla,n_financial_insurance,46,52,52001
1170962,Melilla,Melilla,n_real_estate,100,52,52001
1170975,Melilla,Melilla,n_professional_technical,616,52,52001
1170988,Melilla,Melilla,n_eduation_health_social,389,52,52001


In [447]:
df_industry = df_industry[["cmun","municipality","province","sector", "total"]]

In [448]:
df_industry

Unnamed: 0,cmun,municipality,province,sector,total
299,01001,Alegría-Dulantzi,Araba/Álava,n_industry,18
312,01001,Alegría-Dulantzi,Araba/Álava,n_construction,30
325,01001,Alegría-Dulantzi,Araba/Álava,n_trade_transport_hospitality,51
338,01001,Alegría-Dulantzi,Araba/Álava,n_info_communications,
351,01001,Alegría-Dulantzi,Araba/Álava,n_financial_insurance,
...,...,...,...,...,...
1170949,52001,Melilla,Melilla,n_financial_insurance,46
1170962,52001,Melilla,Melilla,n_real_estate,100
1170975,52001,Melilla,Melilla,n_professional_technical,616
1170988,52001,Melilla,Melilla,n_eduation_health_social,389


In [449]:
df_industry["sector"].unique()

array(['n_industry', 'n_construction', 'n_trade_transport_hospitality',
       'n_info_communications', 'n_financial_insurance', 'n_real_estate',
       'n_professional_technical', 'n_eduation_health_social', 'n_other'],
      dtype=object)

In [450]:
pivot_df = df_industry.pivot_table(
    index='cmun',
    columns='sector',
    values='total',
    aggfunc='sum',
    fill_value=0  # fill missing combinations with 0
).reset_index()


In [451]:
industry_group_order = ['n_industry', 'n_construction', 'N_trade_transport_hospitality',
       'n_info_communications', 'n_financial_insurance', 'n_real_estate',
       'n_professional_technical', 'n_eduation_health_social', 'n_other']
pivot_df = pivot_df[['cmun'] + [grp for grp in industry_group_order if grp in pivot_df.columns]]


In [452]:
pivot_df

sector,cmun,n_industry,n_construction,n_info_communications,n_financial_insurance,n_real_estate,n_professional_technical,n_eduation_health_social,n_other
0,01001,18,30,0,0,0,0,0,0
1,01002,63,89,7,10,5,86,46,51
2,01003,6,4,0,0,0,0,0,0
3,01004,5,14,0,0,0,0,0,0
4,01006,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8131,50901,0,0,0,0,0,0,0,0
8132,50902,0,0,0,0,0,0,0,0
8133,50903,4,18,0,0,0,0,0,0
8134,51001,62,260,48,35,115,496,303,446


In [453]:
pivot_df.isna().sum()

sector
cmun                        0
n_industry                  0
n_construction              0
n_info_communications       0
n_financial_insurance       0
n_real_estate               0
n_professional_technical    0
n_eduation_health_social    0
n_other                     0
dtype: int64

In [454]:
df_industry.drop(["sector", "total"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_industry.drop(["sector", "total"], axis=1, inplace=True)


In [455]:
df_industry= df_industry.drop_duplicates()

In [456]:
df_industry

Unnamed: 0,cmun,municipality,province
299,01001,Alegría-Dulantzi,Araba/Álava
442,01002,Amurrio,Araba/Álava
585,01003,Aramaio,Araba/Álava
728,01004,Artziniega,Araba/Álava
871,01006,Armiñón,Araba/Álava
...,...,...,...
1170039,50901,Biel,Zaragoza
1170182,50902,Marracos,Zaragoza
1170325,50903,Villamayor de Gállego,Zaragoza
1170611,51001,Ceuta,Ceuta


In [457]:
df_industry = df_industry.merge(pivot_df, how ="left")

In [459]:
df_industry.drop(["municipality", "province"], axis = 1, inplace=True)

In [460]:
df_industry.to_csv("../data/processed/filtered_industry.csv", index=False)