### Detailed structure (disaggregation) information.

In [6]:
import pandas as pd
import unidecode

### Brazil occupations (CBO)

In [7]:
# Build the CBO 2002 occupation hierarchy: one row per occupation (level O)
# carrying the ID and label of every coarser level it belongs to
# (F = Familia, SG = SubGrupo, SGP = SubGrupo Principal, GG = Grande Grupo).

def load_cbo_level(path, level, width=None):
    """Load one CBO level file and normalise its column names and IDs.

    Parameters
    ----------
    path : str
        Semicolon-delimited CSV with `CODIGO` and `TITULO` columns.
    level : str
        Level suffix used in the output column names
        (`'CBO ID <level>'`, `'CBO label <level>'`).
    width : int or None
        If given, IDs are left-padded with zeros to this many characters,
        restoring leading zeros lost when the codes were parsed as numbers.

    Returns
    -------
    pandas.DataFrame
        All columns cast to `str`.
    """
    # Decode at read time. Calling `.str.decode` later on values that are
    # already `str` yields NaN on Python 3 and makes `unidecode` fail.
    table = (
        pd.read_csv(path, delimiter=';', encoding='latin-1')
        .rename({'CODIGO': 'CBO ID ' + level, 'TITULO': 'CBO label ' + level}, axis=1)
        .astype(str)
    )
    if width is not None:
        table['CBO ID ' + level] = table['CBO ID ' + level].str.zfill(width)
    return table


# Load finest aggregation
cbo_ocu = load_cbo_level('data/disagg_struct_refs/sources/CBO2002 - Ocupacao.csv', 'O', width=6)

# Add codes of aggregated levels: each parent code is the occupation code
# with its trailing digits dropped.
for level, dropped_digits in [('F', 2), ('SG', 3), ('SGP', 4), ('GG', 5)]:
    cbo_ocu['CBO ID ' + level] = cbo_ocu['CBO ID O'].str[:-dropped_digits]

# Load coarser aggregations
cbo_gg = load_cbo_level('data/disagg_struct_refs/sources/CBO2002 - Grande Grupo.csv', 'GG')
cbo_sgp = load_cbo_level('data/disagg_struct_refs/sources/CBO2002 - SubGrupo Principal.csv', 'SGP', width=2)
cbo_sg = load_cbo_level('data/disagg_struct_refs/sources/CBO2002 - SubGrupo.csv', 'SG', width=3)
cbo_fam = load_cbo_level('data/disagg_struct_refs/sources/CBO2002 - Familia.csv', 'F', width=4)

# Merge. Keys are spelled out so a stray shared column can never silently
# become part of the join condition.
CBO = (
    cbo_ocu
    .merge(cbo_fam, on='CBO ID F')
    .merge(cbo_sg, on='CBO ID SG')
    .merge(cbo_sgp, on='CBO ID SGP')
    .merge(cbo_gg, on='CBO ID GG')
)

# Add the 'total' level
CBO['CBO ID 0'] = '0'
CBO['CBO label 0'] = 'All occupations'

# Remove diacritics from every label column (transliterate to ASCII)
for col in [name for name in CBO.columns if 'label' in name]:
    CBO[col] = CBO[col].map(unidecode.unidecode)

# Save
CBO.to_csv('data/disagg_struct_refs/formatted/CBO_full.csv', index = False)

# Usage
CBO_agg_ref = CBO[['CBO ID O', 'CBO ID F', 'CBO ID SG', 'CBO ID SGP', 'CBO ID GG']]
CBO_agg_labels = CBO[['CBO ID O', 'CBO label O', 'CBO ID F', 'CBO label F', 'CBO ID SG', 'CBO label SG',
                      'CBO ID SGP', 'CBO label SGP', 'CBO ID GG', 'CBO label GG']]

### Chile industries (CIIU). Equivalent to ISIC

In [19]:
# Build the CIIU industry table: one row per 4-digit code, with its 2-digit
# parent, the numeric ID of its 1-letter parent and the 'total' level.

# Decode at read time. Calling `.str.decode` later on values that are
# already `str` yields NaN on Python 3 and makes `unidecode` fail.
ciiu_raw = pd.read_csv('data/disagg_struct_refs/sources/ciiu4.csv', encoding='utf-8')
ciiu_raw.columns = ['CIIU ID 2', 'CIIU ID 3', 'CIIU ID 4', 'CIIU label 4']

# Keep only rows that carry a 4-digit code. `.copy()` makes `ciiu4` an
# independent frame, so the assignments below do not raise
# SettingWithCopyWarning.
ciiu4 = ciiu_raw.loc[ciiu_raw['CIIU ID 4'].notnull(), ['CIIU ID 4', 'CIIU label 4']].copy()
ciiu4['CIIU ID 4'] = ciiu4['CIIU ID 4'].astype(int)
# The 2-digit parent is the 4-digit code with its last two digits dropped.
ciiu4['CIIU ID 2'] = ciiu4['CIIU ID 4'].astype(str).str[:-2].astype(int)

# Joined on whatever columns the two frames share.
# NOTE(review): presumably ciiu0.csv maps 'CIIU ID 2' to 'CIIU ID 1'; confirm
# against the file.
CIIU = ciiu4.merge(pd.read_csv('data/disagg_struct_refs/sources/ciiu0.csv', encoding='utf-8'))

# Turn the letter ID into a number: 'A' -> 1, 'B' -> 2, ...
CIIU['CIIU ID 1'] = CIIU['CIIU ID 1'].map(lambda letter: ord(letter) - ord('A') + 1)

# Add the 'total' level
CIIU['CIIU ID 0'] = '0'
CIIU['CIIU label 0'] = 'All industries'

# Remove diacritics from every label column and normalise the case
for col in [name for name in CIIU.columns if 'label' in name]:
    CIIU[col] = CIIU[col].map(lambda label: unidecode.unidecode(label).capitalize())

# Save
CIIU[['CIIU ID 4', 'CIIU label 4','CIIU ID 2','CIIU ID 1', 'CIIU ID 0']].to_csv('data/disagg_struct_refs/formatted/CIIU_full.csv', index = False)

### Brazil industries (CNAE). Equivalent to ISIC

In [20]:
import numpy as np

# Industries: build the CNAE 2.0 hierarchy, one row per class (C) carrying
# the ID and label of its group (G), division (D) and section (S).

# Decode at read time. Calling `.str.decode` later on values that are
# already `str` yields NaN on Python 3 and makes `unidecode` fail.
CNAE = pd.read_csv('data/disagg_struct_refs/sources/CNAE20_EstruturaDetalhada.csv',
                   header = None, encoding = 'utf-8')
# Source columns, in order: Seção, Divisão, Grupo, Classe, Denominação
CNAE.columns = ['CNAE ID S', 'CNAE ID D' , 'CNAE ID G', 'CNAE ID C', 'CNAE label']

# Remove spurious entries: cells that hold one of these strings are blanked
# out wherever they occur. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
spurious_values = [
    'Seção',
    'Divisão',
    'Grupo',
    'Classe',
    'Denominação',
    '(continua)',
    '2.2 - Estrutura detalhada da CNAE 2.0: Códigos e denominações',
]
CNAE = CNAE.replace(spurious_values, np.nan)

# Fix format, by removing '.' and '-' symbols. `regex=False` makes the
# literal match explicit, so '.' can never be read as "any character".
CNAE['CNAE ID G'] = CNAE['CNAE ID G'].str.replace('.', '', regex=False)
# Class IDs are prefixed with the lower-cased letter of the section they
# fall under (the last section seen above the row).
section_letter = CNAE['CNAE ID S'].ffill().str.lower()
class_digits = CNAE['CNAE ID C'].str.replace('.', '', regex=False).str.replace('-', '', regex=False)
CNAE['CNAE ID C'] = section_letter + class_digits

# One table per level. Class rows inherit their parents' IDs through the
# forward fill; the other levels keep only the rows that define them.
CNAE_cla = CNAE.ffill().loc[CNAE['CNAE ID C'].notnull()].rename({'CNAE label': 'CNAE label C'}, axis = 1)
CNAE_gru = CNAE.loc[CNAE['CNAE ID G'].notnull(), ['CNAE ID G', 'CNAE label']].rename({'CNAE label': 'CNAE label G'}, axis = 1)
CNAE_div = CNAE.loc[CNAE['CNAE ID D'].notnull(), ['CNAE ID D', 'CNAE label']].rename({'CNAE label': 'CNAE label D'}, axis = 1)
CNAE_sec = CNAE.loc[CNAE['CNAE ID S'].notnull(), ['CNAE ID S', 'CNAE label']].rename({'CNAE label': 'CNAE label S'}, axis = 1)

# Merge, with the join key of each level spelled out
CNAE = (
    CNAE_cla
    .merge(CNAE_gru, on='CNAE ID G')
    .merge(CNAE_div, on='CNAE ID D')
    .merge(CNAE_sec, on='CNAE ID S')
)

# Add 'total'
CNAE['CNAE ID 0'] = '0'
CNAE['CNAE label 0'] = 'All industries'

# Remove diacritics from text and fix case in a single pass
for col in [name for name in CNAE.columns if 'label' in name]:
    CNAE[col] = CNAE[col].map(lambda label: unidecode.unidecode(label).capitalize())

# Save
CNAE.to_csv('data/disagg_struct_refs/formatted/CNAE_full.csv', index = False)

# Usage
CNAE_agg_ref = CNAE[['CNAE ID C', 'CNAE ID G', 'CNAE ID D', 'CNAE ID S']]
CNAE_agg_labels = CNAE[['CNAE ID C', 'CNAE label C', 'CNAE ID G', 'CNAE label G', 'CNAE ID D', 'CNAE label D',
                      'CNAE ID S', 'CNAE label S']]