In [None]:
# CAGED (Cadastro Geral de Empregados e Desempregados)
# name:         caged_xlsx_to_csv
# date:         29/03/2022 - 2022
# description:  processes and optimizes CAGED data from an xlsx file and saves it to a csv file

import pandas as pd

In [None]:
# Names defined by this cell:
#   file_in, file_out - path of the input workbook and of the output csv
#   dfb               - every sheet of the workbook (sheet_name=None loads them all)
#   df                - the single sheet 'Tabela 5.1'

# files
file_in, file_out = '3-tabelas.xlsx', 'caged.csv'

# leading rows to skip before the header row of the sheets
# NOTE(review): the list below came with the original code and presumably
# records a per-sheet skip count - confirm.
skiprows = 4  # [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0]

dfb = pd.read_excel(
    io=file_in,
    sheet_name=None,
    skiprows=skiprows,
)
df = dfb['Tabela 5.1']
df


In [None]:
# Builds the monthly evolution of the 'Saldos' column, indexed by date.

months_names_br = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho',
                   'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']
month_names_dec = ['01', '02', '03', '04', '05', '06',
                   '07', '08', '09', '10', '11', '12']

# Portuguese month name -> two-digit month number, used for the replacement below
mbr2mdec = dict(zip(months_names_br, month_names_dec))

# Keep only the rows whose 'Mês' cell ends in a year between '1900' and '2099'
# (compared as 4-character strings). The explicit .copy() gives an independent
# frame, so the column assignments below neither touch `df` nor trigger
# SettingWithCopyWarning.
year_suffix = df['Mês'].str.slice(-4)
df2 = df[(year_suffix >= '1900') & (year_suffix <= '2099')].copy()

# 'Janeiro/2020' -> '01/2020' -> Timestamp('2020-01-01')
# The format must describe the actual text (month/year); the previous
# '%Y-%m-%d' did not match it and only worked through pandas' lenient fallback.
df2['Mês'] = df2['Mês'].replace(mbr2mdec, regex=True)
df2['Mês'] = pd.to_datetime(df2['Mês'], format='%m/%Y')

# 'Saldos' as whole numbers
df2['Saldos'] = df2['Saldos'].round().astype(int)

# Final column names: the date column gets an empty header, 'Saldos' becomes 'total'.
# This frame is only displayed here; it is not written to file_out.
df2 = df2.rename(columns={'Mês': '', 'Saldos': 'total'})
first_opt_df = df2[['', 'total']]
first_opt_df


In [None]:
# Reloads `df` from the sheet at position 11 of the workbook, using the
# first two rows after the skipped ones as a two-level column header.

df = pd.read_excel(
    io=file_in,
    sheet_name=11,
    skiprows=4,
    header=[0, 1],
)
df


In [None]:
# Monthly evolution of the 'Saldos' columns per region: setup.

# work on a copy so `df` stays as loaded
df2 = df.copy()

# two-level label of the column holding the region / UF names
regions_table = ('Região e UF', 'Unnamed: 1_level_1')

# row labels to keep
regions_names = [
    'Norte',
    'Nordeste',
    'Sudeste',
    'Sul',
    'Centro-Oeste',
    'Não identificado',
]


# function to create a dataframe columns list to use
def createColumnsList(param_df, months=None):
    """Return the two-level column labels of ``param_df`` holding monthly 'Saldos' data.

    A label ``(date, info_type)`` is kept when the text before the first '/'
    in ``date`` is one of ``months`` and ``info_type`` contains 'saldos'
    (case-insensitive).

    Parameters
    ----------
    param_df : pandas.DataFrame
        Frame whose column labels are 2-tuples ``(date, info_type)``.
    months : iterable of str, optional
        Accepted month names. When omitted, the notebook-level
        ``months_names_br`` list is used.

    Returns
    -------
    list of tuple
        The matching column labels, in their original order.
    """
    if months is None:
        months = months_names_br
    accepted_months = set(months)

    columns_list = []
    for column_name in param_df.columns.tolist():
        col_date, col_info = column_name
        # A non-text label cannot be a '<month>/<year>' or 'Saldos' label;
        # skip it instead of failing on .split() / .lower().
        if not (isinstance(col_date, str) and isinstance(col_info, str)):
            continue
        month = col_date.split('/')[0]
        if month in accepted_months and 'saldos' in col_info.lower():
            columns_list.append(column_name)

    return columns_list

regions_list = createColumnsList(df2)

# Keep the rows whose region label is in regions_names, then only the
# columns selected above; save that selection and show it transposed.
# NOTE(review): the sectors cell further down also writes to file_out, so
# whichever cell runs last overwrites the other's csv - confirm this is intended.
region_rows = df2[regions_table].isin(regions_names)
df3 = df2[region_rows]
region_saldos = df3[regions_list]
region_saldos.to_csv(file_out, index=False)
second_opt_df = region_saldos.transpose()
second_opt_df


In [None]:
# Reloads `df` from the sheet at position 9 of the workbook, using the
# first two rows after the skipped ones as a two-level column header.

df = pd.read_excel(
    io=file_in,
    sheet_name=9,
    skiprows=4,
    header=[0, 1],
)
df

In [32]:
# Monthly evolution of the 'Saldos' columns per economic-activity group.

# work on a copy so `df` stays as loaded
df2 = df.copy()

# two-level label of the column holding the activity-group names
gr_act_econ_table = (
    'Grupamento de Atividades Econômicas e Seção CNAE 2.0',
    'Unnamed: 1_level_1',
)

# row labels to keep
sector_names = [
    'Agricultura, pecuária, produção florestal, pesca e aquicultura',
    'Indústria geral',
    'Construção',
    'Comércio; reparação de veículos automotores e motocicletas',
    'Serviços',
    'Não identificado***',
]

sectors_list = createColumnsList(df2)

# Keep the rows whose group label is in sector_names, then only the
# columns selected above; save that selection and show it transposed.
sector_rows = df2[gr_act_econ_table].isin(sector_names)
df3 = df2[sector_rows]
sector_saldos = df3[sectors_list]
sector_saldos.to_csv(file_out, index=False)
third_opt_df = sector_saldos.transpose()
third_opt_df


Unnamed: 0,Unnamed: 1,1,2,7,8,9,27
Janeiro/2020,Saldos,16485.0,58261.0,35866.0,-51726.0,53071.0,0.0
Fevereiro/2020,Saldos,3666.0,39971.0,25990.0,9053.0,138528.0,-1.0
Março/2020,Saldos,-8575.0,-42788.0,-17646.0,-89225.0,-136821.0,0.0
Abril/2020,Saldos,-6833.0,-217738.0,-75300.0,-265612.0,-416351.0,0.0
Maio/2020,Saldos,14792.0,-107815.0,-23958.0,-106402.0,-175059.0,-1.0
Junho/2020,Saldos,37710.0,-8936.0,15568.0,-27560.0,-70816.0,0.0
Julho/2020,Saldos,22470.0,49444.0,40722.0,20805.0,-25031.0,0.0
Agosto/2020,Saldos,8025.0,90198.0,50520.0,43706.0,22074.0,0.0
Setembro/2020,Saldos,5321.0,111316.0,47379.0,66418.0,68939.0,0.0
Outubro/2020,Saldos,-4501.0,84138.0,35035.0,110621.0,140726.0,0.0
