In [None]:
# CAGED (Cadastro Geral de Empregados e Desempregados)
# name:         caged_xlsx_to_csv
# date:         29/03/2022 - 2022
# description:  processes and optimizes the CAGED xlsx data and saves it to a csv file

import pandas as pd

In [None]:
# Defines the input/output file names and loads the workbook with pandas:
#   file_in, file_out - names of the xlsx file to read and the csv file to write
#   dfb               - dict holding every sheet of the workbook (sheet_name=None)
#   df                - the 'Tabela 5.1' sheet taken from dfb

# file names
file_in = '3-tabelas.xlsx'
file_out = 'caged.csv'

# number of undesirable leading rows to skip when reading the sheets
# NOTE(review): the original author kept the list below next to this value;
# its meaning is not evident from the code - confirm before relying on it:
# [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0]
skiprows = 4

dfb = pd.read_excel(
    io=file_in,
    skiprows=skiprows,
    sheet_name=None,
)
df = dfb['Tabela 5.1']
df


In [None]:
# Builds the monthly 'Saldos' series from `df` (loaded in an earlier cell):
# keeps the rows whose 'Mês' cell ends in a year, converts 'Mês' to datetime
# and 'Saldos' to int, renames both columns and exposes the two-column result
# as `first_opt_df`.

months_names_br = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho',
                   'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']
month_names_dec = ['01', '02', '03', '04', '05', '06',
                   '07', '08', '09', '10', '11', '12']

# Portuguese month name -> two-digit month number, used for the replacement below
mbr2mdec = dict(zip(months_names_br, month_names_dec))

# Keep only the rows whose 'Mês' cell ends in a four-digit year (1900-2099).
# The explicit .copy() makes df2 an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning and `df` stays intact.
year_suffix = df['Mês'].str.slice(-4)
df2 = df[(year_suffix >= '1900') & (year_suffix <= '2099')].copy()

# month name -> month number, then '/' -> '-'
# (assumes cells look like 'Janeiro/2020', giving '01-2020' - TODO confirm against the sheet)
df2['Mês'] = df2['Mês'].replace(mbr2mdec, regex=True)
df2['Mês'] = df2['Mês'].str.replace('/', '-', regex=False)

# The strings are now month-year, so the format has to be '%m-%Y'.
# The previous '%Y-%m-%d' + infer_datetime_format=True combination did not
# describe these strings and relies on a keyword deprecated in pandas 2.x.
df2['Mês'] = pd.to_datetime(df2['Mês'], format='%m-%Y')

# 'Saldos' as whole numbers
df2['Saldos'] = df2['Saldos'].round().astype(int)

# final column names: the date column gets an empty name, 'Saldos' becomes 'total'
df2 = df2.rename(columns={'Mês': '', 'Saldos': 'total'})
first_opt_df = df2[['', 'total']]
first_opt_df


In [None]:
# Reads one sheet of the workbook into `df`: the sheet at position 11,
# skipping the first 4 rows and using two header rows (header=[0, 1]),
# which yields two-level column labels.
# NOTE(review): this rebinds `df`, replacing the frame loaded in the earlier cell.

df = pd.read_excel(
    io=file_in,
    sheet_name=11,
    skiprows=4,
    header=[0, 1],
)
df


In [6]:
# Monthly 'Saldos' by region: keeps the rows of `df` whose region label is in
# `regions_names` and only the columns labelled (<month>/<year>, *saldos*),
# writes that selection to `file_out` and exposes it as `second_opt_df`.

regions_table = ('Região e UF', 'Unnamed: 1_level_1')
regions_names = ['Norte', 'Nordeste', 'Sudeste',
                 'Sul', 'Centro-Oeste', 'Não identificado']
months_names_br = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio',
                   'Junho', 'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

# work on a copy so `df` can be reused
df2 = df.copy()

# every column label is a (date, info_type) pair; keep the pairs whose date
# starts with a Portuguese month name and whose info_type mentions 'saldos'
column_names = df2.columns.values.tolist()
regions_list = [
    (col_date, col_info)
    for col_date, col_info in column_names
    if col_date.split('/')[0] in months_names_br
    and 'saldos' in col_info.lower()
]

# rows for the listed regions only
region_rows = df2[regions_table].isin(regions_names)
df3 = df2[region_rows]

# NOTE(review): the region label column (`regions_table`) is not part of
# `regions_list`, so the exported rows carry no region name - confirm intended.
second_opt_df = df3[regions_list]
second_opt_df.to_csv(file_out, index=False)
second_opt_df


Unnamed: 0_level_0,Janeiro/2020,Fevereiro/2020,Março/2020,Abril/2020,Maio/2020,Junho/2020,Julho/2020,Agosto/2020,Setembro/2020,Outubro/2020,...,Maio/2021,Junho/2021,Julho/2021,Agosto/2021,Setembro/2021,Outubro/2021,Novembro/2021,Dezembro/2021,Janeiro/2022,Fevereiro/2022
Unnamed: 0_level_1,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,...,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos,Saldos
1,2818.0,10793.0,-7535.0,-32789.0,-12755.0,5518.0,15771.0,22841.0,21822.0,20811.0,...,18390.0,24163.0,23541.0,22158.0,17538.0,9132.0,16709.0,-14252.0,2537.0,12727.0
9,-3397.0,2204.0,-70004.0,-153334.0,-64149.0,-10635.0,19988.0,55683.0,86278.0,64635.0,...,35530.0,49110.0,52400.0,85190.0,94307.0,50737.0,54909.0,-17862.0,3734.0,28085.0
19,32192.0,107425.0,-159720.0,-505531.0,-211141.0,-47029.0,33731.0,86373.0,118147.0,173157.0,...,149668.0,160355.0,152281.0,187941.0,142080.0,117617.0,169336.0,-146193.0,48447.0,162442.0
24,60161.0,70366.0,-40221.0,-225073.0,-91232.0,-6764.0,25929.0,36656.0,58123.0,86095.0,...,34340.0,42305.0,40953.0,55732.0,47898.0,52318.0,51265.0,-82382.0,59653.0,82898.0
28,20170.0,26426.0,-17571.0,-65107.0,-19235.0,4740.0,12989.0,12962.0,14993.0,21157.0,...,26843.0,37235.0,33932.0,31417.0,21906.0,17133.0,13338.0,-22731.0,33843.0,40930.0
33,13.0,-7.0,-4.0,0.0,69.0,136.0,2.0,8.0,10.0,164.0,...,912.0,1062.0,656.0,658.0,1055.0,1016.0,403.0,-90.0,2141.0,1425.0
