In [19]:
# CAGED (Cadastro Geral de Empregados e Desempregados)
# name:         caged_xlsx_to_csv
# date:         29/03/2022 - 01/04/2022
# description:  caged xlsx file data processed, optimized and saved into csv file

import pandas as pd


In [20]:
# file_in / file_out: path of the workbook that is read and of the csv file
# that is written
# first_df: raw dataframe loaded from a single sheet of the workbook

file_in = '3-tabelas_MAR.xlsx'
file_out = 'caged.csv'

# an integer sheet_name picks the sheet by position; the first 4 rows of the
# sheet are skipped before the header row is read
first_df = pd.read_excel(io=file_in, sheet_name=7, skiprows=4)


Unnamed: 0.1,Unnamed: 0,Mês,Estoque,Admissões,Desligamentos,Saldos,Variação Relativa (%)
0,,Janeiro/2020,38228184.0,1510568.0,1398611.0,111957.0,----
1,,Fevereiro/2020,38445391.0,1614177.0,1396970.0,217207.0,0.568186
2,,Março/2020,38150336.0,1460074.0,1755129.0,-295055.0,-0.767465
3,,Abril/2020,37168502.0,665007.0,1646841.0,-981834.0,-2.573592
4,,Maio/2020,36770059.0,762009.0,1160452.0,-398443.0,-1.071991
5,,Junho/2020,36716025.0,962543.0,1016577.0,-54034.0,-0.146951
6,,Julho/2020,36824435.0,1177481.0,1069071.0,108410.0,0.295266
7,,Agosto/2020,37038958.0,1329407.0,1114884.0,214523.0,0.582556
8,,Setembro/2020,37338331.0,1491114.0,1191741.0,299373.0,0.808265
9,,Outubro/2020,37704350.0,1663104.0,1297085.0,366019.0,0.980277


In [21]:
# Prepares the monthly evolution table: keeps only the rows whose 'Mês' cell
# ends with a four-digit year and converts that column to datetime.

months_names_br = ('Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho',
                   'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro')
month_names_dec = ('01',      '02',        '03',    '04',    '05',
                   '06',    '07',    '08',     '09',       '10',      '11',       '12')

# Brazilian month name -> two-digit month number, used for the replacements
mbr2mdec = dict(zip(months_names_br, month_names_dec))

# Select the date cells only: the last four characters must be a year between
# 1900 and 2099. Missing or non-text cells compare as False and are dropped.
# The explicit .copy() makes df2 an independent frame, so the column
# assignments below do not write into a slice of first_df.
year_suffix = first_df['Mês'].str.slice(-4)
df2 = first_df[(year_suffix >= '1900') & (year_suffix <= '2099')].copy()

# 'Janeiro/2020' -> '01/2020' -> '01-2020'
df2['Mês'] = df2['Mês'].replace(mbr2mdec, regex=True)
df2['Mês'] = df2['Mês'].str.replace('/', '-')

# The strings are now month-year, so the matching format is '%m-%Y'.
# A '%Y-%m-%d' format does not describe them and is rejected by pandas
# versions that enforce the given format strictly.
df2['Mês'] = pd.to_datetime(df2['Mês'], format='%m-%Y')

# function to verify invalid types


def try_int(val):
    """Tell whether ``val`` can be converted to an integer.

    The value is converted with ``float`` first and the result with ``int``,
    so numeric text such as ``'12.0'`` is accepted, while NaN, infinities,
    ``None`` and non-numeric text are rejected.

    Parameters
    ----------
    val : object
        Cell value to check.

    Returns
    -------
    bool
        True when ``int(float(val))`` succeeds, False otherwise.
    """
    try:
        int(float(val))
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric
        # text or NaN; OverflowError: infinity. Anything else (for instance
        # KeyboardInterrupt) is deliberately not swallowed.
        return False
    return True


# Keep only the rows whose 'Saldos' value can be converted to an integer.
# The mask has its own name so the builtin filter() is not shadowed for the
# rest of the session; astype(bool) keeps the mask boolean even when df2 has
# no rows.
saldos_is_valid = df2['Saldos'].apply(try_int).astype(bool)
df2 = df2.loc[saldos_is_valid].copy()

df2['Saldos'] = df2['Saldos'].astype(int)

# first output table: month and total balance
first_out_df = df2[['Mês', 'Saldos']]
first_out_df


Unnamed: 0,Mês,Saldos
0,2020-01-01,111957
1,2020-02-01,217207
2,2020-03-01,-295055
3,2020-04-01,-981834
4,2020-05-01,-398443
5,2020-06-01,-54034
6,2020-07-01,108410
7,2020-08-01,214523
8,2020-09-01,299373
9,2020-10-01,366019


In [22]:
# second_df: raw dataframe loaded from another sheet of the same workbook;
# the two rows given in header form a two-level column index


second_df = pd.read_excel(
    io=file_in,
    sheet_name=11,
    skiprows=4,
    header=[0, 1],
)


In [23]:
# Monthly Saldos per region, built from second_df

# two-level label of the column that is matched against the region names
regions_table = ('Região e UF', 'Unnamed: 1_level_1')

# rows kept in the output
regions_names = (
    'Norte',
    'Nordeste',
    'Sudeste',
    'Sul',
    'Centro-Oeste',
    'Não identificado',
)

# work on a copy so that second_df stays as it was loaded
df2 = second_df.copy()


# function to create a dataframe columns list to use
def createColumnsList(param_df):
    """List the column labels of ``param_df`` that hold monthly Saldos.

    Every column label is unpacked into two parts. A label is kept when the
    text before the first '/' of its first part is one of the names in
    ``months_names_br`` and its second part contains 'saldos' (checked in
    lower case).

    Parameters
    ----------
    param_df : pandas.DataFrame
        Frame whose column labels are two-part tuples of strings.

    Returns
    -------
    list
        The matching column labels, in column order.
    """
    def is_monthly_saldos(label):
        period, measure = label
        if period.split('/')[0] not in months_names_br:
            return False
        return 'saldos' in measure.lower()

    return [label for label in param_df.columns.values.tolist()
            if is_monthly_saldos(label)]


# function to remove invalid values
def cleanList(cl_df2, list_to_clean):
    """Drop the rows that hold a non-integer value in any of the given columns.

    The columns are checked one after the other with ``try_int``; the rows
    that fail are removed from ``cl_df2`` in place before the next column is
    checked.

    Parameters
    ----------
    cl_df2 : pandas.DataFrame
        Frame to clean. It is modified in place.
    list_to_clean : list
        Labels of the columns to check.

    Returns
    -------
    pandas.DataFrame
        The same ``cl_df2`` object, after the rows were dropped.
    """
    for column in list_to_clean:
        # astype(bool) keeps the mask boolean even when the frame is empty
        is_valid = cl_df2[column].apply(try_int).astype(bool)
        invalid_rows = cl_df2.index[~is_valid.to_numpy()]
        cl_df2.drop(index=invalid_rows, inplace=True)

    return cl_df2


# Saldos columns of every month; rows with non-integer values are removed
months_list = createColumnsList(df2)
df2 = cleanList(df2, months_list)

# keep only the rows of the selected regions
df3 = df2.loc[df2[regions_table].isin(regions_names)]

# integer Saldos with the months on the rows and one column per region
second_out_df = df3[months_list].astype(int).T

second_out_df


Unnamed: 0,Unnamed: 1,1,9,19,24,28,33
Janeiro/2020,Saldos,2818,-3397,32192,60161,20170,13
Fevereiro/2020,Saldos,10793,2204,107425,70366,26426,-7
Março/2020,Saldos,-7535,-70004,-159720,-40221,-17571,-4
Abril/2020,Saldos,-32789,-153334,-505531,-225073,-65107,0
Maio/2020,Saldos,-12755,-64149,-211141,-91232,-19235,69
Junho/2020,Saldos,5518,-10635,-47029,-6764,4740,136
Julho/2020,Saldos,15771,19988,33731,25929,12989,2
Agosto/2020,Saldos,22841,55683,86373,36656,12962,8
Setembro/2020,Saldos,21822,86278,118147,58123,14993,10
Outubro/2020,Saldos,20811,64635,173157,86095,21157,164


In [24]:
# third_df: raw dataframe loaded from another sheet of the same workbook;
# the two rows given in header form a two-level column index

third_df = pd.read_excel(
    io=file_in,
    sheet_name=9,
    skiprows=4,
    header=[0, 1],
)


In [25]:
# Monthly Saldos per economic sector, built from third_df

# two-level label of the column that is matched against the sector names
gr_act_econ_table = (
    'Grupamento de Atividades Econômicas e Seção CNAE 2.0',
    'Unnamed: 1_level_1',
)

# rows kept in the output
sector_names = (
    'Agricultura, pecuária, produção florestal, pesca e aquicultura',
    'Indústria geral',
    'Construção',
    'Comércio; reparação de veículos automotores e motocicletas',
    'Serviços',
    'Não identificado***',
)

# work on a copy so that third_df stays as it was loaded
df2 = third_df.copy()

# Saldos columns of every month; rows with non-integer values are removed
sectors_list = createColumnsList(df2)
df2 = cleanList(df2, sectors_list)

# keep only the rows of the selected sectors
df3 = df2.loc[df2[gr_act_econ_table].isin(sector_names)]

# integer Saldos with the months on the rows and one column per sector
third_out_df = df3[sectors_list].astype(int).T
third_out_df


Unnamed: 0,Unnamed: 1,1,2,7,8,9,27
Janeiro/2020,Saldos,16485,58261,35866,-51726,53071,0
Fevereiro/2020,Saldos,3666,39971,25990,9053,138528,-1
Março/2020,Saldos,-8575,-42788,-17646,-89225,-136821,0
Abril/2020,Saldos,-6833,-217738,-75300,-265612,-416351,0
Maio/2020,Saldos,14792,-107815,-23958,-106402,-175059,-1
Junho/2020,Saldos,37710,-8936,15568,-27560,-70816,0
Julho/2020,Saldos,22470,49444,40722,20805,-25031,0
Agosto/2020,Saldos,8025,90198,50520,43706,22074,0
Setembro/2020,Saldos,5321,111316,47379,66418,68939,0
Outubro/2020,Saldos,-4501,84138,35035,110621,140726,0


In [26]:
# Builds the final table from the three output tables and writes the csv file

# Output column names. 'Mês' becomes an unnamed column; the keys of the other
# two dictionaries are the column labels of second_out_df / third_out_df.
first_columns_names_out = {'Mês': '', 'Saldos': 'total'}
second_columns_names_out = {1: 'reg1', 9: 'reg2',
                            19: 'reg3', 24: 'reg4', 28: 'reg5', 33: 'reg9'}
third_columns_names_out = {1: 'setA', 2: 'setE',
                           7: 'setF', 8: 'setG', 9: 'setU', 27: 'setZ'}

# errors='raise' makes rename fail with a KeyError when an expected label is
# missing, instead of silently leaving a column with its old name in the csv.
temp_first_out_df = first_out_df.reset_index(drop=True).rename(
    columns=first_columns_names_out, errors='raise')

temp_second_out_df = second_out_df.reset_index(drop=True).rename(
    columns=second_columns_names_out, errors='raise')

temp_third_out_df = third_out_df.reset_index(drop=True).rename(
    columns=third_columns_names_out, errors='raise')

# NOTE(review): after reset_index the three tables are joined by row
# position, not by month - this assumes they list the same months in the
# same order; confirm against the workbook.
final_out_df = pd.concat(
    [temp_first_out_df, temp_second_out_df, temp_third_out_df], axis=1)
final_out_df.to_csv(file_out, index=False)
final_out_df


Unnamed: 0,Unnamed: 1,total,reg1,reg2,reg3,reg4,reg5,reg9,setA,setE,setF,setG,setU,setZ
0,2020-01-01,111957,2818,-3397,32192,60161,20170,13,16485,58261,35866,-51726,53071,0
1,2020-02-01,217207,10793,2204,107425,70366,26426,-7,3666,39971,25990,9053,138528,-1
2,2020-03-01,-295055,-7535,-70004,-159720,-40221,-17571,-4,-8575,-42788,-17646,-89225,-136821,0
3,2020-04-01,-981834,-32789,-153334,-505531,-225073,-65107,0,-6833,-217738,-75300,-265612,-416351,0
4,2020-05-01,-398443,-12755,-64149,-211141,-91232,-19235,69,14792,-107815,-23958,-106402,-175059,-1
5,2020-06-01,-54034,5518,-10635,-47029,-6764,4740,136,37710,-8936,15568,-27560,-70816,0
6,2020-07-01,108410,15771,19988,33731,25929,12989,2,22470,49444,40722,20805,-25031,0
7,2020-08-01,214523,22841,55683,86373,36656,12962,8,8025,90198,50520,43706,22074,0
8,2020-09-01,299373,21822,86278,118147,58123,14993,10,5321,111316,47379,66418,68939,0
9,2020-10-01,366019,20811,64635,173157,86095,21157,164,-4501,84138,35035,110621,140726,0
