### Create datasets for training and validation

In [344]:
import pandas as pd
import re
import numpy as np

### Read target feature

In [345]:
# Load the target-feature table; the file is semicolon-delimited.
target_feature_csv = '../target_feature/01_create_target_feature.csv'
target_feature = pd.read_csv(target_feature_csv, sep=';')

In [346]:
# Derive normalized join keys from report file names such as
# '8998-Santo Antônio de Jesus-BA.pdf' -> city 'santo antonio de jesus', state 'ba'.
#
# The dot in r'\.pdf' is escaped on purpose: an unescaped '.' matches ANY
# character, so '.pdf' would also remove e.g. 'apdf' from the middle of a name.
target_feature['temp'] = (
    target_feature['filename']
    .str.replace(r'[0-9]|\.pdf|-', ' ', regex=True)  # digits, extension, hyphens -> spaces
    .str.normalize('NFKD')                           # split accented letters into base + mark
    .str.encode('ascii', errors='ignore')            # drop the non-ASCII combining marks
    .str.decode('utf-8')
    .str.lower()
    .str.strip()
)

# After stripping, the text ends with '<space><two-letter state>'.
# The extra strip on the city guards against file names with additional
# padding before the state (e.g. 'City - UF.pdf').
target_feature['city'] = target_feature['temp'].str[:-3].str.strip()
target_feature['state'] = target_feature['temp'].str[-2:]
target_feature['city_state'] = target_feature['city'].map(str) + '_' + target_feature['state']

target_feature = target_feature.drop("temp", axis=1)
target_feature.head()

Unnamed: 0,folder,filename,number_of_words,pct_pol_neg,pct_pol_pos,pct_pol_neu,city,state,city_state
0,ciclo_3,8998-Santo Antônio de Jesus-BA.pdf,45400,0.012207,0.023555,0.964591,santo antonio de jesus,ba,santo antonio de jesus_ba
1,ciclo_3,9024-Ulianópolis-PA.pdf,17199,0.015878,0.015645,0.968768,ulianopolis,pa,ulianopolis_pa
2,ciclo_3,9010-Aldeias Altas-MA.pdf,58870,0.011997,0.015923,0.972471,aldeias altas,ma,aldeias altas_ma
3,ciclo_3,9034-Paraíba do Sul-RJ.pdf,14129,0.012318,0.020671,0.967224,paraiba do sul,rj,paraiba do sul_rj
4,ciclo_3,9045-Governador Celso Ramos-SC.pdf,5201,0.009617,0.017311,0.973456,governador celso ramos,sc,governador celso ramos_sc


### Read explanatory features

In [348]:
# Census folders to scan. The 3rd and 4th '/'-separated components of each
# path (e.g. '2000' and 'education') are used later as column-name prefixes.
paths = ['../ibge_censo/2000/education',
         '../ibge_censo/2000/family',
         '../ibge_censo/2000/fertility',
         '../ibge_censo/2000/work',
         '../ibge_censo/2010/education',
         '../ibge_censo/2010/family',
         '../ibge_censo/2010/fertility',
         #'../ibge_censo/2010/social_indicator',
         '../ibge_censo/2010/work']

# (state folder name, two-letter acronym) kept side by side so that a
# mismatched pair is easy to spot. Fixed here: 'mato_grosso' is 'mt' (it was
# 'mg', colliding with minas_gerais) and 'rondonia' is 'ro' (it was 'rn',
# colliding with rio_grande_do_norte).
_state_acronym_pairs = [
    ('acre', 'ac'),
    ('alagoas', 'al'),
    ('amapa', 'ap'),
    ('amazonas', 'am'),
    ('bahia', 'ba'),
    ('ceara', 'ce'),
    ('distrito_federal', 'df'),
    ('espirito_santo', 'es'),
    ('goias', 'go'),
    ('maranhao', 'ma'),
    ('mato_grosso', 'mt'),
    ('mato_grosso_do_sul', 'ms'),
    ('minas_gerais', 'mg'),
    ('para', 'pa'),
    ('paraiba', 'pb'),
    ('parana', 'pr'),
    ('pernambuco', 'pe'),
    ('piaui', 'pi'),
    ('rio_de_janeiro', 'rj'),
    ('rio_grande_do_norte', 'rn'),
    ('rio_grande_do_sul', 'rs'),
    ('rondonia', 'ro'),
    ('roraima', 'rr'),
    ('santa_catarina', 'sc'),
    ('sao_paulo', 'sp'),
    ('sergipe', 'se'),
    ('tocantins', 'to'),
]
state_name_to_acronym = pd.DataFrame(_state_acronym_pairs,
                                     columns=['full_state_name', 'acronym'])

# File-name suffixes (without '.csv') of the census tables to load.
var_list = ['var_01',
           'var_02',
           'var_03']

In [349]:
import os

In [350]:
# Bind a second name to the same DataFrame object. No copy is made here, so
# in-place edits made through either name are visible through both.
# NOTE(review): presumably meant as a pre-merge backup; if so, confirm and
# consider target_feature.copy() to get an independent snapshot.
target_feature2 = target_feature

In [339]:
# Tip while debugging: temporarily narrow `paths` (e.g. to a single folder)
# in the configuration cell instead of overriding it here.

def _load_census_variable(path, var_name, acronym_by_state):
    """Stack every state's ``*<var_name>.csv`` found under ``path`` into one frame.

    Parameters
    ----------
    path : str
        Census folder containing one sub-folder per state.
    var_name : str
        File-name suffix (without ``.csv``) selecting which table to read.
    acronym_by_state : dict
        Maps a state sub-folder name to its two-letter acronym.

    Returns
    -------
    pandas.DataFrame
        The concatenated tables plus a ``city_state`` key column
        (``'<city>_<acronym>'``); an empty frame when no file matches.

    Raises
    ------
    ValueError
        If a non-hidden entry of ``path`` has no known acronym.
    """
    state_frames = []

    for state in os.listdir(path):
        if state.startswith('.'):
            continue  # hidden entries are not state folders

        if state not in acronym_by_state:
            raise ValueError("No acronym known for state folder '" + state
                             + "' found in '" + path + "'")
        state_acronym = acronym_by_state[state]
        state_dir = os.path.join(path, state)

        for filename in os.listdir(state_dir):
            if filename.startswith('.') or not filename.endswith(var_name + '.csv'):
                continue

            temp = pd.read_csv(os.path.join(state_dir, filename))
            temp['city_state'] = temp['city'].map(str) + '_' + state_acronym
            state_frames.append(temp)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(state_frames) if state_frames else pd.DataFrame()


acronym_by_state = dict(zip(state_name_to_acronym['full_state_name'],
                            state_name_to_acronym['acronym']))

# NOTE: each run appends columns to `target_feature`; re-create it before
# running this cell a second time, otherwise pandas adds _x/_y suffixes.
for var_name in var_list:

    for path in paths:
        print(path)

        # '<year>_<theme>_<var>_' built from the 3rd and 4th path components.
        path_parts = path.split("/")
        prefix = path_parts[2] + '_' + path_parts[3] + '_' + var_name + '_'
        column_to_join = prefix + 'city_state'

        # The table is rebuilt for EVERY path. Previously it was only reset
        # per variable, so later paths were stacked onto (and re-prefixed
        # together with) the already-prefixed tables of earlier paths.
        full_temp = _load_census_variable(path, var_name, acronym_by_state)

        if 'city_state' not in full_temp.columns:
            # Nothing was read for this path/variable; the merge below would
            # fail on the missing key column.
            print('  no ' + var_name + ' files found - skipped')
            continue

        full_temp = full_temp.add_prefix(prefix)

        # iloc[:, 1:] leaves out the first column of the census table
        # (presumably the redundant city name - TODO confirm).
        target_feature = pd.merge(target_feature,
                                  full_temp.iloc[:, 1:],
                                  left_on="city_state",
                                  right_on=column_to_join,
                                  how="left")

        # The right-hand key duplicates 'city_state'; keep only one of them.
        target_feature = target_feature.drop(column_to_join, axis=1)
        

../ibge_censo/2010/fertility


In [340]:
# Preview the first rows of the census table most recently assembled in full_temp.
full_temp.head()

Unnamed: 0,2010_fertility_var_01_city,2010_fertility_var_01_total,2010_fertility_var_01_has_children,2010_fertility_var_01_children_born,2010_fertility_var_01_children_borned_live,2010_fertility_var_01_children_borned_dead,2010_fertility_var_01_city_state
0,abadia dos dourados,2906.961908,1972.099674,5619.793205,5424.434805,195.3584,abadia dos dourados_mg
1,abaete,10040.76944,6460.993784,19358.390255,18676.345856,682.044399,abaete_mg
2,abre campo,5682.389615,3512.03574,12875.246655,12449.934418,425.312237,abre campo_mg
3,acaiaca,1763.406352,1118.360598,4393.116653,4061.905542,331.21111,acaiaca_mg
4,acucena,4363.801991,2721.142363,11688.806913,10919.651802,769.155112,acucena_mg


In [343]:
# Preview the first rows of target_feature2.
target_feature2.head()

Unnamed: 0,folder,filename,number_of_words,pct_pol_neg,pct_pol_pos,pct_pol_neu,city,state,city_state,2010_fertility_var_01_total,2010_fertility_var_01_has_children,2010_fertility_var_01_children_born,2010_fertility_var_01_children_borned_live,2010_fertility_var_01_children_borned_dead,2010_fertility_var_01_city_state
0,ciclo_3,8998-Santo Antônio de Jesus-BA.pdf,45400,0.012207,0.023555,0.964591,santo antonio de jesus,ba,santo antonio de jesus_ba,41402.135889,25227.975864,78320.116786,74410.117444,3909.999342,santo antonio de jesus_ba
1,ciclo_3,9024-Ulianópolis-PA.pdf,17199,0.015878,0.015645,0.968768,ulianopolis,pa,ulianopolis_pa,16224.474983,8458.866082,26639.657593,25628.505845,1011.151748,ulianopolis_pa
2,ciclo_3,9010-Aldeias Altas-MA.pdf,58870,0.011997,0.015923,0.972471,aldeias altas,ma,aldeias altas_ma,8704.726215,5706.064944,25248.789846,23109.87445,2138.915396,aldeias altas_ma
3,ciclo_3,9034-Paraíba do Sul-RJ.pdf,14129,0.012318,0.020671,0.967224,paraiba do sul,rj,paraiba do sul_rj,18581.962133,11598.990046,31672.203536,30360.627626,1311.57591,paraiba do sul_rj
4,ciclo_3,9045-Governador Celso Ramos-SC.pdf,5201,0.009617,0.017311,0.973456,governador celso ramos,sc,governador celso ramos_sc,5599.220201,3839.637559,11614.094555,11255.596552,358.498003,governador celso ramos_sc


In [341]:
#2000/education
# Display the value currently bound to `path` (the loop variable of the
# `for path in paths` loop keeps its last value after the loop ends).
path

'../ibge_censo/2010/fertility'

In [None]:
# Scratch note - example of a merged column name: 2000_education_var_01_total
# (kept as a comment: an identifier cannot start with a digit, so the bare
# text is a SyntaxError and would stop a Restart & Run All).

In [331]:
# Show the column prefix derived from the current `path` and `var_name`:
# '<3rd path component>_<4th path component>_<var_name>_'.
'{}_{}_{}_'.format(path.split("/")[2], path.split("/")[3], var_name)

'2010_fertility_var_01_'

In [299]:
# Left-join the 'quantity' column of full_temp onto target_feature using the
# shared 'city_state' key, and bind the result to target_feature2
# (target_feature itself is not rebound by this cell).
# NOTE(review): the census loading loop renames every full_temp column with
# add_prefix, so after that loop a plain 'city_state' column no longer exists,
# and 'quantity' is not among the columns shown in the full_temp preview.
# Confirm whether this cell is still needed or should select prefixed names.
target_feature2 = pd.merge(target_feature,
                          full_temp[['city_state', 'quantity']],
                     left_on="city_state",
                     right_on="city_state",
                     how="left")