### Create datasets for training and validation

In [382]:
import pandas as pd
import re
import numpy as np
import os

### Read target feature

In [383]:
# Load the target-feature table produced by the previous step; the file is
# semicolon-delimited.
target_feature_csv = '../target_feature/01_create_target_feature.csv'
target_feature = pd.read_csv(target_feature_csv, sep=';')

In [384]:
# Derive `city`, `state` and the `city_state` join key from file names shaped
# like "8998-Santo Antônio de Jesus-BA.pdf".
normalized_name = (
    target_feature['filename']
    # Blank out digits, hyphens and the literal ".pdf" extension. The dot is
    # escaped so it matches only a real "." instead of any character that
    # happens to precede "pdf".
    .str.replace(r'[0-9]|\.pdf|-', ' ', regex=True)
    # Strip accents: decompose characters, then drop the non-ASCII marks.
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.lower()
    .str.strip()
)

# After stripping, the text ends in " <state>": the last two characters are
# the state acronym, and cutting three characters also removes the space that
# separates it from the city name.
target_feature['city'] = normalized_name.str[:-3]
target_feature['state'] = normalized_name.str[-2:]
target_feature['city_state'] = target_feature['city'].map(str) + '_' + target_feature['state']

target_feature.head()

Unnamed: 0,folder,filename,number_of_words,pct_pol_neg,pct_pol_pos,pct_pol_neu,city,state,city_state
0,ciclo_3,8998-Santo Antônio de Jesus-BA.pdf,45400,0.012207,0.023555,0.964591,santo antonio de jesus,ba,santo antonio de jesus_ba
1,ciclo_3,9024-Ulianópolis-PA.pdf,17199,0.015878,0.015645,0.968768,ulianopolis,pa,ulianopolis_pa
2,ciclo_3,9010-Aldeias Altas-MA.pdf,58870,0.011997,0.015923,0.972471,aldeias altas,ma,aldeias altas_ma
3,ciclo_3,9034-Paraíba do Sul-RJ.pdf,14129,0.012318,0.020671,0.967224,paraiba do sul,rj,paraiba do sul_rj
4,ciclo_3,9045-Governador Celso Ramos-SC.pdf,5201,0.009617,0.017311,0.973456,governador celso ramos,sc,governador celso ramos_sc


### Read explanatory features

In [402]:
# Start the combined dataset from the target-feature table. This binds a second
# name to the same DataFrame object; no copy is made here.
full_dataset = target_feature

In [403]:
# Census folders scanned by the merge loop. Each one is expected to contain one
# sub-folder per state, which in turn holds the per-variable CSV files.
paths = [
    '../ibge_censo/2000/education',
    '../ibge_censo/2000/family',
    '../ibge_censo/2000/fertility',
    '../ibge_censo/2000/work',
    '../ibge_censo/2010/education',
    '../ibge_censo/2010/family',
    '../ibge_censo/2010/fertility',
    #'../ibge_censo/2010/social_indicator',
    '../ibge_censo/2010/work',
]

# State folder name -> two-letter state acronym, used to build the
# `<city>_<acronym>` join key. Every acronym must be unique: the previous table
# mapped mato_grosso to 'mg' (clashing with minas_gerais) and rondonia to 'rn'
# (clashing with rio_grande_do_norte), which produced wrong join keys for the
# cities of those two states.
state_acronym_by_name = {
    'acre': 'ac',
    'alagoas': 'al',
    'amapa': 'ap',
    'amazonas': 'am',
    'bahia': 'ba',
    'ceara': 'ce',
    'distrito_federal': 'df',
    'espirito_santo': 'es',
    'goias': 'go',
    'maranhao': 'ma',
    'mato_grosso': 'mt',
    'mato_grosso_do_sul': 'ms',
    'minas_gerais': 'mg',
    'para': 'pa',
    'paraiba': 'pb',
    'parana': 'pr',
    'pernambuco': 'pe',
    'piaui': 'pi',
    'rio_de_janeiro': 'rj',
    'rio_grande_do_norte': 'rn',
    'rio_grande_do_sul': 'rs',
    'rondonia': 'ro',
    'roraima': 'rr',
    'santa_catarina': 'sc',
    'sao_paulo': 'sp',
    'sergipe': 'se',
    'tocantins': 'to',
}

# Kept as a two-column DataFrame because the merge loop looks states up through
# the `full_state_name` / `acronym` columns.
state_name_to_acronym = pd.DataFrame({
    'full_state_name': list(state_acronym_by_name.keys()),
    'acronym': list(state_acronym_by_name.values()),
})

# File-name suffixes (`*<var>.csv`) of the variables to load from every folder.
var_list = ['var_01',
            'var_02',
            'var_03']

In [404]:
# Left-join every census variable onto `full_dataset`, one (variable, folder)
# pair at a time, using `city_state` as the key.
#
# NOTE: this cell rebinds `full_dataset` on every merge. Re-running it without
# first resetting `full_dataset` merges the same columns a second time.

# Build the state lookup once instead of filtering the DataFrame per folder.
acronym_by_state = dict(zip(state_name_to_acronym['full_state_name'],
                            state_name_to_acronym['acronym']))

for var_name in var_list:

    for path in paths:
        print(var_name + ' --> ' + path)

        # '../ibge_censo/<year>/<theme>' -> '<year>_<theme>_<var>_' column prefix.
        path_parts = path.split("/")
        column_prefix = path_parts[2] + '_' + path_parts[3] + '_' + var_name + '_'
        column_to_join = column_prefix + 'city_state'

        # Collect the per-state frames and concatenate once; concatenating
        # inside the loop re-copies the accumulated rows on every file.
        state_frames = []

        for state in os.listdir(path):
            state_dir = os.path.join(path, state)

            # Skip hidden entries and stray files; only state folders hold data.
            if state.startswith('.') or not os.path.isdir(state_dir):
                continue

            if state not in acronym_by_state:
                raise KeyError("No acronym registered for state folder '" + state
                               + "' found in " + path)
            state_acronym = acronym_by_state[state]

            for filename in os.listdir(state_dir):
                if not filename.startswith('.') and filename.endswith(var_name + '.csv'):

                    temp = pd.read_csv(os.path.join(state_dir, filename))
                    temp['city_state'] = temp['city'].map(str) + '_' + state_acronym

                    state_frames.append(temp)

        # No matching file for this (variable, folder) pair.
        if not state_frames:
            continue

        full_temp = pd.concat(state_frames)

        # Matching files existed but carried no rows: nothing to merge.
        if full_temp.shape[0] == 0:
            continue

        full_temp = full_temp.add_prefix(column_prefix)

        # `.iloc[:, 1:]` drops the first column by position.
        # NOTE(review): presumably that is the prefixed `city` column, since no
        # `*_city` columns appear in the merged result -- confirm every CSV
        # lists `city` first.
        full_dataset = pd.merge(full_dataset,
                                full_temp.iloc[:, 1:],
                                left_on="city_state",
                                right_on=column_to_join,
                                how="left")

        # The prefixed key duplicates `city_state`; drop it after the join.
        full_dataset = full_dataset.drop(column_to_join, axis=1)
        

var_01 --> ../ibge_censo/2000/education
var_01 --> ../ibge_censo/2000/family
var_01 --> ../ibge_censo/2000/fertility
var_01 --> ../ibge_censo/2000/work
var_01 --> ../ibge_censo/2010/education
var_01 --> ../ibge_censo/2010/family
var_01 --> ../ibge_censo/2010/fertility
var_01 --> ../ibge_censo/2010/work
var_02 --> ../ibge_censo/2000/education
var_02 --> ../ibge_censo/2000/family
var_02 --> ../ibge_censo/2000/fertility
var_02 --> ../ibge_censo/2000/work
var_02 --> ../ibge_censo/2010/education
var_02 --> ../ibge_censo/2010/family
var_02 --> ../ibge_censo/2010/fertility
var_02 --> ../ibge_censo/2010/work
var_03 --> ../ibge_censo/2000/education
var_03 --> ../ibge_censo/2000/family
var_03 --> ../ibge_censo/2000/fertility
var_03 --> ../ibge_censo/2000/work
var_03 --> ../ibge_censo/2010/education
var_03 --> ../ibge_censo/2010/family
var_03 --> ../ibge_censo/2010/fertility
var_03 --> ../ibge_censo/2010/work


#### Read social indicator features (these do not follow the same file pattern as the folders above)

In [401]:
# Show every column label of the merged dataset, one per output line.
# Iterating a DataFrame directly yields its column labels.
for column_name in full_dataset:
    print(column_name)

folder
filename
number_of_words
pct_pol_neg
pct_pol_pos
pct_pol_neu
city
state
city_state
2000_education_var_01_quantity
2000_family_var_01_total
2000_family_var_01_adequada
2000_family_var_01_semi_adequada
2000_family_var_01_inadequada
2000_fertility_var_01_total
2000_fertility_var_01_has_children
2000_fertility_var_01_children_born
2000_fertility_var_01_children_borned_live
2000_fertility_var_01_children_borned_dead
2000_work_var_01_total
2000_work_var_01_domestic_regular
2000_work_var_01_domestic_irregular
2000_work_var_01_other_regular
2000_work_var_01_military_and_gov
2000_work_var_01_other_irregular
2010_education_var_01_quantity
2010_family_var_01_total
2010_family_var_01_adequada
2010_family_var_01_semi_adequada
2010_family_var_01_inadequada
2010_fertility_var_01_total
2010_fertility_var_01_has_children
2010_fertility_var_01_children_born
2010_fertility_var_01_children_borned_live
2010_fertility_var_01_children_borned_dead
2010_work_var_01_total
2010_work_var_01_main_regular
20

In [405]:
# Persist the merged dataset as a semicolon-delimited CSV. `sep` is passed by
# keyword: recent pandas releases deprecate positional arguments to `to_csv`
# other than the output path.
full_dataset.to_csv('02_data_processing_01_raw_dataset.csv', sep=';')