# Data Processing (raw dataset)

In [1]:
import pandas as pd
import os

### Read and handle target feature

In [2]:
# Load the target-feature table; the file is semicolon-delimited.
target_feature = pd.read_csv('../target_feature/01_target_feature.csv', sep=';')

In [3]:
# Preview the first rows to sanity-check the columns that were parsed.
target_feature.head()

Unnamed: 0,folder,file_name,number_of_words,pct_pol_neg,pct_pol_pos,pct_pol_neu,pct_pol_missing
0,ciclo_3,8998-Santo Antônio de Jesus-BA.pdf,45543,0.015063,0.032302,0.006421,0.947666
1,ciclo_3,9024-Ulianópolis-PA.pdf,17432,0.018945,0.02216,0.003846,0.955795
2,ciclo_3,9010-Aldeias Altas-MA.pdf,59605,0.022763,0.02414,0.012969,0.941438
3,ciclo_3,9034-Paraíba do Sul-RJ.pdf,15486,0.014342,0.029007,0.004393,0.95271
4,ciclo_3,9045-Governador Celso Ramos-SC.pdf,5177,0.011985,0.02513,0.00348,0.960178


In [4]:
# Derive normalised join keys from report file names such as
# '8998-Santo Antônio de Jesus-BA.pdf':
#   1. digits, hyphens and the literal '.pdf' extension become spaces
#      (the dot is escaped so it cannot match an arbitrary character followed by 'pdf');
#   2. accents are stripped (NFKD decomposition, then non-ASCII bytes are dropped);
#   3. the text is lower-cased and surrounding whitespace removed.
# The result looks like 'santo antonio de jesus ba'.
target_feature['temp'] = (
    target_feature['file_name']
    .str.replace(r'[0-9]|\.pdf|-', ' ', regex=True)
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.lower()
    .str.strip()
)

# The last two characters are the state acronym; everything before the
# separating space is the city name.
target_feature['city'] = target_feature['temp'].str[:-3]
target_feature['state'] = target_feature['temp'].str[-2:]
# 'city_state' is the key used for the merges performed later in the notebook.
target_feature['city_state'] = target_feature['city'].map(str) + '_' + target_feature['state']

target_feature = target_feature.drop("temp", axis=1)
target_feature.head()

Unnamed: 0,folder,file_name,number_of_words,pct_pol_neg,pct_pol_pos,pct_pol_neu,pct_pol_missing,city,state,city_state
0,ciclo_3,8998-Santo Antônio de Jesus-BA.pdf,45543,0.015063,0.032302,0.006421,0.947666,santo antonio de jesus,ba,santo antonio de jesus_ba
1,ciclo_3,9024-Ulianópolis-PA.pdf,17432,0.018945,0.02216,0.003846,0.955795,ulianopolis,pa,ulianopolis_pa
2,ciclo_3,9010-Aldeias Altas-MA.pdf,59605,0.022763,0.02414,0.012969,0.941438,aldeias altas,ma,aldeias altas_ma
3,ciclo_3,9034-Paraíba do Sul-RJ.pdf,15486,0.014342,0.029007,0.004393,0.95271,paraiba do sul,rj,paraiba do sul_rj
4,ciclo_3,9045-Governador Celso Ramos-SC.pdf,5177,0.011985,0.02513,0.00348,0.960178,governador celso ramos,sc,governador celso ramos_sc


### Read explanatory features: education, family, fertility and work (2000 and 2010)

In [5]:
# Start the raw dataset from an independent copy so that later in-place edits to
# `raw_dataset` (e.g. `.loc[...] = ...` assignments) can never leak back into
# `target_feature`, and so that re-running this cell always yields a clean start.
raw_dataset = target_feature.copy()

In [6]:
# (full state name, two-letter acronym) pairs. The full names are compared against
# the per-state directory names; the acronyms are used as the suffix of 'city_state'.
_state_pairs = [
    ('acre', 'ac'),
    ('alagoas', 'al'),
    ('amapa', 'ap'),
    ('amazonas', 'am'),
    ('bahia', 'ba'),
    ('ceara', 'ce'),
    ('distrito_federal', 'df'),
    ('espirito_santo', 'es'),
    ('goias', 'go'),
    ('maranhao', 'ma'),
    ('mato_grosso', 'mt'),
    ('mato_grosso_do_sul', 'ms'),
    ('minas_gerais', 'mg'),
    ('para', 'pa'),
    ('paraiba', 'pb'),
    ('parana', 'pr'),
    ('pernambuco', 'pe'),
    ('piaui', 'pi'),
    ('rio_de_janeiro', 'rj'),
    ('rio_grande_do_norte', 'rn'),
    ('rio_grande_do_sul', 'rs'),
    ('rondonia', 'ro'),
    ('roraima', 'rr'),
    ('santa_catarina', 'sc'),
    ('sao_paulo', 'sp'),
    ('sergipe', 'se'),
    ('tocantins', 'to'),
]
state_name_to_acronym = pd.DataFrame(_state_pairs,
                                     columns=['full_state_name', 'acronym'])

# File-name suffixes ('<...>var_01.csv', ...) searched for inside each state directory.
var_list = ['var_01', 'var_02', 'var_03']

In [7]:
# Census directories, grouped by census year. Each path has the shape
# '../ibge_censo/<year>/<topic>'.
census_2000_dirs = [
    '../ibge_censo/2000/education',
    '../ibge_censo/2000/family',
    '../ibge_censo/2000/fertility',
    '../ibge_censo/2000/work',
]
census_2010_dirs = [
    '../ibge_censo/2010/education',
    '../ibge_censo/2010/family',
    '../ibge_censo/2010/fertility',
    '../ibge_censo/2010/work',
]
paths = census_2000_dirs + census_2010_dirs

In [8]:
# For every census directory and every variable suffix, gather the per-state CSV
# files, tag each row with a 'city_state' key, and left-join the combined table
# onto `raw_dataset`.
for path in paths:
    # '../ibge_censo/2000/education' -> ['..', 'ibge_censo', '2000', 'education'];
    # elements 2 and 3 (year and topic) form the column prefix.
    path_parts = path.split("/")

    for var_name in var_list:
        column_prefix = path_parts[2] + '_' + path_parts[3] + '_' + var_name + '_'
        column_to_join = column_prefix + 'city_state'

        # Collect the per-state frames and concatenate once, instead of growing a
        # DataFrame inside the loop (quadratic copying, and concatenating onto an
        # empty frame is deprecated in recent pandas).
        state_frames = []

        # sorted(): os.listdir returns entries in arbitrary order; sorting makes
        # the row order of the combined table reproducible across machines.
        for state in sorted(os.listdir(path)):
            if state.startswith('.'):
                continue  # skip hidden entries such as '.DS_Store'

            acronym_matches = state_name_to_acronym.loc[
                state_name_to_acronym.full_state_name == state, 'acronym']
            if acronym_matches.empty:
                raise ValueError(
                    'Unknown state directory %r under %r' % (state, path))
            state_acronym = acronym_matches.values[0]

            state_dir = os.path.join(path, state)
            for filename in sorted(os.listdir(state_dir)):
                if filename.startswith('.') or not filename.endswith(var_name + '.csv'):
                    continue

                temp = pd.read_csv(os.path.join(state_dir, filename))
                temp['city_state'] = temp['city'].map(str) + '_' + state_acronym
                state_frames.append(temp)

        if not state_frames:
            continue  # this topic has no file for this variable

        full_temp = pd.concat(state_frames)
        if full_temp.shape[0] == 0:
            continue

        full_temp = full_temp.add_prefix(column_prefix)

        # iloc[:, 1:] drops the first column of the combined table before merging.
        # NOTE(review): presumably that is the raw 'city' column, which is redundant
        # with 'city_state' — confirm against the CSV layout.
        raw_dataset = pd.merge(raw_dataset,
                               full_temp.iloc[:, 1:],
                               left_on="city_state",
                               right_on=column_to_join,
                               how="left")

        # The prefixed key column duplicates 'city_state' after the join.
        raw_dataset = raw_dataset.drop(column_to_join, axis=1)

        print(path + ' [' + var_name + '] ')
        

../ibge_censo/2000/education [var_01] 
../ibge_censo/2000/family [var_01] 
../ibge_censo/2000/family [var_02] 
../ibge_censo/2000/fertility [var_01] 
../ibge_censo/2000/fertility [var_02] 
../ibge_censo/2000/fertility [var_03] 
../ibge_censo/2000/work [var_01] 
../ibge_censo/2000/work [var_02] 
../ibge_censo/2010/education [var_01] 
../ibge_censo/2010/family [var_01] 
../ibge_censo/2010/family [var_02] 
../ibge_censo/2010/fertility [var_01] 
../ibge_censo/2010/fertility [var_02] 
../ibge_censo/2010/fertility [var_03] 
../ibge_censo/2010/work [var_01] 
../ibge_censo/2010/work [var_02] 


### Read explanatory feature: social indicator (different pattern: column prefix omits the census year)

In [9]:
# Rebinds `paths`: from here on it holds only the 2010 social-indicator directory.
paths = ['../ibge_censo/2010/social_indicator']

### Change the city name for one report, because the city is known by two different names

In [10]:
# Manual override of the join key for a single report: the key derived from this
# file name is 'sao valerio da natividade_to'; it is replaced by 'sao valerio_to'.
# NOTE(review): presumably the later data sources list the city under the shorter
# name — confirm. In cell order this runs after In [8], so the override does not
# affect the merges performed there.
raw_dataset.loc[raw_dataset.file_name=='3238-São Valério da Natividade-TO.pdf', 'city_state'] = 'sao valerio_to'

In [11]:
# For every directory in `paths` and every variable suffix, gather the per-state
# CSV files, tag each row with a 'city_state' key, and left-join the combined table
# onto `raw_dataset`. Here the column prefix is built from the topic only
# (element 3 of the split path), without the year.
for path in paths:
    # '../ibge_censo/2010/social_indicator' -> element 3 is 'social_indicator'.
    topic = path.split("/")[3]

    for var_name in var_list:
        column_prefix = topic + '_' + var_name + '_'
        column_to_join = column_prefix + 'city_state'

        # Collect the per-state frames and concatenate once, instead of growing a
        # DataFrame inside the loop (quadratic copying, and concatenating onto an
        # empty frame is deprecated in recent pandas).
        state_frames = []

        # sorted(): os.listdir returns entries in arbitrary order; sorting makes
        # the row order of the combined table reproducible across machines.
        for state in sorted(os.listdir(path)):
            if state.startswith('.'):
                continue  # skip hidden entries such as '.DS_Store'

            acronym_matches = state_name_to_acronym.loc[
                state_name_to_acronym.full_state_name == state, 'acronym']
            if acronym_matches.empty:
                raise ValueError(
                    'Unknown state directory %r under %r' % (state, path))
            state_acronym = acronym_matches.values[0]

            state_dir = os.path.join(path, state)
            for filename in sorted(os.listdir(state_dir)):
                if filename.startswith('.') or not filename.endswith(var_name + '.csv'):
                    continue

                temp = pd.read_csv(os.path.join(state_dir, filename))
                temp['city_state'] = temp['city'].map(str) + '_' + state_acronym
                state_frames.append(temp)

        if not state_frames:
            continue  # no file for this variable

        full_temp = pd.concat(state_frames)
        if full_temp.shape[0] == 0:
            continue

        full_temp = full_temp.add_prefix(column_prefix)

        # iloc[:, 1:] drops the first column of the combined table before merging.
        # NOTE(review): presumably that is the raw 'city' column, which is redundant
        # with 'city_state' — confirm against the CSV layout.
        raw_dataset = pd.merge(raw_dataset,
                               full_temp.iloc[:, 1:],
                               left_on="city_state",
                               right_on=column_to_join,
                               how="left")

        # The prefixed key column duplicates 'city_state' after the join.
        raw_dataset = raw_dataset.drop(column_to_join, axis=1)

        print(path + ' [' + var_name + '] ')

../ibge_censo/2010/social_indicator [var_01] 
../ibge_censo/2010/social_indicator [var_02] 
../ibge_censo/2010/social_indicator [var_03] 


### Read explanatory feature: enem score (different pattern: one semicolon-separated CSV file per year)

In [12]:
# Rebinds `paths` again: here each entry is a CSV file (one per year), not a directory.
paths = ['../enem/2000/2000_enem_score_var_01.csv',
         '../enem/2010/2010_enem_score_var_01.csv']

In [13]:
# Left-join each year's ENEM score file onto `raw_dataset` using 'city_state'.
for path in paths:
    # '../enem/2000/...' -> element 2 of the split path is the year folder.
    year = path.split("/")[2]
    column_to_join = year + '_enem_var_01_city_state'

    enem_scores = pd.read_csv(path, sep=';')
    # NOTE(review): the first data row is discarded here; the reason is not visible
    # in this notebook — TODO confirm it is intentional.
    enem_scores = enem_scores.iloc[1:, :].add_prefix(year + '_enem_var_01_')

    merged = pd.merge(raw_dataset,
                      enem_scores,
                      left_on="city_state",
                      right_on=column_to_join,
                      how="left")

    # The prefixed key column duplicates 'city_state' after the join.
    raw_dataset = merged.drop(column_to_join, axis=1)

# Fill missing values with 0 in the columns at positions 88..94.
# NOTE(review): positional slice — which features these are depends on the exact
# column count produced by the merges above; verify before changing any input.
raw_dataset.iloc[:, 88:95] = raw_dataset.iloc[:, 88:95].fillna(0)

In [14]:
# List every column of the assembled dataset, one name per line.
for column_name in raw_dataset.columns.tolist():
    print(column_name)

folder
file_name
number_of_words
pct_pol_neg
pct_pol_pos
pct_pol_neu
pct_pol_missing
city
state
city_state
2000_education_var_01_quantity
2000_family_var_01_total
2000_family_var_01_suitable
2000_family_var_01_semi_suitable
2000_family_var_01_inappropriate
2000_family_var_02_quantity
2000_fertility_var_01_total
2000_fertility_var_01_has_children
2000_fertility_var_01_children_born
2000_fertility_var_01_children_borned_live
2000_fertility_var_01_children_borned_dead
2000_fertility_var_02_total
2000_fertility_var_02_married
2000_fertility_var_02_separated
2000_fertility_var_02_divorced
2000_fertility_var_02_widow
2000_fertility_var_02_single
2000_fertility_var_03_total
2000_work_var_01_total
2000_work_var_01_domestic_regular
2000_work_var_01_domestic_irregular
2000_work_var_01_other_regular
2000_work_var_01_military_and_gov
2000_work_var_01_other_irregular
2000_work_var_02_total
2000_work_var_02_regular
2000_work_var_02_military_and_gov
2000_work_var_02_irregular
2000_work_var_02_employe

In [15]:
# Persist the assembled dataset as a semicolon-delimited CSV, without the index column.
raw_dataset.to_csv('02_01_raw_dataset.csv', sep=';', index=False)