# Extract Explanatory Features

In [64]:
import os
import shutil
import pandas as pd

### Extract features from ENEM 2000

In [65]:
# Columns to load from the 2000 microdata file (passed to `usecols`).
columns_to_select = [
    # Combined later into the `city_state` grouping key.
    'NO_MUNICIPIO_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    # Used to filter rows right after loading.
    'TP_PRESENCA',
    'TP_STATUS_REDACAO',
    # Averaged later into `enem_score`.
    'NU_NOTA_OBJETIVA',
    'NU_NOTA_REDACAO',
]

In [66]:
# Load only the selected columns of the 2000 microdata (semicolon-separated)
# and keep the rows where TP_PRESENCA is 1 and TP_STATUS_REDACAO is "P".
df = pd.read_csv(
    '2000/Dados/MICRODADOS_ENEM_2000.csv', sep=';', usecols=columns_to_select
).query('TP_PRESENCA == 1 and TP_STATUS_REDACAO == "P"')

In [67]:
# Build the grouping key "<municipality>_<state>", strip it down to plain
# ASCII (NFKD decomposition, then non-ASCII bytes are dropped) and lowercase it.
city_state_raw = df['NO_MUNICIPIO_RESIDENCIA'] + '_' + df['SG_UF_RESIDENCIA']
city_state_ascii = (
    city_state_raw.str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
df['city_state'] = city_state_ascii.str.lower()

# The score is the plain average of the two grade columns; a missing value in
# either column propagates to the result.
objective_score = df['NU_NOTA_OBJETIVA']
essay_score = df['NU_NOTA_REDACAO']
df['enem_score'] = (objective_score + essay_score) / 2

In [68]:
# Mean, standard deviation and median of `enem_score` for every `city_state`,
# stored under flat, prefixed column names.
score_by_city = df.groupby('city_state')['enem_score']
df_grouped = score_by_city.agg(['mean', 'std', 'median']).rename(
    columns={
        'mean': 'enem_score_mean',
        'std': 'enem_score_std',
        'median': 'enem_score_median',
    }
)

In [69]:
# Write the per-city statistics as a semicolon-separated file; the
# `city_state` index is written as the first column.
output_path_2000 = '2000/2000_enem_score_var_01.csv'
df_grouped.to_csv(output_path_2000, sep=';')

### Extract features from ENEM 2010

In [70]:
# Columns to load from the 2010 microdata file (passed to `usecols`).
columns_to_select = [
    # Combined later into the `city_state` grouping key.
    'NO_MUNICIPIO_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    # Presence flags, one per test; all four must equal 1 to keep a row.
    'TP_PRESENCA_CN',
    'TP_PRESENCA_CH',
    'TP_PRESENCA_LC',
    'TP_PRESENCA_MT',
    # Per-test grades, averaged later into `enem_score`.
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_MT',
    # Essay status (used as a row filter) and essay grade (part of the average).
    'TP_STATUS_REDACAO',
    'NU_NOTA_REDACAO',
]

In [71]:
# Load only the selected columns of the 2010 microdata (semicolon-separated).
#
# NOTE(review): the saved output of this cell ends with the tail of a warning
# that looks like pandas' DtypeWarning ("Columns ... have mixed types"), which
# is emitted when chunked parsing infers different dtypes for one column.
# `low_memory=False` makes pandas infer each column's dtype from the whole
# file, so every column gets a single consistent dtype.
# TODO confirm which column was mixed and, ideally, pin it with `dtype=`; if a
# filter column turns out to be text, the numeric comparisons below need to be
# revisited.
df = pd.read_csv('2010/Dados/MICRODADOS_ENEM_2010.csv',
                 sep=';',
                 usecols=columns_to_select,
                 low_memory=False)

# Keep only rows where all four presence flags equal 1 and the essay status
# is "P". The string is split across lines only for readability.
presence_filter = (
    'TP_PRESENCA_CN == 1 and TP_PRESENCA_CH == 1 and '
    'TP_PRESENCA_LC == 1 and TP_PRESENCA_MT == 1 and '
    'TP_STATUS_REDACAO == "P"'
)
df = df.query(presence_filter)

  interactivity=interactivity, compiler=compiler, result=result)


In [73]:
# Build the grouping key "<municipality>_<state>", strip it down to plain
# ASCII (NFKD decomposition, then non-ASCII bytes are dropped) and lowercase it.
location_key = df['NO_MUNICIPIO_RESIDENCIA'] + '_' + df['SG_UF_RESIDENCIA']
location_key = location_key.str.normalize('NFKD')
location_key = location_key.str.encode('ascii', errors='ignore')
location_key = location_key.str.decode('utf-8')
df['city_state'] = location_key.str.lower()

# The score is the plain average of the five grade columns, summed left to
# right; a missing value in any of them propagates to the result.
score_columns = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']
score_total = df[score_columns[0]]
for score_column in score_columns[1:]:
    score_total = score_total + df[score_column]
df['enem_score'] = score_total / len(score_columns)

In [74]:
# Mean, standard deviation and median of `enem_score` for every `city_state`.
scores_per_city = df.groupby('city_state')['enem_score']
df_grouped = scores_per_city.agg(['mean', 'std', 'median'])

# Replace the bare statistic names with prefixed column names.
df_grouped.columns = [
    'enem_score_mean',
    'enem_score_std',
    'enem_score_median',
]

In [76]:
# Write the per-city statistics as a semicolon-separated file; the
# `city_state` index is written as the first column.
df_grouped.to_csv(
    path_or_buf='2010/2010_enem_score_var_01.csv',
    sep=';',
)