# Extract Explanatory Features

In [1]:
import os
import shutil
import pandas as pd

### Extract features from the ENEM 2000 scores

In [20]:
# Subset of the ENEM 2000 microdata columns that the cells below rely on.
columns_to_select = [
    # city and state fields, later joined into the `city_state` grouping key
    'NO_MUNICIPIO_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    # flags used to filter the rows after loading
    'TP_PRESENCA',
    'TP_STATUS_REDACAO',
    # the two scores averaged into `enem_score`
    'NU_NOTA_OBJETIVA',
    'NU_NOTA_REDACAO',
]

In [27]:
# Load only the selected columns of the 2000 microdata (';'-separated) and keep
# the rows where TP_PRESENCA is 1 and TP_STATUS_REDACAO is "P".
# NOTE(review): presumably 1 = candidate present and "P" = essay present/valid;
# confirm against the ENEM data dictionary.
df = (
    pd.read_csv(
        '2000/Dados/MICRODADOS_ENEM_2000.csv',
        sep=';',
        usecols=columns_to_select,
    )
    .query('TP_PRESENCA == 1 and TP_STATUS_REDACAO == "P"')
)

In [28]:
# Grouping key "<city>_<state>": NFKD-decompose, drop every non-ASCII byte
# (which removes the combining accent marks), then lowercase.
df['city_state'] = (
    (df['NO_MUNICIPIO_RESIDENCIA'] + '_' + df['SG_UF_RESIDENCIA'])
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.lower()
)

# Per-row score: arithmetic mean of the objective-test and essay scores.
# A missing value in either column yields NaN for that row.
objective_plus_essay = df['NU_NOTA_OBJETIVA'] + df['NU_NOTA_REDACAO']
df['enem_score'] = objective_plus_essay / 2

In [44]:
# One row per `city_state` key with the mean, sample standard deviation and
# median of `enem_score`; the aggregate columns are renamed to flat,
# self-describing names.
df_grouped = (
    df.groupby('city_state')['enem_score']
    .agg(['mean', 'std', 'median'])
    .rename(columns={
        'mean': 'enem_score_mean',
        'std': 'enem_score_std',
        'median': 'enem_score_median',
    })
)

In [47]:
# Persist the per-city aggregates; the `city_state` index is written as the
# first column and ';' is used as the field separator.
df_grouped.to_csv(
    path_or_buf='2000/2000_enem_score_var_01.csv',
    sep=';',
)

### Extract features from the ENEM 2010 scores

In [54]:
# Subset of the ENEM 2010 microdata columns that the cells below rely on.
columns_to_select = [
    # city and state fields, later joined into the `city_state` grouping key
    'NO_MUNICIPIO_RESIDENCIA',
    'SG_UF_RESIDENCIA',
    # per-test flags (CN, CH, LC, MT) used to filter the rows after loading
    'TP_PRESENCA_CN',
    'TP_PRESENCA_CH',
    'TP_PRESENCA_LC',
    'TP_PRESENCA_MT',
    # per-test scores, averaged together with the essay score
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_MT',
    # essay status flag (filter) and essay score
    'TP_STATUS_REDACAO',
    'NU_NOTA_REDACAO',
]

In [55]:
# Load only the selected columns of the 2010 microdata (';'-separated).
#
# The text columns are pinned to `str` so that pandas does not infer their type
# chunk by chunk.
# NOTE(review): the previous run of this cell left a warning on stderr,
# presumably a mixed-type DtypeWarning; which column triggered it is not
# visible here. If the warning persists, extend this mapping or pass
# low_memory=False.
df = pd.read_csv('2010/Dados/MICRODADOS_ENEM_2010.csv',
                 sep=';',
                 usecols=columns_to_select,
                 dtype={'NO_MUNICIPIO_RESIDENCIA': str,
                        'SG_UF_RESIDENCIA': str,
                        'TP_STATUS_REDACAO': str})

# Keep the rows whose four TP_PRESENCA_* flags are all 1 and whose
# TP_STATUS_REDACAO is "P".
# NOTE(review): presumably 1 = present in that test and "P" = essay
# present/valid; confirm against the ENEM data dictionary.
df = df.query(
    'TP_PRESENCA_CN == 1 and TP_PRESENCA_CH == 1 '
    'and TP_PRESENCA_LC == 1 and TP_PRESENCA_MT == 1 '
    'and TP_STATUS_REDACAO == "P"'
)

  interactivity=interactivity, compiler=compiler, result=result)


In [59]:
# Preview the first five filtered rows.
df.head(n=5)

Unnamed: 0,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,TP_STATUS_REDACAO,NU_NOTA_REDACAO
0,MONTES CLAROS,MG,1.0,1.0,1.0,1.0,675.0,787.7,698.0,697.0,P,650.0
1,PALMAS,TO,1.0,1.0,1.0,1.0,504.2,548.2,431.4,415.9,P,350.0
2,MANAUS,AM,1.0,1.0,1.0,1.0,517.4,562.0,563.6,463.1,P,450.0
5,RIO DE JANEIRO,RJ,1.0,1.0,1.0,1.0,643.1,697.6,635.0,868.3,P,750.0
6,PETROPOLIS,RJ,1.0,1.0,1.0,1.0,589.8,606.4,541.9,676.8,P,600.0


In [60]:
# Grouping key "<city>_<state>": NFKD-decompose, drop every non-ASCII byte
# (which removes the combining accent marks), then lowercase.
df['city_state'] = (
    (df['NO_MUNICIPIO_RESIDENCIA'] + '_' + df['SG_UF_RESIDENCIA'])
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.lower()
)

# Per-row score: arithmetic mean of the four test scores and the essay score.
# A missing value in any of the five columns yields NaN for that row.
df['enem_score'] = (
    df['NU_NOTA_CN']
    + df['NU_NOTA_CH']
    + df['NU_NOTA_LC']
    + df['NU_NOTA_MT']
    + df['NU_NOTA_REDACAO']
) / 5

In [61]:
# One row per `city_state` key with the mean, sample standard deviation and
# median of `enem_score`; the aggregate columns are renamed to flat,
# self-describing names.
df_grouped = (
    df.groupby('city_state')['enem_score']
    .agg(['mean', 'std', 'median'])
    .rename(columns={
        'mean': 'enem_score_mean',
        'std': 'enem_score_std',
        'median': 'enem_score_median',
    })
)

In [62]:
# Preview the first five per-city aggregate rows.
df_grouped.head(n=5)

Unnamed: 0_level_0,enem_score_mean,enem_score_std,enem_score_median
city_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abadia de goias_go,495.967921,70.118688,495.9
abadia dos dourados_mg,534.714815,70.384115,537.12
abadiania_go,500.849244,66.057396,503.5
abaete_mg,555.804108,68.085758,556.84
abaetetuba_pa,503.221837,69.38867,499.11


In [63]:
# Persist the per-city aggregates; the `city_state` index is written as the
# first column and ';' is used as the field separator.
df_grouped.to_csv(
    path_or_buf='2010/2010_enem_score_var_01.csv',
    sep=';',
)