## General set-up

In [221]:
import random
import numpy as np
import pandas as pd

In [222]:
# Disable pandas' chained-assignment check for the whole session
# (None = do nothing when an assignment to a possible copy is detected).
pd.options.mode.chained_assignment = None

In [223]:
# Probability with which each data row of the CSV is kept while reading.
p = 0.01

# Dedicated, seeded generator: a fresh kernel (or a re-run of this cell) draws
# exactly the same sample, so the outputs recorded below stay reproducible.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

df = pd.read_csv(
    'MICRODADOS_ENEM_2019.csv',
    header=0,
    encoding="ISO-8859-1",
    # Row 0 holds the header and is never skipped; every other row is skipped
    # whenever the draw exceeds p, i.e. it is kept with probability p.
    skiprows=lambda i: i > 0 and sample_rng.random() > p,
    sep=';'
)

df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,190001004739,2019,1501402,Belém,15,PA,19,F,1,3,...,A,A,A,D,B,A,E,A,A,A
1,190001004814,2019,1500602,Altamira,15,PA,19,F,1,2,...,A,A,A,B,A,A,B,A,A,A
2,190001004838,2019,1506138,Redenção,15,PA,19,F,1,3,...,A,A,A,B,A,A,D,A,B,A
3,190001005017,2019,1501402,Belém,15,PA,20,F,1,2,...,A,A,A,B,A,A,B,A,A,A
4,190001005242,2019,1501402,Belém,15,PA,18,M,1,3,...,B,A,A,B,B,B,C,A,A,B


## Data Wrangling

In [226]:
# Column groups used throughout the wrangling cells; each list has one entry
# per objective area, identified by the CN / CH / LC / MT suffix.
answers = ['TX_RESPOSTAS_CN', 'TX_RESPOSTAS_CH', 'TX_RESPOSTAS_LC', 'TX_RESPOSTAS_MT']
correct_answers = ['TX_GABARITO_CN', 'TX_GABARITO_CH', 'TX_GABARITO_LC', 'TX_GABARITO_MT']
scores = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT']
foreign_language = ['TP_LINGUA']

# Take an explicit copy: the following cells assign to columns of `subset`,
# and an independent frame makes those writes unambiguous instead of being
# assignments on a selection taken from `df`.
subset = df[foreign_language + answers + correct_answers + scores + ['NU_NOTA_REDACAO']].copy()

In [227]:
# Preview the first five rows of the selected columns.
subset.head(n=5)

Unnamed: 0,TP_LINGUA,TX_RESPOSTAS_CN,TX_RESPOSTAS_CH,TX_RESPOSTAS_LC,TX_RESPOSTAS_MT,TX_GABARITO_CN,TX_GABARITO_CH,TX_GABARITO_LC,TX_GABARITO_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
0,0,,EDCEAABACBECBCEEADBBEEDCECEBCDBCDEBCBABACACAA,ADDDB99999DAADCDBEDACAEEDCCABCCACACECBACECABBB...,,,BACCCBABBADCBCEEEBCACACEEDBCCADBEADBADBBBACDB,BDABEABCADBCBAADDBECDAAECDAECBECBCCDEEAAADDBBC...,,,432.6,392.0,,520.0
1,1,,CEABCAEDDBADBDADEECBDEAEBCACDBEACBEDCEDABCEDA,99999DEDEAABDCEABB.CADEEBCDDDACEBDBAEBADADECCA...,,,BACCCBABBADCBCEEEBCACACEEDBCCADBEADBADBBBACDB,BDABEABCADBCBAADDBECDAAECDAECBECBCCDEEAAADDBBC...,,,413.8,380.5,,340.0
2,0,BCEBBCDACDBCEAECEAADBACDDBAEECBBADECEAEDCDCDD,EABCCBCABBDDBCCDEDCACAECEDBCBAEBAEDBAABEDABCB,BDABE99999CACBDEDDBCCBEADDAEBCDEBDCDAEBAADDADB...,DECBABCAAEEBCCAEAECBCDEBABDEAEACABDABBADDBCEB,DADCCEBBCCACBEEBEEBACBCDDDDADBCBBCEAEADEADAAE,BACCCBABBADCBCEEEBCACACEEDBCCADBEADBADBBBACDB,BDABEABCADBCBAADDBECDAAECDAECBECBCCDEEAAADDBBC...,DBEBACABCDBABECEEEDCBDCCEDCDABEDAADDDECACAECB,456.4,576.8,523.1,583.2,860.0
3,1,DDEEDDDCEEBEDCEDAAACBB*EBACDAEAEBBEBBBBABADAC,CEECBADAADBEEADEDCACCADECCECDEADEADCEEDEAAAEB,99999EEACCDBCCECCAEEBBDAACACDCBABBEDEEBBDBEEBD...,AEBBEBCBADEEAEBCEBACACDAEEAAADEECDDBCCECEEDAB,DEADBAAAEBEECEBCBCBCBDADAEABCEDDDDADCBEECACBC,EEBCEEDBADBBCBABCCADCEBACDBBACCACACBEADBBADCB,ADBBEDCABAABBCBCDAAECDDDBAAAECADECDCEBDEEAECBD...,AADDDBEEEBEDDBEBACABCDBABECECACAECDCBDCCEDCDA,373.9,507.8,516.4,462.1,740.0
4,1,AABCBEDBBCCADACABBBACDBDDCADDBCABAEAEAADDDAAE,BEECCEABADBDABABEAACCDBCEDECABBAEBDDCBEACBDBE,99999ECABADBDCECDAAECCDBBBAAECAEDCACCEDEBAECDE...,DEBBACABDDADBDCECACBCBECCECECEEBADDBBACDEADCB,DADCCEBBCCACBEEBEEBACBCDDDDADBCBBCEAEADEADAAE,EEBCEEDBADBBCBABCCADCEBACDBBACCACACBEADBBADCB,ADBBEDCABAABBCBCDAAECDDDBAAAECADECDCEBDEEAECBD...,DBEBACABCDBABECEEEDCBDCCEDCDABEDAADDDECACAECB,579.3,549.2,598.8,623.3,840.0


In [228]:
# Keep only rows in which every selected column has a value.
subset = subset.dropna(axis=0, how='any')

In [229]:
# The LC answer strings previewed above contain a run of '9' characters in
# either the first or the second group of five positions (presumably the
# foreign-language questions the candidate did not take - confirm against the
# data dictionary). Remove every '9' so only real answers remain.
lc_responses = subset['TX_RESPOSTAS_LC']
subset['TX_RESPOSTAS_LC'] = lc_responses.str.replace('9', '', regex=False)

In [230]:
# Rows with TP_LINGUA == 0: keep key positions 0-4 and everything from
# position 10 on, i.e. cut positions 5-9 out of the LC answer key.
lingua_0 = subset['TP_LINGUA'] == 0
key_lingua_0 = subset.loc[lingua_0, 'TX_GABARITO_LC']
subset.loc[lingua_0, 'TX_GABARITO_LC'] = key_lingua_0.str[:5] + key_lingua_0.str[10:]

In [231]:
# Rows with TP_LINGUA == 1: drop the first five positions of the LC answer key
# and keep everything from position 5 on.
lingua_1 = subset['TP_LINGUA'] == 1
key_lingua_1 = subset.loc[lingua_1, 'TX_GABARITO_LC']
subset.loc[lingua_1, 'TX_GABARITO_LC'] = key_lingua_1.str[5:]

In [232]:
def count_hits(responses, answer_key):
    """Count the positions at which ``responses`` matches ``answer_key``.

    Parameters
    ----------
    responses : str
        One character per question, as marked by the candidate.
    answer_key : str
        One character per question, the expected answer.

    Returns
    -------
    int
        Number of positions holding the same character in both strings.
        Only positions present in both strings are compared, so strings of
        different lengths are compared over their common prefix.
    """
    return sum(given == expected for given, expected in zip(responses, answer_key))


# One ACERTOS_<area> column per objective area, built from the matching
# TX_RESPOSTAS_<area> / TX_GABARITO_<area> pair. A single loop replaces four
# copies of the same expression, and iterating over the two columns directly
# avoids a row-wise DataFrame.apply.
for area in ('CN', 'CH', 'LC', 'MT'):
    subset['ACERTOS_' + area] = [
        count_hits(responses, answer_key)
        for responses, answer_key in zip(
            subset['TX_RESPOSTAS_' + area], subset['TX_GABARITO_' + area]
        )
    ]

In [233]:
# Row-wise total of the four objective-area scores listed in `scores`.
objective_scores = subset[scores]
subset['SOMA_NOTAS_OBJ'] = objective_scores.sum(axis='columns')

In [234]:
# Remove the language flag, the raw answer strings and the answer keys,
# leaving only the numeric columns.
raw_columns = foreign_language + answers + correct_answers
subset = subset.drop(labels=raw_columns, axis='columns')

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO,ACERTOS_CN,ACERTOS_CH,ACERTOS_LC,ACERTOS_MT,SOMA_NOTAS_OBJ
2,456.4,576.8,523.1,583.2,860.0,12,25,19,10,2139.5
3,373.9,507.8,516.4,462.1,740.0,9,15,15,9,1860.2
4,579.3,549.2,598.8,623.3,840.0,22,17,28,19,2350.6
5,484.0,506.2,503.6,610.1,580.0,16,13,16,15,2103.9
6,409.5,445.9,432.2,439.3,640.0,10,10,11,5,1726.9


In [218]:
# (rows, columns) of the wrangled frame.
# NOTE(review): the recorded execution count of this cell (218) is lower than
# that of the cells before it, and the recorded output reports 8 columns while
# the frame displayed just above has 10 - re-run the notebook top to bottom to
# refresh this output.
subset.shape

(36982, 8)

In [238]:
# Names of the per-area hit-count columns.
hits = [
    'ACERTOS_CN',
    'ACERTOS_CH',
    'ACERTOS_LC',
    'ACERTOS_MT',
]

# Final column order: scores, hit counts, objective total, then the essay score.
column_order = [*scores, *hits, 'SOMA_NOTAS_OBJ', 'NU_NOTA_REDACAO']
subset = subset[column_order]
subset.head()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,ACERTOS_CN,ACERTOS_CH,ACERTOS_LC,ACERTOS_MT,SOMA_NOTAS_OBJ,NU_NOTA_REDACAO
2,456.4,576.8,523.1,583.2,12,25,19,10,2139.5,860.0
3,373.9,507.8,516.4,462.1,9,15,15,9,1860.2,740.0
4,579.3,549.2,598.8,623.3,22,17,28,19,2350.6,840.0
5,484.0,506.2,503.6,610.1,16,13,16,15,2103.9,580.0
6,409.5,445.9,432.2,439.3,10,10,11,5,1726.9,640.0


## Machine Learning