# Prever nota de matemática do ENEM 2016

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ppscore as pps
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [2]:
df_train = pd.read_csv('train.csv')
df_train.shape

(13730, 167)

In [3]:
df_test = pd.read_csv('test.csv')
df_test.shape

(4576, 47)

O dataset de teste possui menos colunas que o de treino. Desse modo, as colunas a mais serão descartadas e o target ('NU_NOTA_MT') mantido.

In [4]:
col = list(df_test.columns)
col.append('NU_NOTA_MT')

df_train = df_train[col]
df_train.head()

Unnamed: 0,NU_INSCRICAO,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,...,NU_NOTA_REDACAO,Q001,Q002,Q006,Q024,Q025,Q026,Q027,Q047,NU_NOTA_MT
0,ed50e8aaa58e7a806c337585efee9ca41f1eb1ad,43,RS,24,M,1,1,1,4,1,...,520.0,D,D,C,A,A,C,H,A,399.4
1,2c3acac4b33ec2b195d77e7c04a2d75727fad723,23,CE,17,F,3,1,2,0,2,...,580.0,A,A,B,A,A,A,,A,459.8
2,f4545f8ccb9ff5c8aad7d32951b3f251a26e6568,23,CE,21,F,3,1,3,0,1,...,,D,D,C,A,A,A,,A,
3,3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe,33,RJ,25,F,0,1,1,9,1,...,,H,E,E,C,B,C,F,D,
4,bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268,13,AM,28,M,2,1,1,4,1,...,,E,D,C,A,A,B,F,A,


## Análise do dataset e seleção de parâmetros

In [5]:
df_train.isna().sum()/len(df_train)

NU_INSCRICAO              0.000000
CO_UF_RESIDENCIA          0.000000
SG_UF_RESIDENCIA          0.000000
NU_IDADE                  0.000000
TP_SEXO                   0.000000
TP_COR_RACA               0.000000
TP_NACIONALIDADE          0.000000
TP_ST_CONCLUSAO           0.000000
TP_ANO_CONCLUIU           0.000000
TP_ESCOLA                 0.000000
TP_ENSINO                 0.688128
IN_TREINEIRO              0.000000
TP_DEPENDENCIA_ADM_ESC    0.688128
IN_BAIXA_VISAO            0.000000
IN_CEGUEIRA               0.000000
IN_SURDEZ                 0.000000
IN_DISLEXIA               0.000000
IN_DISCALCULIA            0.000000
IN_SABATISTA              0.000000
IN_GESTANTE               0.000000
IN_IDOSO                  0.000000
TP_PRESENCA_CN            0.000000
TP_PRESENCA_CH            0.000000
TP_PRESENCA_LC            0.000000
CO_PROVA_CN               0.000000
CO_PROVA_CH               0.000000
CO_PROVA_LC               0.000000
CO_PROVA_MT               0.000000
NU_NOTA_CN          

Neste primeiro momento será verificada a correlação entre a nota na prova de matemática e nas demais provas. Nota-se que a porcentagem de dados faltantes nessas colunas é aproximadamente o mesmo. Logo, assume-se que esses candidatos não apareceram para a realizar o ENEM e todos esses campos serão preenchidos com zero. 

In [6]:
df_train = df_train.fillna(0)

### Matriz de correlação

In [7]:
df_train.corr().style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'),axis=1)

  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax


Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,IN_TREINEIRO,TP_DEPENDENCIA_ADM_ESC,IN_BAIXA_VISAO,IN_CEGUEIRA,IN_SURDEZ,IN_DISLEXIA,IN_DISCALCULIA,IN_SABATISTA,IN_GESTANTE,IN_IDOSO,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,TP_LINGUA,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,NU_NOTA_MT
CO_UF_RESIDENCIA,1.0,-0.002,-0.23,0.0094,0.026,-0.033,0.041,0.03,-0.035,0.042,0.0014,,-0.011,0.0037,0.0011,-0.03,-0.0095,-0.014,-0.035,-0.035,-0.034,-0.012,-0.0046,-0.0056,-0.096,-0.033,-0.005,-0.0089,-0.0062,-0.0032,0.0042,-0.0042,0.0026
NU_IDADE,-0.002,1.0,0.0097,-0.018,-0.25,0.68,-0.3,-0.19,-0.3,-0.3,0.028,,0.0065,0.00049,-0.0044,0.005,0.0065,0.074,-0.25,-0.25,-0.24,-0.25,-0.24,-0.24,0.082,-0.12,-0.25,-0.25,-0.25,-0.25,-0.21,-0.25,-0.24
TP_COR_RACA,-0.23,0.0097,1.0,0.016,0.00074,0.0039,-0.042,0.0049,0.0097,-0.041,-0.015,,0.0079,-0.01,0.0069,0.017,-0.01,0.0098,-0.024,-0.024,-0.022,-0.059,-0.06,-0.057,0.12,0.00041,-0.056,-0.058,-0.062,-0.063,-0.072,-0.064,-0.07
TP_NACIONALIDADE,0.0094,-0.018,0.016,1.0,-5.4e-05,-0.026,0.024,0.032,-0.015,0.024,0.0094,,0.011,-0.0016,-0.0016,0.013,-0.0058,-0.0023,-0.02,-0.02,-0.024,-0.029,-0.031,-0.034,0.015,-0.012,-0.033,-0.034,-0.032,-0.034,-0.032,-0.034,-0.033
TP_ST_CONCLUSAO,0.026,-0.25,0.00074,-5.4e-05,1.0,-0.59,0.16,0.14,0.53,0.16,0.018,,0.013,-0.0076,0.0021,-0.023,-0.0085,-0.011,0.055,0.055,0.053,0.026,0.022,0.029,-0.025,0.058,0.011,0.0015,-0.0032,-0.00068,-0.017,-0.0011,0.024
TP_ANO_CONCLUIU,-0.033,0.68,0.0039,-0.026,-0.59,1.0,-0.42,-0.39,-0.26,-0.42,-0.00096,,-0.012,0.0049,-0.0057,0.012,0.0089,0.024,-0.22,-0.22,-0.22,-0.2,-0.19,-0.19,0.062,-0.13,-0.18,-0.18,-0.18,-0.18,-0.14,-0.18,-0.19
TP_ESCOLA,0.041,-0.3,-0.042,0.024,0.16,-0.42,1.0,0.81,-0.24,1.0,-0.0075,,0.0016,-0.0054,0.025,-0.008,-0.011,-0.0076,0.17,0.17,0.18,0.19,0.18,0.18,-0.07,0.093,0.18,0.19,0.19,0.19,0.17,0.19,0.19
TP_ENSINO,0.03,-0.19,0.0049,0.032,0.14,-0.39,0.81,1.0,-0.23,0.8,-0.00082,,0.019,-0.005,0.022,0.006,-0.0029,-0.007,0.1,0.1,0.1,0.089,0.088,0.089,-0.028,0.062,0.086,0.084,0.081,0.079,0.059,0.082,0.083
IN_TREINEIRO,-0.035,-0.3,0.0097,-0.015,0.53,-0.26,-0.24,-0.23,1.0,-0.24,-0.014,,-0.0074,-0.0033,-0.0033,0.0038,-0.012,-0.0047,0.095,0.095,0.092,0.081,0.078,0.084,-0.036,0.063,0.08,0.07,0.064,0.069,0.046,0.069,0.079
TP_DEPENDENCIA_ADM_ESC,0.042,-0.3,-0.041,0.024,0.16,-0.42,1.0,0.8,-0.24,1.0,-0.011,,-0.0018,-0.0054,0.025,-0.0078,-0.011,-0.0076,0.17,0.17,0.18,0.18,0.18,0.18,-0.069,0.093,0.18,0.19,0.19,0.18,0.16,0.19,0.19


Apenas as colunas que possuem correlação maior que 80% (positiva ou negativa) com o target serão selecionadas para treinamento do modelo.

In [8]:
pearsoncorr = df_train.corr(method='pearson')
col = pearsoncorr.columns[(pearsoncorr.NU_NOTA_MT>0.80) | (pearsoncorr.NU_NOTA_MT<-0.80)]
col = col.tolist()

df_train = df_train[col]
df_train.head()

Unnamed: 0,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_REDACAO,NU_NOTA_MT
0,1,1,1,436.3,495.4,581.2,120.0,120.0,120.0,80.0,520.0,399.4
1,1,1,1,474.5,544.1,599.0,140.0,120.0,120.0,120.0,580.0,459.8
2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Criação do dataframe de resposta e aplicando as modificações nos dados de teste:

In [9]:
df_test = df_test.fillna(0)

answer = pd.DataFrame()
answer['NU_INSCRICAO'] = df_test['NU_INSCRICAO']

df_test = df_test[df_train.columns[:-1]]
df_test.head()

Unnamed: 0,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_REDACAO
0,1,1,1,464.8,443.5,431.8,120.0,80.0,80.0,100.0,420.0
1,1,1,1,391.1,491.1,548.0,120.0,120.0,120.0,120.0,580.0
2,1,1,1,595.9,622.7,613.6,80.0,40.0,40.0,80.0,320.0
3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,1,592.9,492.6,571.4,100.0,80.0,60.0,80.0,320.0


## Treinamento do classificador

In [10]:
y = df_train['NU_NOTA_MT']
x_train = df_train.drop(['NU_NOTA_MT'],axis=1)

model = LinearRegression().fit(x_train, y)

### Resposta final: 93.14%

In [11]:
answer['NU_NOTA_MT'] = model.predict(df_test)

answer.head()

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,431.922226
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,460.081929
2,b38a03232f43b11c9d0788abaf060f7366053b6d,574.879214
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,-0.729066
4,715494628a50142ce8cb17191cfe6d0f3cae0934,541.847809


In [12]:
answer.to_csv('answer.csv',index=False)