In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution so edits to local packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output, using the
# high-resolution 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Apply the whole seaborn theme in one call. `sns.set()` resets the plotting
# context to its default ("notebook"), so a separate `sns.set_context("poster")`
# issued before it is silently discarded. Passing context and style here makes
# the poster scaling take effect together with the whitegrid style and the
# 16x9 default figure size.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Allow up to 120 rows and 120 columns to be displayed before pandas truncates.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 120)

# Configure root logging: INFO level, records written to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
)

In [2]:
from enadepy.helpers import *
from enadepy.loaders import read_interm

In [3]:
# Load the interim microdata CSV through the project's `read_interm` loader.
interim_csv = '../data/interim/microdados_enade_aracatuba_2016a2018.csv'
df = read_interm(interim_csv)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,NU_ANO,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,NU_IDADE,TP_SEXO,ANO_FIM_EM,ANO_IN_GRAD,NU_ITEM_OFG,NU_ITEM_OFG_Z,NU_ITEM_OFG_X,NU_ITEM_OFG_N,NU_ITEM_OCE,NU_ITEM_OCE_Z,NU_ITEM_OCE_X,NU_ITEM_OCE_N,DS_VT_GAB_OFG_ORIG,DS_VT_GAB_OFG_FIN,DS_VT_GAB_OCE_ORIG,DS_VT_GAB_OCE_FIN,DS_VT_ESC_OFG,DS_VT_ACE_OFG,DS_VT_ESC_OCE,DS_VT_ACE_OCE,TP_PRES,TP_PR_GER,TP_PR_OB_FG,TP_PR_DI_FG,TP_PR_OB_CE,TP_PR_DI_CE,TP_SFG_D1,TP_SFG_D2,TP_SCE_D1,TP_SCE_D2,TP_SCE_D3,NT_GER,NT_FG,NT_OBJ_FG,NT_DIS_FG,NT_FG_D1,NT_FG_D1_PT,NT_FG_D1_CT,NT_FG_D2,NT_FG_D2_PT,NT_FG_D2_CT,NT_CE,NT_OBJ_CE,NT_DIS_CE,NT_CE_D1,NT_CE_D2,NT_CE_D3,CO_RS_I1,CO_RS_I2,CO_RS_I3,...,QE_I25,QE_I26,QE_I27,QE_I28,QE_I29,QE_I30,QE_I31,QE_I32,QE_I33,QE_I34,QE_I35,QE_I36,QE_I37,QE_I38,QE_I39,QE_I40,QE_I41,QE_I42,QE_I43,QE_I44,QE_I45,QE_I46,QE_I47,QE_I48,QE_I49,QE_I50,QE_I51,QE_I52,QE_I53,QE_I54,QE_I55,QE_I56,QE_I57,QE_I58,QE_I59,QE_I60,QE_I61,QE_I62,QE_I63,QE_I64,QE_I65,QE_I66,QE_I67,QE_I68,TP_INSCRICAO,TP_INSCRICAO_ADM,CO_TURNO_GRADUACAO,QE_I69,QE_I70,QE_I71,QE_I72,QE_I73,QE_I74,QE_I75,QE_I76,QE_I77,QE_I78,QE_I79,QE_I80,QE_I81
0,2016,56,10001,10028,6,3161,1,3502804,35,3,26,M,2007,2011,8,0,0,0,27,0,4,0,CAEBCDAC,CAEBCDAC,BDCDAEDBCCCECEEBAADAEADDABC,BDCXXEDBXCCECEEBAAXAEADDABC,CADCCDAC,11001111,BDCBDCDEEBCECEEBBCEDEBCCCCC,111990109011111100901000001,555,555,555,555,555,555,555,555,555,555,555,52.9,58.2,75.0,33.0,55.0,75.0,50.0,11.0,55.0,0.0,51.1,52.2,45.0,40.0,35.0,60.0,C,B,C,...,A,A,5,6,5,6,6,6,6,6,6,4,3,3,2,4,1,6,6,1,6,2,5,3,5,6,6,3,1,4,5,5,6,6,5,5,5,5,5,6,6,6,5,4,,,3,,,,,,,,,,,,,
1,2016,56,10001,10028,6,3161,1,3502804,35,3,23,M,2010,2012,8,0,0,0,27,0,4,0,CAEBCDAC,CAEBCDAC,BDCDAEDBCCCECEEBAADAEADDABC,BDCXXEDBXCCECEEBAAXAEADDABC,CBEABDAC,10100111,BDCBDEDDEBCECEEBACCAAADDABC,111991109011111110910111111,555,555,555,555,555,555,555,555,555,555,555,74.4,68.1,62.5,76.5,82.0,90.0,80.0,71.0,75.0,70.0,76.5,82.6,41.7,50.0,15.0,60.0,C,C,B,...,E,F,6,6,6,6,6,6,6,6,6,6,5,6,6,5,6,6,6,6,6,5,6,5,6,6,6,6,4,6,6,5,6,6,6,6,6,6,6,6,6,6,6,6,,,3,,,,,,,,,,,,,
2,2016,56,10001,10028,6,3161,1,3502804,35,3,26,F,2007,2010,8,0,0,0,27,0,4,0,CAEBCDAC,CAEBCDAC,BDCDAEDBCCCECEEBAADAEADDABC,BDCXXEDBXCCECEEBAAXAEADDABC,DADABDBE,1000100,BDCAACDCECCECEEAAAEAEBDDACE,111990109111111011911011100,555,555,555,555,555,555,555,555,555,555,555,60.2,30.4,25.0,38.5,51.0,55.0,50.0,26.0,50.0,20.0,70.1,73.9,48.3,50.0,35.0,60.0,E,E,E,...,H,F,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,,,3,,,,,,,,,,,,,
3,2016,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2012,8,0,0,0,27,0,4,0,CAEBCDAC,CAEBCDAC,BDCDAEDBCCCECEEBAADAEADDABC,BDCXXEDBXCCECEEBAAXAEADDABC,DBBBADAC,10111,BDCBDDDBEBCECEEBCAEAEADAABE,111990119011111101911110110,555,555,555,555,555,555,555,555,555,555,555,70.6,56.6,50.0,66.5,79.0,75.0,80.0,54.0,70.0,50.0,75.3,78.3,58.3,70.0,25.0,80.0,C,C,B,...,B,F,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,,,3,,,,,,,,,,,,,
4,2016,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2011,8,0,0,0,27,0,4,0,CAEBCDAC,CAEBCDAC,BDCDAEDBCCCECEEBAADAEADDABC,BDCXXEDBXCCECEEBAAXAEADDABC,..EBCDAE,111110,CDCBACDAE.................C,011990109000000000900000001,555,555,555,555,555,555,555,555,555,555,555,30.9,68.7,62.5,78.0,94.0,70.0,100.0,62.0,70.0,60.0,18.3,17.4,23.3,70.0,0.0,0.0,.,.,.,...,E,F,6,6,6,6,6,6,6,6,6,6,6,5,6,5,5,6,6,6,6,6,6,3,5,5,6,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,5,4,,,4,,,,,,,,,,,,,


In [5]:
# Columns to keep in the reduced dataset: the exam columns, then 'TP_PRES' and
# 'NT_GER', then the institution, student and socio-economic column groups.
# Assumes each list_cols_* helper returns a plain list (the original `+`
# concatenation relied on the same thing).
# NOTE(review): the name `filter` shadows the Python builtin; it is kept here
# because the following cells reference it.
filter = [
    *list_cols_exam(),
    'TP_PRES',
    'NT_GER',
    *list_cols_institution(),
    *list_cols_student(),
    *list_cols_socioecon(),
]

In [6]:
# Display the assembled list of selected column names for inspection.
filter

['NU_ANO',
 'TP_PRES',
 'NT_GER',
 'CO_IES',
 'CO_CATEGAD',
 'CO_ORGACAD',
 'CO_GRUPO',
 'CO_CURSO',
 'CO_MODALIDADE',
 'CO_MUNIC_CURSO',
 'CO_UF_CURSO',
 'CO_REGIAO_CURSO',
 'NU_IDADE',
 'TP_SEXO',
 'ANO_FIM_EM',
 'ANO_IN_GRAD',
 'CO_TURNO_GRADUACAO',
 'TP_INSCRICAO_ADM',
 'TP_INSCRICAO',
 'QE_I01',
 'QE_I02',
 'QE_I03',
 'QE_I04',
 'QE_I05',
 'QE_I06',
 'QE_I07',
 'QE_I08',
 'QE_I09',
 'QE_I10',
 'QE_I11',
 'QE_I12',
 'QE_I13',
 'QE_I14',
 'QE_I15',
 'QE_I16',
 'QE_I17',
 'QE_I18',
 'QE_I19',
 'QE_I20',
 'QE_I21',
 'QE_I22',
 'QE_I23',
 'QE_I24',
 'QE_I25',
 'QE_I26']

In [7]:
# Sanity check: selected column names that are absent from the loaded frame.
# An empty Index means every requested column exists.
missing_cols = pd.Index(filter).difference(df.columns)
missing_cols

Index([], dtype='object')

In [8]:
# Write only the selected columns to a reduced CSV, omitting the row index.
min_csv = '../data/interim/microdados_enade_aracatuba_2016a2018_min.csv'
df.loc[:, filter].to_csv(min_csv, index=False)