In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. A bare `sns.set(rc=...)` re-applies the
# default plotting context ("notebook"), which silently discarded the "poster"
# context requested on the line before it. Passing context, style and rc
# together makes all three settings take effect.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("mode.chained_assignment", "raise")

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import *
from enadepy import transform
from enadepy.helpers import *
from enadepy.loaders import read_interm

In [3]:
# Lookup table from institution code (values of the CO_IES column) to a short
# institution name. The trailing comment on each entry records the
# institution's administrative category.
index_co_ies = {
    56: 'UNESP', # public, state-run
    322: 'UNIP', # private, for-profit
    845: 'FAC-FEA', # public, municipal
    1418: 'UNITOLEDO', # private, for-profit
    2289: 'UCESP',  # private, non-profit
    4522: 'UNISALESIANO',  # private, non-profit
    15697: 'FATEC' # public, state-run
}

In [4]:
# Load the interim microdata extract through the project loader; every later
# frame in this notebook is derived from `df`.
df = read_interm('../data/interim/microdados_enade_aracatuba_2016a2018_min.csv')

In [5]:
df.shape

(2075, 45)

In [6]:
df.head()

Unnamed: 0,NU_ANO,TP_PRES,NT_GER,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,NU_IDADE,TP_SEXO,ANO_FIM_EM,ANO_IN_GRAD,CO_TURNO_GRADUACAO,TP_INSCRICAO_ADM,TP_INSCRICAO,QE_I01,QE_I02,QE_I03,QE_I04,QE_I05,QE_I06,QE_I07,QE_I08,QE_I09,QE_I10,QE_I11,QE_I12,QE_I13,QE_I14,QE_I15,QE_I16,QE_I17,QE_I18,QE_I19,QE_I20,QE_I21,QE_I22,QE_I23,QE_I24,QE_I25,QE_I26
0,2016,555,52.9,56,10001,10028,6,3161,1,3502804,35,3,26,M,2007,2011,3,,,A,A,A,B,B,E,A,A,B,A,A,B,F,A,A,35,E,A,B,K,A,B,C,D,A,A
1,2016,555,74.4,56,10001,10028,6,3161,1,3502804,35,3,23,M,2010,2012,3,,,A,A,A,D,D,B,C,B,B,A,A,A,A,A,A,35,B,A,B,C,A,C,C,A,E,F
2,2016,555,60.2,56,10001,10028,6,3161,1,3502804,35,3,26,F,2007,2010,3,,,A,A,A,E,E,D,D,D,B,A,A,A,A,A,A,35,B,A,B,E,A,B,D,E,H,F
3,2016,555,70.6,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2012,3,,,A,A,A,E,D,D,A,D,B,A,A,A,A,A,A,35,B,A,B,C,A,C,B,E,B,F
4,2016,555,30.9,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2011,4,,,A,A,A,B,B,A,A,B,B,A,A,A,C,A,A,35,A,A,B,C,A,B,C,E,E,F


In [7]:
df['TP_PRES'].value_counts()

555    1802
222     273
Name: TP_PRES, dtype: Int64

In [8]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NU_ANO,2075.0,2017.184,0.7679209,2016.0,2017.0,2017.0,2018.0,2018.0
TP_PRES,2075.0,511.1884,112.5873,222.0,555.0,555.0,555.0,555.0
NT_GER,1802.0,46.90316,13.50187,5.4,37.2,46.05,56.9,85.6
CO_IES,2075.0,2351.158,2347.572,56.0,845.0,1418.0,4522.0,15697.0
CO_CATEGAD,2075.0,6523.145,5269.747,2.0,5.0,10005.0,10007.0,17634.0
CO_ORGACAD,2075.0,10021.98,3.386354,10020.0,10020.0,10020.0,10022.0,10028.0
CO_GRUPO,2075.0,1510.725,2322.988,1.0,5.0,23.0,3501.0,6306.0
CO_CURSO,2075.0,746179.5,1231741.0,3161.0,20642.0,96729.0,1147234.0,5000425.0
CO_MODALIDADE,2075.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
CO_MUNIC_CURSO,2075.0,3502804.0,0.0,3502804.0,3502804.0,3502804.0,3502804.0,3502804.0


### Check for NaN values in some columns of interest

In [9]:
# Among rows with TP_PRES == 555, tally how many have a missing NT_GER value
# (False = value present, True = NaN).
df.query('TP_PRES == 555').NT_GER.isna().value_counts()

False    1802
Name: NT_GER, dtype: int64

In [10]:
# Start from the column list returned by list_cols_socioecon() and leave QE_I16
# out of the NaN checks that follow.
# NOTE(review): list.remove() mutates the returned list in place, so this relies
# on the helper handing back a fresh list on each call — confirm in enadepy.
cols = list_cols_socioecon()
cols.remove('QE_I16')

In [11]:
cols

['QE_I01',
 'QE_I02',
 'QE_I03',
 'QE_I04',
 'QE_I05',
 'QE_I06',
 'QE_I07',
 'QE_I08',
 'QE_I09',
 'QE_I10',
 'QE_I11',
 'QE_I12',
 'QE_I13',
 'QE_I14',
 'QE_I15',
 'QE_I17',
 'QE_I18',
 'QE_I19',
 'QE_I20',
 'QE_I21',
 'QE_I22',
 'QE_I23',
 'QE_I24',
 'QE_I25',
 'QE_I26']

In [12]:
# Per-column NaN count over the selected questionnaire columns, restricted to
# rows with TP_PRES == 555.
df.query('TP_PRES == 555')[cols].isna().sum()

QE_I01      1
QE_I02      1
QE_I03      1
QE_I04      1
QE_I05      1
QE_I06      1
QE_I07      1
QE_I08      1
QE_I09      1
QE_I10      1
QE_I11      1
QE_I12      1
QE_I13      1
QE_I14      1
QE_I15      1
QE_I17      1
QE_I18      1
QE_I19      1
QE_I20      1
QE_I21      1
QE_I22      1
QE_I23      1
QE_I24      1
QE_I25      1
QE_I26    601
dtype: int64

In [13]:
# Ignoring QE_I26, count how many TP_PRES == 555 rows contain at least one NaN
# (True) versus none (False).
df.query('TP_PRES == 555')[[x for x in cols if x != 'QE_I26']].isna().any(axis=1).value_counts()

False    1801
True        1
dtype: int64

#### Summary

Among the entries with TP_PRES == 555, 601 have a missing (NaN) value for QE_I26, and exactly one row has NaN in every other questionnaire attribute checked above.

So, we choose to remove QE_I26 entirely and the null row as well.

In [14]:
# Questionnaire columns retained for the analysis: everything in `cols` but QE_I26.
cols_keep = [name for name in cols if name != 'QE_I26']

# Keep only rows with TP_PRES == 555 and discard any row that has a NaN in one
# of the retained columns; copy so later edits never touch `df`.
df2 = (
    df.query('TP_PRES == 555')
    .dropna(axis=0, subset=cols_keep)
    .copy()
)

In [15]:
df2

Unnamed: 0,NU_ANO,TP_PRES,NT_GER,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,NU_IDADE,TP_SEXO,ANO_FIM_EM,ANO_IN_GRAD,CO_TURNO_GRADUACAO,TP_INSCRICAO_ADM,TP_INSCRICAO,QE_I01,QE_I02,QE_I03,QE_I04,QE_I05,QE_I06,QE_I07,QE_I08,QE_I09,QE_I10,QE_I11,QE_I12,QE_I13,QE_I14,QE_I15,QE_I16,QE_I17,QE_I18,QE_I19,QE_I20,QE_I21,QE_I22,QE_I23,QE_I24,QE_I25,QE_I26
0,2016,555,52.9,56,10001,10028,6,3161,1,3502804,35,3,26,M,2007,2011,3,,,A,A,A,B,B,E,A,A,B,A,A,B,F,A,A,35,E,A,B,K,A,B,C,D,A,A
1,2016,555,74.4,56,10001,10028,6,3161,1,3502804,35,3,23,M,2010,2012,3,,,A,A,A,D,D,B,C,B,B,A,A,A,A,A,A,35,B,A,B,C,A,C,C,A,E,F
2,2016,555,60.2,56,10001,10028,6,3161,1,3502804,35,3,26,F,2007,2010,3,,,A,A,A,E,E,D,D,D,B,A,A,A,A,A,A,35,B,A,B,E,A,B,D,E,H,F
3,2016,555,70.6,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2012,3,,,A,A,A,E,D,D,A,D,B,A,A,A,A,A,A,35,B,A,B,C,A,C,B,E,B,F
4,2016,555,30.9,56,10001,10028,6,3161,1,3502804,35,3,25,F,2008,2011,4,,,A,A,A,B,B,A,A,B,B,A,A,A,C,A,A,35,A,A,B,C,A,B,C,E,E,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2062,2018,555,48.5,322,10005,10028,2,19524,1,3502804,35,3,24,M,2016,2017,4,0,0,A,A,A,B,C,A,A,B,C,E,B,A,A,A,A,35,A,A,A,C,B,E,D,E,E,
2063,2018,555,27.5,322,10005,10028,2,19524,1,3502804,35,3,27,F,2009,2012,4,0,0,E,D,A,B,B,C,B,A,B,A,E,A,A,A,D,35,A,A,A,A,B,B,A,E,E,
2067,2018,555,48.0,322,10005,10028,2,19524,1,3502804,35,3,23,F,2016,2017,4,0,0,A,A,A,E,D,B,C,D,B,A,B,A,A,A,A,35,E,A,C,A,A,B,B,E,E,C
2072,2018,555,36.2,322,10005,10028,2,19524,1,3502804,35,3,25,F,2011,2013,4,0,0,A,A,A,D,F,A,A,A,C,A,H,A,F,A,A,35,B,A,B,A,A,C,B,E,H,


In [16]:
df2[list_cols_socioecon()].isna().sum()

QE_I01      0
QE_I02      0
QE_I03      0
QE_I04      0
QE_I05      0
QE_I06      0
QE_I07      0
QE_I08      0
QE_I09      0
QE_I10      0
QE_I11      0
QE_I12      0
QE_I13      0
QE_I14      0
QE_I15      0
QE_I16      0
QE_I17      0
QE_I18      0
QE_I19      0
QE_I20      0
QE_I21      0
QE_I22      0
QE_I23      0
QE_I24      0
QE_I25      0
QE_I26    600
dtype: int64

In [17]:
# Working frame without QE_I26 (600 NaNs remain in that column after the row
# filter above). The copy keeps later changes from affecting `df2`.
df_dropped = df2.drop(columns=['QE_I26']).copy()

In [18]:
# Reshape the retained questionnaire columns to long format: one row per
# (question, chosen alternative) pair, ready for a cross-tabulation.
dfmelt = df_dropped[cols_keep].melt(var_name="Questão", value_name="Escolha")

In [19]:
pd.crosstab(index=dfmelt['Questão'], columns=dfmelt['Escolha'])

Escolha,A,B,C,D,E,F,G,H,I,J,K
Questão,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
QE_I01,1478,258,27,6,32,0,0,0,0,0,0
QE_I02,1305,92,55,338,1,10,0,0,0,0,0
QE_I03,1788,8,5,0,0,0,0,0,0,0,0
QE_I04,53,345,311,723,302,67,0,0,0,0,0
QE_I05,29,287,341,698,341,105,0,0,0,0,0
QE_I06,114,1266,317,92,8,4,0,0,0,0,0
QE_I07,149,247,471,570,251,79,20,14,0,0,0
QE_I08,233,552,453,239,215,98,11,0,0,0,0
QE_I09,135,623,554,178,219,92,0,0,0,0,0
QE_I10,804,139,118,160,580,0,0,0,0,0,0


In [20]:
df_dropped['QE_I16'].value_counts()

35    1698
50      21
31      18
99      10
33       9
17       8
52       7
41       7
51       5
43       3
29       3
53       2
27       2
26       2
21       2
15       1
12       1
32       1
11       1
Name: QE_I16, dtype: Int64

In [21]:
# Remove QE_I16 from the working frame (it was already excluded from `cols`).
# Reassign instead of mutating in place, and tolerate the column being absent,
# so that re-running this cell is a no-op rather than a KeyError.
df_dropped = df_dropped.drop(columns=['QE_I16'], errors='ignore')

In [22]:
cols_keep

['QE_I01',
 'QE_I02',
 'QE_I03',
 'QE_I04',
 'QE_I05',
 'QE_I06',
 'QE_I07',
 'QE_I08',
 'QE_I09',
 'QE_I10',
 'QE_I11',
 'QE_I12',
 'QE_I13',
 'QE_I14',
 'QE_I15',
 'QE_I17',
 'QE_I18',
 'QE_I19',
 'QE_I20',
 'QE_I21',
 'QE_I22',
 'QE_I23',
 'QE_I24',
 'QE_I25']

In [23]:
# Number of distinct CO_CURSO values per CO_IES. Selecting the column before
# aggregating avoids computing nunique() for every other column only to
# discard the results; the returned Series is the same.
df_dropped.groupby('CO_IES')['CO_CURSO'].nunique()

CO_IES
56        2
322      15
845       3
1418     19
2289      2
4522     22
15697     1
Name: CO_CURSO, dtype: int64

In [24]:
df_dropped.groupby('CO_IES').size()

CO_IES
56       149
322      301
845       97
1418     653
2289      43
4522     550
15697      8
dtype: int64

In [25]:
len(df_dropped)

1801

In [26]:
# Per-institution summary (distinct CO_CURSO values and row count) rendered as
# LaTeX source. print() is intentional here: to_latex() returns a string, and
# printing shows it with real line breaks.
print(df_dropped.groupby('CO_IES').agg({'CO_CURSO': ['nunique'], 'TP_PRES': ['size']}).to_latex())

\begin{tabular}{lrr}
\toprule
{} & CO\_CURSO & TP\_PRES \\
{} &  nunique &    size \\
CO\_IES &          &         \\
\midrule
56     &        2 &     149 \\
322    &       15 &     301 \\
845    &        3 &      97 \\
1418   &       19 &     653 \\
2289   &        2 &      43 \\
4522   &       22 &     550 \\
15697  &        1 &       8 \\
\bottomrule
\end{tabular}



In [27]:
df_dropped.groupby('CO_IES').agg({'CO_CURSO': ['nunique'], 'TP_PRES': ['size']})

Unnamed: 0_level_0,CO_CURSO,TP_PRES
Unnamed: 0_level_1,nunique,size
CO_IES,Unnamed: 1_level_2,Unnamed: 2_level_2
56,2,149
322,15,301
845,3,97
1418,19,653
2289,2,43
4522,22,550
15697,1,8


In [28]:
df_dropped.query('CO_IES in [56, 322, 1418, 4522]').groupby(['CO_CATEGAD', 'CO_IES']).size()

CO_CATEGAD  CO_IES
4           322       173
            1418      242
5           4522      215
10001       56        149
10005       322        90
            1418      411
10007       4522      179
10008       322        38
            4522      156
dtype: int64

In [29]:
df_dropped.groupby(['CO_CATEGAD', 'CO_IES']).size()

CO_CATEGAD  CO_IES
2           15697       8
3           845        19
4           322       173
            1418      242
5           2289       11
            4522      215
10001       56        149
10005       322        90
            1418      411
10007       4522      179
10008       322        38
            2289       32
            4522      156
17634       845        78
dtype: int64

In [30]:
# Translate each institution code into its short name via index_co_ies
# (display only; the result is not stored).
df_dropped['CO_IES'].map(index_co_ies)

0       UNESP
1       UNESP
2       UNESP
3       UNESP
4       UNESP
        ...  
2062     UNIP
2063     UNIP
2067     UNIP
2072     UNIP
2074     UNIP
Name: CO_IES, Length: 1801, dtype: object

In [31]:
df_dropped.columns

Index(['NU_ANO', 'TP_PRES', 'NT_GER', 'CO_IES', 'CO_CATEGAD', 'CO_ORGACAD',
       'CO_GRUPO', 'CO_CURSO', 'CO_MODALIDADE', 'CO_MUNIC_CURSO',
       'CO_UF_CURSO', 'CO_REGIAO_CURSO', 'NU_IDADE', 'TP_SEXO', 'ANO_FIM_EM',
       'ANO_IN_GRAD', 'CO_TURNO_GRADUACAO', 'TP_INSCRICAO_ADM', 'TP_INSCRICAO',
       'QE_I01', 'QE_I02', 'QE_I03', 'QE_I04', 'QE_I05', 'QE_I06', 'QE_I07',
       'QE_I08', 'QE_I09', 'QE_I10', 'QE_I11', 'QE_I12', 'QE_I13', 'QE_I14',
       'QE_I15', 'QE_I17', 'QE_I18', 'QE_I19', 'QE_I20', 'QE_I21', 'QE_I22',
       'QE_I23', 'QE_I24', 'QE_I25'],
      dtype='object')

### Check if data needs some additional filtering

In [32]:
df_dropped.TP_PRES.unique()

<IntegerArray>
[555]
Length: 1, dtype: Int64

In [33]:
# Sanity check: rows with TP_PRES == 222 were filtered out earlier, so this
# selection is expected to come back empty.
df_dropped[df_dropped['TP_PRES'] == 222]

Unnamed: 0,NU_ANO,TP_PRES,NT_GER,CO_IES,CO_CATEGAD,CO_ORGACAD,CO_GRUPO,CO_CURSO,CO_MODALIDADE,CO_MUNIC_CURSO,CO_UF_CURSO,CO_REGIAO_CURSO,NU_IDADE,TP_SEXO,ANO_FIM_EM,ANO_IN_GRAD,CO_TURNO_GRADUACAO,TP_INSCRICAO_ADM,TP_INSCRICAO,QE_I01,QE_I02,QE_I03,QE_I04,QE_I05,QE_I06,QE_I07,QE_I08,QE_I09,QE_I10,QE_I11,QE_I12,QE_I13,QE_I14,QE_I15,QE_I17,QE_I18,QE_I19,QE_I20,QE_I21,QE_I22,QE_I23,QE_I24,QE_I25


In [34]:
len(df_dropped.query('NU_ANO==2018').index)

721

In [35]:
# Rebuild the long-format questionnaire table. This is the same statement as the
# earlier melt cell; the only change to df_dropped since then is the removal of
# QE_I16, which is not part of cols_keep.
dfmelt = df_dropped[cols_keep].melt(var_name="Questão", value_name="Escolha")

In [36]:
pd.crosstab(index=dfmelt['Questão'], columns=dfmelt['Escolha'])

Escolha,A,B,C,D,E,F,G,H,I,J,K
Questão,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
QE_I01,1478,258,27,6,32,0,0,0,0,0,0
QE_I02,1305,92,55,338,1,10,0,0,0,0,0
QE_I03,1788,8,5,0,0,0,0,0,0,0,0
QE_I04,53,345,311,723,302,67,0,0,0,0,0
QE_I05,29,287,341,698,341,105,0,0,0,0,0
QE_I06,114,1266,317,92,8,4,0,0,0,0,0
QE_I07,149,247,471,570,251,79,20,14,0,0,0
QE_I08,233,552,453,239,215,98,11,0,0,0,0
QE_I09,135,623,554,178,219,92,0,0,0,0,0
QE_I10,804,139,118,160,580,0,0,0,0,0,0


In [37]:
# QE_I03 is close to constant in this sample (1788 of the 1801 answers are "A"
# in the crosstab above), which is presumably why it is removed.
# Reassign instead of mutating in place, and tolerate the column being absent,
# so that re-running this cell is a no-op rather than a KeyError.
# NOTE: cols_keep still lists QE_I03, so df_dropped[cols_keep] no longer works
# after this point.
df_dropped = df_dropped.drop(columns=['QE_I03'], errors='ignore')

### Checking for NaN values for a subset of columns:

In [38]:
# Per-column NaN count for the columns that remain once the listed ones are set
# aside. Note that NT_GER is among the columns excluded from this check.
df_dropped.drop(columns=['TP_PRES', 'NT_GER', 'CO_CATEGAD', 'CO_ORGACAD', 'CO_CURSO', 'CO_MODALIDADE', 'CO_MUNIC_CURSO',
                        'CO_UF_CURSO', 'CO_REGIAO_CURSO', 'CO_TURNO_GRADUACAO', 'TP_INSCRICAO', 'TP_INSCRICAO_ADM']).isna().sum()

NU_ANO         0
CO_IES         0
CO_GRUPO       0
NU_IDADE       0
TP_SEXO        0
ANO_FIM_EM     0
ANO_IN_GRAD    0
QE_I01         0
QE_I02         0
QE_I04         0
QE_I05         0
QE_I06         0
QE_I07         0
QE_I08         0
QE_I09         0
QE_I10         0
QE_I11         0
QE_I12         0
QE_I13         0
QE_I14         0
QE_I15         0
QE_I17         0
QE_I18         0
QE_I19         0
QE_I20         0
QE_I21         0
QE_I22         0
QE_I23         0
QE_I24         0
QE_I25         0
dtype: int64

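No NaN values were found, so build a new dataframe with this column selection, additionally keeping `NT_GER` (already verified above to have no missing values for TP_PRES == 555).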

In [39]:
# Reduced frame used for the rest of the notebook. The dropped columns match the
# NaN check in the previous cell except that NT_GER is kept here.
df_dropped2 = df_dropped.drop(columns=['TP_PRES', 'CO_CATEGAD', 'CO_ORGACAD', 'CO_CURSO', 'CO_MODALIDADE', 'CO_MUNIC_CURSO',
                        'CO_UF_CURSO', 'CO_REGIAO_CURSO', 'CO_TURNO_GRADUACAO', 'TP_INSCRICAO', 'TP_INSCRICAO_ADM'])

In [40]:
# Candidate categorical columns: NU_ANO and TP_PRES plus the lists returned by
# the list_cols_* helpers (the student list is requested without NU_IDADE,
# TP_INSCRICAO and TP_INSCRICAO_ADM). Some candidates no longer exist in
# df_dropped2; they are filtered out in the next cell.
cat_cols_tmp = ['NU_ANO', 'TP_PRES'] + list_cols_institution() + list_cols_student(exclude=['NU_IDADE', 'TP_INSCRICAO', 'TP_INSCRICAO_ADM']) + list_cols_socioecon()

In [41]:
# Keep only the candidate columns that actually exist in df_dropped2, as an
# alphabetically sorted list.
cat_cols = sorted(set(cat_cols_tmp).intersection(df_dropped2.columns))
cat_cols

['ANO_FIM_EM',
 'ANO_IN_GRAD',
 'CO_GRUPO',
 'CO_IES',
 'NU_ANO',
 'QE_I01',
 'QE_I02',
 'QE_I04',
 'QE_I05',
 'QE_I06',
 'QE_I07',
 'QE_I08',
 'QE_I09',
 'QE_I10',
 'QE_I11',
 'QE_I12',
 'QE_I13',
 'QE_I14',
 'QE_I15',
 'QE_I17',
 'QE_I18',
 'QE_I19',
 'QE_I20',
 'QE_I21',
 'QE_I22',
 'QE_I23',
 'QE_I24',
 'QE_I25',
 'TP_SEXO']

In [42]:
# Run the project's categorize helper on the listed columns.
# NOTE(review): presumably returns a copy with those columns converted to
# categorical dtype — confirm against enadepy.transform.categorize.
df3 = transform.categorize(df_dropped2, cat_cols)

In [43]:
df3[cat_cols].describe()

Unnamed: 0,ANO_FIM_EM,ANO_IN_GRAD,CO_GRUPO,CO_IES,NU_ANO,QE_I01,QE_I02,QE_I04,QE_I05,QE_I06,QE_I07,QE_I08,QE_I09,QE_I10,QE_I11,QE_I12,QE_I13,QE_I14,QE_I15,QE_I17,QE_I18,QE_I19,QE_I20,QE_I21,QE_I22,QE_I23,QE_I24,QE_I25,TP_SEXO
count,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801
unique,38,13,34,7,3,5,6,6,6,6,8,7,6,5,11,6,6,5,6,6,5,7,11,2,5,5,5,8,2
top,2012,2013,2,1418,2018,A,A,D,D,B,D,B,B,A,B,A,A,A,A,A,A,B,C,A,B,B,E,E,F
freq,429,646,319,653,721,1478,1305,723,698,1266,570,552,623,804,751,1697,1491,1769,1569,1201,1663,1283,788,1119,546,693,1168,584,1009


In [44]:
df3.describe()

Unnamed: 0,NT_GER,NU_IDADE
count,1801.0,1801.0
mean,46.915103,25.872293
std,13.4961,6.499343
min,5.4,19.0
25%,37.2,22.0
50%,46.1,23.0
75%,56.9,27.0
max,85.6,64.0


In [45]:
df3.columns

Index(['NU_ANO', 'NT_GER', 'CO_IES', 'CO_GRUPO', 'NU_IDADE', 'TP_SEXO',
       'ANO_FIM_EM', 'ANO_IN_GRAD', 'QE_I01', 'QE_I02', 'QE_I04', 'QE_I05',
       'QE_I06', 'QE_I07', 'QE_I08', 'QE_I09', 'QE_I10', 'QE_I11', 'QE_I12',
       'QE_I13', 'QE_I14', 'QE_I15', 'QE_I17', 'QE_I18', 'QE_I19', 'QE_I20',
       'QE_I21', 'QE_I22', 'QE_I23', 'QE_I24', 'QE_I25'],
      dtype='object')

In [46]:
# Disabled exploratory plot. 'CO_CURSO' is no longer a column of df3 (see the
# column listing above), so the selection below would need updating before
# this line could be re-enabled.
#sns.pairplot(df3.loc[:,['CO_IES', 'CO_CURSO', 'NU_IDADE', 'TP_SEXO']])

In [47]:
# Recompute df3 (overwriting the previous result), this time with only_current=True.
# NOTE(review): presumably restricts each column's categories to the values
# present in the data — confirm against enadepy.transform.categorize.
df3 = transform.categorize(df_dropped2, cat_cols, only_current=True)

In [48]:
df3.count()

NU_ANO         1801
NT_GER         1801
CO_IES         1801
CO_GRUPO       1801
NU_IDADE       1801
TP_SEXO        1801
ANO_FIM_EM     1801
ANO_IN_GRAD    1801
QE_I01         1801
QE_I02         1801
QE_I04         1801
QE_I05         1801
QE_I06         1801
QE_I07         1801
QE_I08         1801
QE_I09         1801
QE_I10         1801
QE_I11         1801
QE_I12         1801
QE_I13         1801
QE_I14         1801
QE_I15         1801
QE_I17         1801
QE_I18         1801
QE_I19         1801
QE_I20         1801
QE_I21         1801
QE_I22         1801
QE_I23         1801
QE_I24         1801
QE_I25         1801
dtype: int64

In [49]:
df3[cat_cols].describe()

Unnamed: 0,ANO_FIM_EM,ANO_IN_GRAD,CO_GRUPO,CO_IES,NU_ANO,QE_I01,QE_I02,QE_I04,QE_I05,QE_I06,QE_I07,QE_I08,QE_I09,QE_I10,QE_I11,QE_I12,QE_I13,QE_I14,QE_I15,QE_I17,QE_I18,QE_I19,QE_I20,QE_I21,QE_I22,QE_I23,QE_I24,QE_I25,TP_SEXO
count,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801,1801
unique,38,13,34,7,3,5,6,6,6,6,8,7,6,5,11,6,6,5,6,6,5,7,11,2,5,5,5,8,2
top,2012,2013,2,1418,2018,A,A,D,D,B,D,B,B,A,B,A,A,A,A,A,A,B,C,A,B,B,E,E,F
freq,429,646,319,653,721,1478,1305,723,698,1266,570,552,623,804,751,1697,1491,1769,1569,1201,1663,1283,788,1119,546,693,1168,584,1009


### Split data according to institution type (public vs private)

To find out if a given institution is private or public, use `CO_IES` in a search at https://emec.mec.gov.br/

Public institutions:

In [50]:
# Rows belonging to the institutions annotated as public in index_co_ies
# (codes 56, 845 and 15697); the second line displays how many there are.
df_pub = df_dropped2.query('CO_IES in [56, 845, 15697]')
len(df_pub)

254

Private institutions:

In [51]:
# Rows belonging to the institutions annotated as private in index_co_ies
# (codes 322, 1418, 2289 and 4522); the second line displays how many there are.
df_priv = df_dropped2.query('CO_IES in [322, 1418, 2289, 4522]')
len(df_priv)

1547

Verify consistency after splitting.

In [52]:
# The public and private subsets must together account for every row of
# df_dropped2. Assert it so that a full run stops here, before the files are
# written, if an institution code is missing from both lists. The boolean is
# still displayed as the cell's output.
split_is_complete = len(df_pub.index) + len(df_priv.index) == len(df_dropped2.index)
assert split_is_complete, "public/private split does not cover every row of df_dropped2"
split_is_complete

True

#### Save intermediate data

In [53]:
# Write the public-institution subset to the interim data folder;
# index=False leaves the row index out of the file.
df_pub.to_csv('../data/interim/microdados_enade_ata_2016a2018_pub.csv', index=False)

In [54]:
# Write the private-institution subset to the interim data folder;
# index=False leaves the row index out of the file.
df_priv.to_csv('../data/interim/microdados_enade_ata_2016a2018_priv.csv', index=False)