In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Reload imported modules before each cell runs, so edits to local code are picked up.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render matplotlib figures inline, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. sns.set() resets the plotting context to
# its default ("notebook"), so calling it after sns.set_context("poster")
# silently discarded the poster scaling. Passing context and style here makes
# the context, the style and the figure size all take effect together.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Show up to 120 rows and 200 columns before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 200)

# Emit log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import transform

In [3]:
# Read a 5-row sample of each year's raw microdata file.
# df1 -> 2018, df2 -> 2017, df3 -> 2016.
df1, df2, df3 = [
    pd.read_csv(f'../data/raw/microdados_enade_{year}.csv', sep=';', nrows=5)
    for year in (2018, 2017, 2016)
]

In [4]:
# Shape of the 2018 sample; it was read with nrows=5, so only the column count is informative.
df1.shape

(5, 137)

In [5]:
# Shape of the 2017 sample; it was read with nrows=5, so only the column count is informative.
df2.shape

(5, 150)

In [6]:
# Shape of the 2016 sample; it was read with nrows=5, so only the column count is informative.
df3.shape

(5, 141)

Check columns that exist in df1 but not df2

In [7]:
# Index.difference is one-directional: column labels of df1 (2018) that are absent from df2 (2017).
df1.columns.difference(df2.columns)

Index([], dtype='object')

Check columns that exist in df2 but not df1

In [8]:
# Reverse direction: column labels of df2 (2017) that are absent from df1 (2018).
df2.columns.difference(df1.columns)

Index(['QE_I69', 'QE_I70', 'QE_I71', 'QE_I72', 'QE_I73', 'QE_I74', 'QE_I75',
       'QE_I76', 'QE_I77', 'QE_I78', 'QE_I79', 'QE_I80', 'QE_I81'],
      dtype='object')

These extra columns correspond to questionnaire items for licentiate courses, which were included in 2017 but not in 2018.
Apart from them, the two years have the same attributes.

Now, check if there are any differences between (2017, 2018) and 2016.

In [9]:
# Column labels of df1 (2018) that are absent from df3 (2016).
df1.columns.difference(df3.columns)

Index(['ANO_FIM_EM', 'CO_TURNO_GRADUACAO', 'TP_INSCRICAO', 'TP_INSCRICAO_ADM'], dtype='object')

In [10]:
# Column labels of df3 (2016) that are absent from df1 (2018).
df3.columns.difference(df1.columns)

Index(['AMOSTRA', 'ANO_FIM_2G', 'ID_STATUS', 'IN_GRAD', 'IN_MATUT',
       'IN_NOTURNO', 'IN_VESPER', 'TP_SEMESTRE'],
      dtype='object')

In [11]:
# Re-read the 2016 file without nrows, replacing the 5-row sample held in df3,
# so the column summaries below cover every row.
df3 = pd.read_csv('../data/raw/microdados_enade_2016.csv', sep=';')

In [12]:
# Summary statistics for ID_STATUS, one of the columns found in 2016 but not in 2018.
df3.ID_STATUS.describe()

count    216044.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: ID_STATUS, dtype: float64

In [13]:
# Summary statistics for IN_GRAD, one of the columns found in 2016 but not in 2018.
df3.IN_GRAD.describe()

count    216044.0
mean          0.0
std           0.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max           0.0
Name: IN_GRAD, dtype: float64

In [14]:
# Summary statistics for ANO_IN_GRAD over the full 2016 file.
df3.ANO_IN_GRAD.describe()

count    216044.000000
mean       2012.239164
std           1.567188
min        1980.000000
25%        2012.000000
50%        2012.000000
75%        2013.000000
max        2016.000000
Name: ANO_IN_GRAD, dtype: float64

In [15]:
# Re-read the 2018 file without nrows, replacing the 5-row sample held in df1,
# so the value checks below cover every row.
df1 = pd.read_csv('../data/raw/microdados_enade_2018.csv', sep=';')

In [16]:
# Distinct values of TP_INSCRICAO, a column present in 2018 but missing from 2016.
df1.TP_INSCRICAO.unique()

array([0, 1])

In [17]:
# Distinct values of TP_INSCRICAO_ADM, a column present in 2018 but missing from 2016.
df1.TP_INSCRICAO_ADM.unique()

array([0, 2, 1])

Based on the data dictionaries provided by INEP, the 2016 data (df3) needs the following changes to align it with the 2017 and 2018 datasets:

    * Rename ANO_FIM_2G to ANO_FIM_EM;
    * Remove AMOSTRA;
    * Create CO_TURNO_GRADUACAO in df3 based on IN_MATUT, IN_NOTURNO, IN_VESPER, and remove these three;
    * Remove TP_SEMESTRE;
    * Remove ID_STATUS;
    * Add TP_INSCRICAO and TP_INSCRICAO_ADM with NaN values;
    * Remove IN_GRAD;
    * Change decimal separator from '.' to ',';

Generate the aligned microdata:

In [18]:
# NOTE(review): presumably reads the raw 2016 file (first argument) and writes the
# aligned copy to the interim path (second argument) - confirm against enadepy.transform.
transform.align_microdata_2016('../data/raw/microdados_enade_2016.csv', '../data/interim/microdados_enade_2016.csv')