In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Order matters: sns.set() re-applies seaborn's default "notebook" context every
# time it is called, so it must run first. Calling set_context("poster") before
# it (as was done previously) had no lasting effect on font/line scaling.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_context("poster")
sns.set_style("whitegrid")

import pandas as pd
# Widen pandas' display limits so the wide one-hot frames and the long itemset
# tuples are rendered without truncation.
for option_name, option_value in (
    ("display.max_rows", 120),
    ("display.max_columns", 120),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

# Emit log records of level INFO and above on stdout (logging's default stream is stderr).
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort

In [3]:
# Load the two preprocessed, one-hot encoded extracts (paths are relative to the
# notebook's working directory):
#   dfw <- "..._onehot_white.csv"
#   dfb <- "..._onehot_nowhite.csv"
# NOTE(review): a later section heading describes dfb as "black students", while the
# file name says "nowhite" — confirm which population this file actually contains.
dfw = pd.read_csv('../data/preprocessed/enade_2016a2018_pub_onehot_white.csv')
dfb = pd.read_csv('../data/preprocessed/enade_2016a2018_pub_onehot_nowhite.csv')

In [4]:
dfw.shape

(193, 64)

In [5]:
dfb.shape

(49, 60)

In [6]:
dfw.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_E,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I08_G,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_G,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [7]:
dfb.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_E,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_D,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


## Analysis considering white students (dfw dataframe)

### Generate frequent itemsets

In [8]:
support=0.2

In [9]:
# Mine the frequent itemsets of dfw using `support` as the minimum support.
# NOTE(review): `max` and `sort_by` are options of the enadepy helper; presumably
# max=False keeps all frequent itemsets (not only maximal ones) and
# sort_by='length' orders the result by itemset size — confirm in enadepy.frequent.
iset = freq_itemsets_sort(dfw, max=False, sort_by='length', min_support=support, use_colnames=True)

In [10]:
iset.head()

Unnamed: 0,support,itemsets,length
41,0.222798,"(NT_GER_Q1, TP_SEXO_F, QE_I09_B, QE_I17_B)",4
40,0.233161,"(TP_SEXO_F, QE_I17_B, NT_GER_Q1)",3
47,0.243523,"(QE_I06_B, TP_SEXO_F, QE_I09_B)",3
54,0.253886,"(QE_I25_E, QE_I09_B, TP_SEXO_F)",3
53,0.207254,"(QE_I25_E, QE_I17_B, QE_I09_B)",3


In [11]:
iset.tail()

Unnamed: 0,support,itemsets,length
20,0.202073,(QE_I25_H),1
21,0.326425,(QE_I23_B),1
22,0.404145,(QE_I17_A),1
23,0.222798,(QE_I08_C),1
0,0.704663,(QE_I09_B),1


### Discover association rules

In [12]:
use_metric='lift'
threshold=1.2

In [13]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose `use_metric` score is at least `threshold`.
rules = association_rules(iset, metric=use_metric, min_threshold=threshold)

#### Summary of the metrics of interest

In [14]:
# Descriptive statistics of the four rule-quality metrics, transposed so that
# each metric becomes a row and each statistic a column.
summary = rules[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,50.0,0.256995,0.058377,0.207254,0.222798,0.227979,0.290155,0.455959
confidence,50.0,0.614615,0.185302,0.294118,0.455705,0.603874,0.73907,0.965517
lift,50.0,1.372572,0.126583,1.202363,1.275413,1.34724,1.496934,1.634963
conviction,50.0,1.91153,1.45875,1.097033,1.241484,1.387893,1.923524,8.564767


Get the LaTeX code for the summary table (without the `count` column, using two decimal places and a decimal comma).

In [15]:
print(summary.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,26 & 0,06 & 0,21 & 0,22 & 0,23 & 0,29 & 0,46 \\
confidence &  0,61 & 0,19 & 0,29 & 0,46 & 0,60 & 0,74 & 0,97 \\
lift       &  1,37 & 0,13 & 1,20 & 1,28 & 1,35 & 1,50 & 1,63 \\
conviction &  1,91 & 1,46 & 1,10 & 1,24 & 1,39 & 1,92 & 8,56 \\
\bottomrule
\end{tabular}



Extend the rules dataframe with the number of items in each rule's antecedent and consequent.

In [16]:
# Number of items on each side of every rule (antecedents/consequents hold
# collections of item names, so the builtin len gives their size directly).
rules['length ant.'] = rules['antecedents'].map(len)
rules['length cons.'] = rules['consequents'].map(len)

#### Interesting rules according to max. support

In [17]:
t1 = rules.sort_values(by=[ 'support'], ascending=False).head(10)

#### Interesting rules according to max. confidence

In [18]:
t2 = rules.sort_values(by=[ 'confidence', 'support'], ascending=False).head(10)

#### Interesting rules according to max. lift

In [19]:
t3 = rules.sort_values(by=[ 'lift', 'support'], ascending=False).head(10)

#### Interesting rules according to max. conviction

In [20]:
t4 = rules.sort_values(by=['conviction', 'support'], ascending=False).head(10)

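Concatenate all top-10 rule tables. The same rule can appear in several of them, so duplicates are dropped when the result is counted and displayed below.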

In [21]:
rules_all = pd.concat([t1, t2, t3, t4])

In [22]:
len(rules_all)

40

In [23]:
len(rules_all.drop_duplicates())

25

In [24]:
rules_all.drop_duplicates()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
49,(QE_I17_B),(QE_I09_B),0.507772,0.704663,0.455959,0.897959,1.27431,0.09815,2.894301,1,1
48,(QE_I09_B),(QE_I17_B),0.704663,0.507772,0.455959,0.647059,1.27431,0.09815,1.394646,1,1
24,"(TP_SEXO_F, QE_I09_B)",(QE_I17_B),0.544041,0.507772,0.352332,0.647619,1.275413,0.076083,1.396863,2,1
27,(QE_I17_B),"(TP_SEXO_F, QE_I09_B)",0.507772,0.544041,0.352332,0.693878,1.275413,0.076083,1.489465,1,2
26,(QE_I09_B),"(TP_SEXO_F, QE_I17_B)",0.704663,0.398964,0.352332,0.5,1.253247,0.071197,1.202073,1,2
25,"(TP_SEXO_F, QE_I17_B)",(QE_I09_B),0.398964,0.704663,0.352332,0.883117,1.253247,0.071197,2.52677,2,1
37,(NT_GER_Q1),(QE_I17_B),0.492228,0.507772,0.300518,0.610526,1.202363,0.050579,1.263829,1,1
36,(QE_I17_B),(NT_GER_Q1),0.507772,0.492228,0.300518,0.591837,1.202363,0.050579,1.244041,1,1
28,"(QE_I09_B, NT_GER_Q1)",(QE_I17_B),0.373057,0.507772,0.290155,0.777778,1.531746,0.100728,2.215026,2,1
29,"(QE_I17_B, NT_GER_Q1)",(QE_I09_B),0.300518,0.704663,0.290155,0.965517,1.370183,0.078391,8.564767,2,1


## Analysis considering black students (dfb dataframe)

### Generate frequent itemsets

In [25]:
support2=0.2

In [26]:
# Mine the frequent itemsets of dfb using `support2` as the minimum support.
# NOTE(review): `max` and `sort_by` are options of the enadepy helper; presumably
# max=False keeps all frequent itemsets (not only maximal ones) and
# sort_by='length' orders the result by itemset size — confirm in enadepy.frequent.
iset2 = freq_itemsets_sort(dfb, max=False, sort_by='length', min_support=support2, use_colnames=True)

In [27]:
iset2.head()

Unnamed: 0,support,itemsets,length
55,0.204082,"(QE_I08_B, TP_SEXO_F, QE_I23_B)",3
77,0.306122,"(QE_I17_A, TP_SEXO_F, NT_GER_Q0)",3
64,0.204082,"(QE_I17_A, QE_I06_C, TP_SEXO_F)",3
28,0.244898,"(QE_I17_A, TP_SEXO_F, QE_I23_B)",3
70,0.244898,"(QE_I17_A, QE_I09_C, TP_SEXO_F)",3


In [28]:
iset2.tail()

Unnamed: 0,support,itemsets,length
21,0.510204,(NT_GER_Q0),1
23,0.244898,(QE_I04_B),1
24,0.204082,(QE_I22_D),1
25,0.265306,(QE_I07_D),1
0,0.734694,(TP_SEXO_F),1


### Discover association rules

In [29]:
use_metric2='lift'
threshold2=1.2

In [30]:
# Derive association rules from the dfb frequent itemsets, keeping only the rules
# whose `use_metric2` score is at least `threshold2`.
rules2 = association_rules(iset2, metric=use_metric2, min_threshold=threshold2)

#### Summary of the metrics of interest

In [31]:
# Descriptive statistics of the four rule-quality metrics, transposed so that
# each metric becomes a row and each statistic a column.
summary2 = rules2[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary2

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,94.0,0.2297,0.031237,0.204082,0.204082,0.22449,0.244898,0.306122
confidence,94.0,0.586344,0.163819,0.277778,0.454545,0.578947,0.714286,0.928571
lift,94.0,1.40231,0.160118,1.20098,1.263889,1.381579,1.505769,1.891228
conviction,94.0,1.600116,0.62901,1.073783,1.22449,1.369615,1.714286,4.714286


Get the LaTeX code for the summary table (without the `count` column, using two decimal places and a decimal comma).

In [32]:
print(summary2.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,23 & 0,03 & 0,20 & 0,20 & 0,22 & 0,24 & 0,31 \\
confidence &  0,59 & 0,16 & 0,28 & 0,45 & 0,58 & 0,71 & 0,93 \\
lift       &  1,40 & 0,16 & 1,20 & 1,26 & 1,38 & 1,51 & 1,89 \\
conviction &  1,60 & 0,63 & 1,07 & 1,22 & 1,37 & 1,71 & 4,71 \\
\bottomrule
\end{tabular}



Extend the rules dataframe with the number of items in each rule's antecedent and consequent.

In [33]:
# Number of items on each side of every rule (antecedents/consequents hold
# collections of item names, so the builtin len gives their size directly).
rules2['length ant.'] = rules2['antecedents'].map(len)
rules2['length cons.'] = rules2['consequents'].map(len)

#### Interesting rules according to max. support

In [34]:
y1 = rules2.sort_values(by=[ 'support'], ascending=False).head(10)

#### Interesting rules according to max. confidence

In [35]:
y2 = rules2.sort_values(by=[ 'confidence', 'support'], ascending=False).head(10)

#### Interesting rules according to max. lift

In [36]:
y3 = rules2.sort_values(by=[ 'lift', 'support'], ascending=False).head(10)

#### Interesting rules according to max. conviction

In [37]:
y4 = rules2.sort_values(by=['conviction', 'support'], ascending=False).head(10)

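Concatenate all top-10 rule tables. The same rule can appear in several of them, so duplicates are dropped when the result is counted and displayed below.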

In [38]:
rules_all2 = pd.concat([y1, y2, y3, y4])

In [39]:
len(rules_all2)

40

In [40]:
len(rules_all2.drop_duplicates())

27

In [41]:
rules_all2.drop_duplicates()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
66,(QE_I17_A),(QE_I09_C),0.571429,0.387755,0.306122,0.535714,1.381579,0.084548,1.318681,1,1
4,"(QE_I17_A, TP_SEXO_F)",(NT_GER_Q0),0.489796,0.510204,0.306122,0.625,1.225,0.056227,1.306122,2,1
5,"(QE_I17_A, NT_GER_Q0)",(TP_SEXO_F),0.346939,0.734694,0.306122,0.882353,1.20098,0.051229,2.255102,2,1
6,"(TP_SEXO_F, NT_GER_Q0)",(QE_I17_A),0.387755,0.571429,0.306122,0.789474,1.381579,0.084548,2.035714,2,1
7,(QE_I17_A),"(TP_SEXO_F, NT_GER_Q0)",0.571429,0.387755,0.306122,0.535714,1.381579,0.084548,1.318681,1,2
8,(TP_SEXO_F),"(QE_I17_A, NT_GER_Q0)",0.734694,0.346939,0.306122,0.416667,1.20098,0.051229,1.119534,1,2
9,(NT_GER_Q0),"(QE_I17_A, TP_SEXO_F)",0.510204,0.489796,0.306122,0.6,1.225,0.056227,1.27551,1,2
67,(QE_I09_C),(QE_I17_A),0.387755,0.571429,0.306122,0.789474,1.381579,0.084548,2.035714,1,1
59,(QE_I23_B),(QE_I08_B),0.510204,0.367347,0.265306,0.52,1.415556,0.077884,1.318027,1,1
64,(QE_I06_C),(NT_GER_Q0),0.346939,0.510204,0.265306,0.764706,1.498824,0.088297,2.081633,1,1
