In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. `sns.set()` re-applies its own default
# context ("notebook") every time it runs, so calling it *after*
# `sns.set_context("poster")` silently discarded the poster scaling.
# Passing context, style and rc together keeps all three settings in effect.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Raise pandas' display limits (rows, columns, cell width) in one call;
# `set_option` accepts alternating option-name / value pairs.
pd.set_option(
    "display.max_rows", 120,
    "display.max_columns", 120,
    'display.max_colwidth', 1000,
)

# Emit log records of level INFO and above on stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort

In [3]:
# Load the two pre-processed one-hot CSV extracts: the `white` file into dfw
# and the `nowhite` file into dfb.
dfw, dfb = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preprocessed/enade_2016a2018_priv_onehot_white.csv',
        '../data/preprocessed/enade_2016a2018_priv_onehot_nowhite.csv',
    )
)

In [4]:
# (rows, columns) of the frame loaded from the `white` CSV.
dfw.shape

(1112, 64)

In [5]:
# (rows, columns) of the frame loaded from the `nowhite` CSV.
dfb.shape

(382, 63)

In [6]:
# Preview the first rows of dfw to check the one-hot column layout.
dfw.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I07_H,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I08_G,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_G,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [7]:
# Preview the first rows of dfb to check the one-hot column layout.
dfb.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I07_H,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_F,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## Analysis considering white students (dfw dataframe)

### Generate frequent itemsets

In [8]:
# Minimum support handed to the frequent-itemset search on dfw.
support = 0.15

In [9]:
# Frequent itemsets of dfw at the chosen minimum support, labelled with column
# names and sorted via the helper's `sort_by='length'` option.
iset = freq_itemsets_sort(
    dfw,
    max=False,
    sort_by='length',
    min_support=support,
    use_colnames=True,
)

In [10]:
# First rows of the sorted itemset table for dfw.
iset.head()

Unnamed: 0,support,itemsets,length
39,0.170863,"(QE_I17_A, TP_SEXO_F, NT_GER_Q0)",3
51,0.194245,"(QE_I06_B, QE_I05_D, QE_I04_D)",3
111,0.177158,"(QE_I06_B, NT_GER_Q0, QE_I04_D)",3
110,0.160072,"(QE_I06_B, NT_GER_Q1, QE_I04_D)",3
109,0.185252,"(QE_I06_B, TP_SEXO_F, QE_I04_D)",3


In [11]:
# Last rows of the sorted itemset table for dfw.
iset.tail()

Unnamed: 0,support,itemsets,length
28,0.173561,(QE_I25_C),1
29,0.165468,(QE_I04_E),1
30,0.183453,(QE_I05_C),1
1,0.548561,(TP_SEXO_F),1
0,0.651978,(QE_I17_A),1


### Discover association rules

In [12]:
# Rule filter for dfw: the metric to evaluate and the minimum value it must reach.
use_metric, threshold = 'lift', 1.2

In [13]:
# Derive association rules from the dfw itemsets, filtered by the metric and
# threshold configured in the previous cell.
rules = association_rules(
    iset,
    metric=use_metric,
    min_threshold=threshold,
)

#### Summary of the metrics of interest

In [14]:
# Descriptive statistics of the four metrics of interest, one metric per row.
summary = rules[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,16.0,0.182442,0.026288,0.160072,0.160971,0.17491,0.194245,0.239209
confidence,16.0,0.509294,0.088077,0.384181,0.420932,0.520868,0.597143,0.606742
lift,16.0,1.396093,0.096668,1.23658,1.359346,1.437639,1.447352,1.514548
conviction,16.0,1.312497,0.129186,1.124431,1.209917,1.292752,1.425423,1.519769


Generate the LaTeX code for the summary table.

In [15]:
# Print the summary (without the `count` column) as a LaTeX table, using two
# decimals and a comma as the decimal separator.
print(
    summary
    .drop('count', axis='columns')
    .to_latex(float_format="%.2f", decimal=",")
)

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,18 & 0,03 & 0,16 & 0,16 & 0,17 & 0,19 & 0,24 \\
confidence &  0,51 & 0,09 & 0,38 & 0,42 & 0,52 & 0,60 & 0,61 \\
lift       &  1,40 & 0,10 & 1,24 & 1,36 & 1,44 & 1,45 & 1,51 \\
conviction &  1,31 & 0,13 & 1,12 & 1,21 & 1,29 & 1,43 & 1,52 \\
\bottomrule
\end{tabular}



Extend the dataframe with the number of items in each rule's antecedent and consequent.

In [16]:
# Record how many items sit on each side of every rule.
rules['length ant.'] = rules['antecedents'].apply(len)
rules['length cons.'] = rules['consequents'].apply(len)

#### Interesting rules according to max. support

In [17]:
# Ten rules with the highest support.
t1 = rules.sort_values('support', ascending=False).iloc[:10]

#### Interesting rules according to max. confidence

In [18]:
# Ten rules with the highest confidence; ties are ordered by support.
t2 = rules.sort_values(['confidence', 'support'], ascending=False).iloc[:10]

#### Interesting rules according to max. lift

In [19]:
# Ten rules with the highest lift; ties are ordered by support.
t3 = rules.sort_values(['lift', 'support'], ascending=False).iloc[:10]

#### Interesting rules according to max. conviction

In [20]:
# Ten rules with the highest conviction; ties are ordered by support.
t4 = rules.sort_values(['conviction', 'support'], ascending=False).iloc[:10]

Concatenate the top-10 rules from each ranking and remove duplicates.

In [21]:
# Stack the four top-10 selections; a rule picked by several rankings
# appears once per ranking until duplicates are dropped.
rules_all = pd.concat((t1, t2, t3, t4))

In [22]:
# Row count of the stacked selections, duplicates included.
len(rules_all)

40

In [23]:
# Number of distinct rules once repeated rows are dropped (result is not stored).
len(rules_all.drop_duplicates())

15

In [24]:
# Display the de-duplicated rules; `rules_all` itself still holds the repeated rows.
rules_all.drop_duplicates()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
12,(QE_I05_D),(QE_I04_D),0.399281,0.418165,0.239209,0.599099,1.432684,0.072243,1.451318,1,1
13,(QE_I04_D),(QE_I05_D),0.418165,0.399281,0.239209,0.572043,1.432684,0.072243,1.403691,1,1
0,"(QE_I06_B, QE_I05_D)",(QE_I04_D),0.320144,0.418165,0.194245,0.606742,1.45096,0.060371,1.479522,2,1
1,"(QE_I06_B, QE_I04_D)",(QE_I05_D),0.33723,0.399281,0.194245,0.576,1.442595,0.059595,1.416791,2,1
2,(QE_I05_D),"(QE_I06_B, QE_I04_D)",0.399281,0.33723,0.194245,0.486486,1.442595,0.059595,1.290657,1,2
3,(QE_I04_D),"(QE_I06_B, QE_I05_D)",0.418165,0.320144,0.194245,0.464516,1.45096,0.060371,1.269611,1,2
10,"(QE_I06_B, QE_I17_A)",(QE_I08_B),0.477518,0.307554,0.183453,0.384181,1.249149,0.036591,1.124431,2,1
11,(QE_I08_B),"(QE_I06_B, QE_I17_A)",0.307554,0.477518,0.183453,0.596491,1.249149,0.036591,1.294847,1,2
8,"(QE_I06_B, TP_SEXO_F)",(QE_I09_B),0.419065,0.321043,0.166367,0.396996,1.23658,0.031829,1.125957,2,1
9,(QE_I09_B),"(QE_I06_B, TP_SEXO_F)",0.321043,0.419065,0.166367,0.518207,1.23658,0.031829,1.205778,1,2


## Analysis considering black students (dfb dataframe)

### Generate frequent itemsets

In [25]:
# Minimum support handed to the frequent-itemset search on dfb.
support2 = 0.15

In [26]:
# Frequent itemsets of dfb at the chosen minimum support, labelled with column
# names and sorted via the helper's `sort_by='length'` option.
iset2 = freq_itemsets_sort(
    dfb,
    max=False,
    sort_by='length',
    min_support=support2,
    use_colnames=True,
)

In [27]:
# First rows of the sorted itemset table for dfb.
iset2.head()

Unnamed: 0,support,itemsets,length
35,0.151832,"(QE_I06_B, QE_I17_A, TP_SEXO_M, NT_GER_Q1)",4
42,0.170157,"(QE_I06_B, QE_I17_A, TP_SEXO_F, NT_GER_Q1)",4
174,0.170157,"(QE_I17_A, QE_I23_B, NT_GER_Q1)",3
82,0.170157,"(QE_I06_B, NT_GER_Q0, TP_SEXO_M)",3
32,0.32199,"(QE_I06_B, QE_I17_A, NT_GER_Q1)",3


In [28]:
# Last rows of the sorted itemset table for dfb.
iset2.tail()

Unnamed: 0,support,itemsets,length
26,0.293194,(QE_I22_B),1
27,0.269634,(QE_I04_B),1
28,0.413613,(QE_I23_B),1
1,0.5,(NT_GER_Q1),1
0,0.879581,(QE_I17_A),1


### Discover association rules

In [29]:
# Rule filter for dfb: the metric to evaluate and the minimum value it must reach.
use_metric2, threshold2 = 'lift', 1.2

In [30]:
# Derive association rules from the dfb itemsets, filtered by the metric and
# threshold configured in the previous cell.
rules2 = association_rules(
    iset2,
    metric=use_metric2,
    min_threshold=threshold2,
)

#### Summary of the metrics of interest

In [31]:
# Descriptive statistics of the four metrics of interest, one metric per row.
summary2 = rules2[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary2

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,22.0,0.182294,0.025707,0.15445,0.160995,0.17801,0.194372,0.235602
confidence,22.0,0.514247,0.157856,0.219331,0.434122,0.500235,0.567568,0.863636
lift,22.0,1.339347,0.1129,1.214266,1.228549,1.282444,1.457765,1.500116
conviction,22.0,1.345346,0.28204,1.049576,1.144844,1.269492,1.416558,2.169284


Generate the LaTeX code for the summary table.

In [32]:
# Print the summary (without the `count` column) as a LaTeX table, using two
# decimals and a comma as the decimal separator.
print(
    summary2
    .drop('count', axis='columns')
    .to_latex(float_format="%.2f", decimal=",")
)

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,18 & 0,03 & 0,15 & 0,16 & 0,18 & 0,19 & 0,24 \\
confidence &  0,51 & 0,16 & 0,22 & 0,43 & 0,50 & 0,57 & 0,86 \\
lift       &  1,34 & 0,11 & 1,21 & 1,23 & 1,28 & 1,46 & 1,50 \\
conviction &  1,35 & 0,28 & 1,05 & 1,14 & 1,27 & 1,42 & 2,17 \\
\bottomrule
\end{tabular}



Extend the dataframe with the number of items in each rule's antecedent and consequent.

In [33]:
# Record how many items sit on each side of every rule.
rules2['length ant.'] = rules2['antecedents'].apply(len)
rules2['length cons.'] = rules2['consequents'].apply(len)

#### Interesting rules according to max. support

In [34]:
# Ten rules with the highest support.
y1 = rules2.sort_values('support', ascending=False).iloc[:10]

#### Interesting rules according to max. confidence

In [35]:
# Ten rules with the highest confidence; ties are ordered by support.
y2 = rules2.sort_values(['confidence', 'support'], ascending=False).iloc[:10]

#### Interesting rules according to max. lift

In [36]:
# Ten rules with the highest lift; ties are ordered by support.
y3 = rules2.sort_values(['lift', 'support'], ascending=False).iloc[:10]

#### Interesting rules according to max. conviction

In [37]:
# Ten rules with the highest conviction; ties are ordered by support.
y4 = rules2.sort_values(['conviction', 'support'], ascending=False).iloc[:10]

Concatenate the top-10 rules from each ranking and remove duplicates.

In [38]:
# Stack the four top-10 selections; a rule picked by several rankings
# appears once per ranking until duplicates are dropped.
rules_all2 = pd.concat((y1, y2, y3, y4))

In [39]:
# Row count of the stacked selections, duplicates included.
len(rules_all2)

40

In [40]:
# Number of distinct rules once repeated rows are dropped (result is not stored).
len(rules_all2.drop_duplicates())

16

In [41]:
# Display the de-duplicated rules; `rules_all2` itself still holds the repeated rows.
rules_all2.drop_duplicates()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
2,"(QE_I06_B, QE_I17_A)",(QE_I25_A),0.60733,0.314136,0.235602,0.387931,1.234914,0.044818,1.120566,2,1
3,(QE_I25_A),"(QE_I06_B, QE_I17_A)",0.314136,0.60733,0.235602,0.75,1.234914,0.044818,1.570681,1,2
21,(QE_I04_D),(QE_I05_D),0.387435,0.387435,0.219895,0.567568,1.464938,0.06979,1.416558,1,1
20,(QE_I05_D),(QE_I04_D),0.387435,0.387435,0.219895,0.567568,1.464938,0.06979,1.416558,1,1
19,(QE_I09_B),(QE_I06_B),0.230366,0.704188,0.198953,0.863636,1.226428,0.036731,2.169284,1,1
18,(QE_I06_B),(QE_I09_B),0.704188,0.230366,0.198953,0.282528,1.226428,0.036731,1.072702,1,1
10,"(QE_I17_A, QE_I05_D)",(QE_I04_D),0.332461,0.387435,0.180628,0.543307,1.40232,0.051821,1.341307,2,1
13,(QE_I04_D),"(QE_I17_A, QE_I05_D)",0.387435,0.332461,0.180628,0.466216,1.40232,0.051821,1.25058,1,2
12,(QE_I05_D),"(QE_I17_A, QE_I04_D)",0.387435,0.324607,0.180628,0.466216,1.436247,0.054864,1.265293,1,2
11,"(QE_I17_A, QE_I04_D)",(QE_I05_D),0.324607,0.387435,0.180628,0.556452,1.436247,0.054864,1.381057,2,1
