In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. `sns.set()` re-applies its own default
# context ("notebook") and style every time it runs, so calling it *after*
# `sns.set_context("poster")` silently discarded the poster context. Passing
# context, style and rc together keeps all three settings: poster-sized
# elements, the whitegrid style and a 16x9 default figure size.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option('display.max_colwidth', 1000)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort, find_itemsets_all, find_itemsets_any, \
closed_freq_itemsets_sort, association_rules_ext, filter_rules, find_itemsets_without

In [3]:
# Load the two preprocessed, one-hot encoded tables analysed in this notebook.
# `dfw` is the "white" subset and `dfb` the "nowhite" subset, per the file names
# and the section headings further down. Paths are relative to the notebook's
# working directory.
dfw = pd.read_csv('../data/preprocessed/enade_2016a2018_pub_onehot_white.csv')
dfb = pd.read_csv('../data/preprocessed/enade_2016a2018_pub_onehot_nowhite.csv')

In [4]:
dfw.shape

(193, 64)

In [5]:
dfb.shape

(49, 60)

In [6]:
dfw.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_E,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I08_G,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_G,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [7]:
dfb.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_E,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_D,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


## Analysis considering white students (dfw dataframe)

### Generate frequent itemsets

In [8]:
# Minimum support threshold for the frequent-itemset search on `dfw`.
# With 193 rows in `dfw`, the lowest support reported below (0.051813)
# corresponds to 10 students.
support=0.05

In [9]:
# Mine frequent itemsets from the white-student table with the project helper,
# using column names as item labels and ordering the result by itemset length
# (longest first, as the head/tail displays below show). The result carries
# `support`, `itemsets`, `isclosed` and `length` columns.
iset = closed_freq_itemsets_sort(dfw, sort_by='length', min_support=support, use_colnames=True)

In [10]:
len(iset)

2030

In [11]:
iset.head()

Unnamed: 0,support,itemsets,isclosed,length
1154,0.051813,"(QE_I07_D, TP_SEXO_F, QE_I05_E, QE_I17_B, QE_I09_B, QE_I06_B)",True,6
1395,0.056995,"(NT_GER_Q1, TP_SEXO_F, QE_I08_D, QE_I17_B, QE_I09_B, QE_I06_D)",True,6
917,0.051813,"(NT_GER_Q1, QE_I04_E, TP_SEXO_F, QE_I23_B, QE_I17_B, QE_I09_B)",True,6
1232,0.051813,"(NT_GER_Q1, TP_SEXO_F, QE_I05_E, QE_I23_B, QE_I17_B, QE_I09_B)",True,6
1281,0.056995,"(NT_GER_Q1, QE_I04_E, TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I06_D)",True,6


In [12]:
iset.tail()

Unnamed: 0,support,itemsets,isclosed,length
20,0.735751,(TP_SEXO_F),True,1
19,0.150259,(QE_I08_B),True,1
18,0.217617,(QE_I07_C),True,1
17,0.274611,(QE_I22_C),True,1
0,0.704663,(QE_I09_B),True,1


### Discover association rules

In [13]:
# Rule-selection criterion handed to `association_rules_ext` in the next cell:
# the metric name and its minimum value.
# presumably rules with lift below 1.1 are discarded (the smallest lift in the
# summary below is 1.100268) — verify against enadepy.frequent.
use_metric='lift'
threshold=1.1

In [14]:
# Derive association rules from the mined itemsets using the metric/threshold
# defined in the previous cell. Besides the usual rule metrics, the result has
# A_length/C_length and A_isclosed/C_isclosed columns (see `rules.head()` below).
rules = association_rules_ext(iset, metric=use_metric, min_threshold=threshold)

#### Summary of the metrics of interest

In [15]:
# Descriptive statistics of the four rule-quality metrics, one metric per row.
# Rules with confidence 1.0 have infinite conviction, which is why the
# conviction row reports an `inf` mean/max and a NaN standard deviation.
summary = rules[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,13366.0,0.0696,0.02641,0.051813,0.051813,0.062176,0.072539,0.455959
confidence,13366.0,0.394279,0.240548,0.070423,0.2,0.333333,0.55,1.0
lift,13366.0,1.627378,0.488616,1.100268,1.27431,1.489825,1.828717,5.848485
conviction,13366.0,inf,,1.008871,1.069796,1.168252,1.444799,inf


Generate the LaTeX code for the summary table above.

In [16]:
# Emit the summary as LaTeX: the `count` column is dropped and values are
# formatted to two decimals with a comma as the decimal separator. `print` is
# used so the LaTeX source is shown as raw text rather than as a quoted string.
print(summary.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,07 & 0,03 & 0,05 & 0,05 & 0,06 & 0,07 & 0,46 \\
confidence &  0,39 & 0,24 & 0,07 & 0,20 & 0,33 & 0,55 & 1,00 \\
lift       &  1,63 & 0,49 & 1,10 & 1,27 & 1,49 & 1,83 & 5,85 \\
conviction &   inf &  NaN & 1,01 & 1,07 & 1,17 & 1,44 &  inf \\
\bottomrule
\end{tabular}



In [17]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
0,"(QE_I07_D, TP_SEXO_F, QE_I05_E, QE_I17_B, QE_I09_B)",(QE_I06_B),0.07772,0.455959,0.051813,0.666667,1.462121,0.016376,1.632124,5,1,True,True
1,"(QE_I07_D, TP_SEXO_F, QE_I05_E, QE_I17_B, QE_I06_B)",(QE_I09_B),0.051813,0.704663,0.051813,1.0,1.419118,0.015302,inf,5,1,False,True
2,"(QE_I07_D, TP_SEXO_F, QE_I05_E, QE_I09_B, QE_I06_B)",(QE_I17_B),0.056995,0.507772,0.051813,0.909091,1.790353,0.022873,5.414508,5,1,True,True
3,"(QE_I07_D, TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I06_B)",(QE_I05_E),0.082902,0.290155,0.051813,0.625,2.154018,0.027759,1.892919,5,1,True,True
4,"(QE_I07_D, QE_I05_E, QE_I17_B, QE_I09_B, QE_I06_B)",(TP_SEXO_F),0.062176,0.735751,0.051813,0.833333,1.132629,0.006067,1.585492,5,1,True,True


Check the rules with the greatest support, ordered by support, then conviction, then lift.

In [18]:
# 80th percentile of support across all mined rules.
# NOTE(review): `max_support` is not referenced by any later cell visible in
# this notebook (the next cell calls `filter_rules` without it), whereas the
# non-white section filters on its 0.99 quantile — confirm whether a support
# filter was intended here.
max_support = rules.support.quantile(0.8)
max_support

0.07772020725388601

In [19]:
# Rank the rules with the project helper `filter_rules`, keyed on support,
# conviction and lift (in that order), and display the first ten. The rows
# shown below come out in descending order of support.
filtered = filter_rules(rules, by=['support', 'conviction', 'lift'])
filtered.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
13363,(QE_I17_B),(QE_I09_B),0.507772,0.704663,0.455959,0.897959,1.27431,0.09815,2.894301,1,1,True,True
12059,"(TP_SEXO_F, QE_I17_B)",(QE_I09_B),0.398964,0.704663,0.352332,0.883117,1.253247,0.071197,2.52677,2,1,True,True
13359,(QE_I25_E),(QE_I09_B),0.393782,0.704663,0.310881,0.789474,1.120356,0.033397,1.40285,1,1,True,True
13364,(NT_GER_Q1),(QE_I17_B),0.492228,0.507772,0.300518,0.610526,1.202363,0.050579,1.263829,1,1,True,True
12063,"(NT_GER_Q1, QE_I17_B)",(QE_I09_B),0.300518,0.704663,0.290155,0.965517,1.370183,0.078391,8.564767,2,1,True,True
12072,"(TP_SEXO_F, NT_GER_Q1)",(QE_I09_B),0.362694,0.704663,0.284974,0.785714,1.115021,0.029397,1.378238,2,1,True,True
13267,(QE_I04_E),(QE_I09_B),0.316062,0.704663,0.274611,0.868852,1.233004,0.051894,2.251943,1,1,True,True
13206,(QE_I04_D),(TP_SEXO_F),0.321244,0.735751,0.26943,0.83871,1.139936,0.033075,1.638342,1,1,True,True
13360,(QE_I06_B),(NT_GER_Q0),0.455959,0.507772,0.259067,0.568182,1.11897,0.027544,1.139896,1,1,True,True
12053,"(TP_SEXO_F, QE_I25_E)",(QE_I09_B),0.305699,0.704663,0.253886,0.830508,1.178589,0.038471,1.742487,2,1,True,True


Check the rules with the greatest conviction, excluding the ten rules shown above.

In [20]:
# Remove the ten rules displayed in the previous cell (matched by index label),
# then rank the remainder by conviction, support and lift and show the first ten.
filtered_conv = filter_rules(rules.drop(index=filtered.head(10).index), by=['conviction', 'support', 'lift'])
filtered_conv.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
8716,"(NT_GER_Q1, QE_I05_E, QE_I17_B)",(QE_I09_B),0.124352,0.704663,0.124352,1.0,1.419118,0.036726,inf,3,1,False,True
9233,"(TP_SEXO_F, NT_GER_Q1, QE_I05_E)",(QE_I09_B),0.124352,0.704663,0.124352,1.0,1.419118,0.036726,inf,3,1,False,True
9652,"(QE_I07_A, NT_GER_Q1, QE_I17_B)",(QE_I09_B),0.124352,0.704663,0.124352,1.0,1.419118,0.036726,inf,3,1,False,True
3822,"(TP_SEXO_F, NT_GER_Q1, QE_I05_E, QE_I17_B)",(QE_I09_B),0.103627,0.704663,0.103627,1.0,1.419118,0.030605,inf,4,1,False,True
5157,"(QE_I23_B, NT_GER_Q1, QE_I17_B)",(QE_I09_B),0.103627,0.704663,0.103627,1.0,1.419118,0.030605,inf,3,1,False,True
7854,"(NT_GER_Q1, QE_I04_E, QE_I05_E)",(QE_I09_B),0.098446,0.704663,0.098446,1.0,1.419118,0.029075,inf,3,1,False,True
5232,"(NT_GER_Q1, QE_I08_D, QE_I17_B)",(QE_I09_B),0.093264,0.704663,0.093264,1.0,1.419118,0.027544,inf,3,1,False,True
2508,"(QE_I07_A, TP_SEXO_F, NT_GER_Q1, QE_I17_B)",(QE_I09_B),0.088083,0.704663,0.088083,1.0,1.419118,0.026014,inf,4,1,False,True
6829,"(QE_I04_D, NT_GER_Q1, QE_I17_B)",(QE_I09_B),0.088083,0.704663,0.088083,1.0,1.419118,0.026014,inf,3,1,False,True
8622,"(QE_I23_B, QE_I04_E, QE_I17_B)",(QE_I09_B),0.088083,0.704663,0.088083,1.0,1.419118,0.026014,inf,3,1,False,True


In [21]:
# From the conviction-ranked rules, show up to 20 whose consequents avoid
# QE_I09_B, QE_I17_B and TP_SEXO_F, which appear as consequents in most of the
# rows listed above.
# presumably `find_itemsets_without` drops rows whose `consequents` contain any
# of the listed items — verify against enadepy.frequent.
find_itemsets_without(filtered_conv, {"QE_I09_B", "QE_I17_B", "TP_SEXO_F"}, col_name="consequents").head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
12074,"(QE_I05_B, TP_SEXO_F)",(QE_I17_A),0.062176,0.404145,0.062176,1.0,2.474359,0.037048,inf,2,1,False,True
1811,"(QE_I23_C, QE_I09_B, QE_I17_B, QE_I07_A)",(NT_GER_Q1),0.062176,0.492228,0.062176,1.0,2.031579,0.031571,inf,4,1,False,True
4302,"(QE_I07_C, QE_I05_D, QE_I25_E)",(QE_I06_B),0.056995,0.455959,0.056995,1.0,2.193182,0.031008,inf,3,1,False,True
9674,"(QE_I07_D, QE_I04_D, QE_I17_A)",(QE_I06_B),0.051813,0.455959,0.051813,1.0,2.193182,0.028189,inf,3,1,False,True
12105,"(TP_SEXO_F, QE_I04_B)",(QE_I17_A),0.082902,0.404145,0.07772,0.9375,2.319712,0.044216,9.533679,2,1,True,True
13338,(QE_I05_B),(QE_I17_A),0.07772,0.404145,0.072539,0.933333,2.309402,0.041129,8.937824,1,1,True,True
12520,"(QE_I07_D, QE_I04_D)",(QE_I06_B),0.082902,0.455959,0.07772,0.9375,2.056108,0.039921,8.704663,2,1,True,True
9975,"(QE_I07_A, QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.088083,0.492228,0.082902,0.941176,1.912074,0.039545,8.632124,3,1,True,True
8781,"(NT_GER_Q1, QE_I06_A, QE_I17_B)",(QE_I07_A),0.056995,0.238342,0.051813,0.909091,3.814229,0.038229,8.378238,3,1,False,True
8107,"(TP_SEXO_F, NT_GER_Q0, QE_I06_C)",(QE_I17_A),0.072539,0.404145,0.067358,0.928571,2.297619,0.038041,8.341969,3,1,True,True


#### Associations containing grade information:

In [22]:
# Select rules whose consequents involve a grade item (NT_GER_Q0 or NT_GER_Q1)
# and show up to 30 of them in `filter_rules`' default ordering.
# NOTE(review): the default `by` of `filter_rules` is not visible here; the
# output below is consistent with conviction-first ordering — confirm.
grades_rules = find_itemsets_any(rules, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
1811,"(QE_I23_C, QE_I09_B, QE_I17_B, QE_I07_A)",(NT_GER_Q1),0.062176,0.492228,0.062176,1.0,2.031579,0.031571,inf,4,1,False,True
67,"(TP_SEXO_F, QE_I08_D, QE_I17_B, QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.056995,0.492228,0.056995,1.0,2.031579,0.02894,inf,5,1,False,True
9975,"(QE_I07_A, QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.088083,0.492228,0.082902,0.941176,1.912074,0.039545,8.632124,3,1,True,True
1642,"(QE_I07_A, QE_I09_B, QE_I17_B, QE_I06_D)",(NT_GER_Q1),0.07772,0.492228,0.072539,0.933333,1.89614,0.034283,7.61658,4,1,True,True
987,"(QE_I07_A, TP_SEXO_F, QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.067358,0.492228,0.062176,0.923077,1.875304,0.029021,6.601036,4,1,True,True
1240,"(QE_I09_B, QE_I08_D, QE_I17_B, QE_I06_D)",(NT_GER_Q1),0.067358,0.492228,0.062176,0.923077,1.875304,0.029021,6.601036,4,1,True,True
12681,"(QE_I07_D, QE_I05_D)",(NT_GER_Q0),0.067358,0.507772,0.062176,0.923077,1.817896,0.027974,6.398964,2,1,True,True
500,"(QE_I07_A, TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.062176,0.492228,0.056995,0.916667,1.862281,0.02639,6.093264,5,1,True,True
1464,"(TP_SEXO_F, QE_I09_B, QE_I08_D, QE_I06_D)",(NT_GER_Q1),0.062176,0.492228,0.056995,0.916667,1.862281,0.02639,6.093264,4,1,True,True
1494,"(TP_SEXO_F, QE_I08_D, QE_I17_B, QE_I06_D)",(NT_GER_Q1),0.062176,0.492228,0.056995,0.916667,1.862281,0.02639,6.093264,4,1,True,True


In [23]:
# Same grade-related subset as the previous cell (recomputed here with
# identical arguments), this time ranked by support and then conviction.
grades_rules = find_itemsets_any(rules, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules, by=['support', 'conviction']).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
13365,(QE_I17_B),(NT_GER_Q1),0.507772,0.492228,0.300518,0.591837,1.202363,0.050579,1.244041,1,1,True,True
12067,(QE_I17_B),"(NT_GER_Q1, QE_I09_B)",0.507772,0.373057,0.290155,0.571429,1.531746,0.100728,1.462867,1,2,True,True
12073,(QE_I09_B),"(TP_SEXO_F, NT_GER_Q1)",0.704663,0.362694,0.284974,0.404412,1.115021,0.029397,1.070044,1,2,True,True
13360,(QE_I06_B),(NT_GER_Q0),0.455959,0.507772,0.259067,0.568182,1.11897,0.027544,1.139896,1,1,True,True
13284,(QE_I17_A),(NT_GER_Q0),0.404145,0.507772,0.238342,0.589744,1.161434,0.033128,1.199806,1,1,True,True
12069,"(TP_SEXO_F, QE_I17_B)",(NT_GER_Q1),0.398964,0.492228,0.233161,0.584416,1.187286,0.03678,1.221826,2,1,True,True
8917,"(TP_SEXO_F, QE_I17_B)","(NT_GER_Q1, QE_I09_B)",0.398964,0.373057,0.222798,0.558442,1.496934,0.073962,1.419842,2,2,True,True
13341,(QE_I05_D),(NT_GER_Q0),0.331606,0.507772,0.196891,0.59375,1.169324,0.028511,1.211638,1,1,True,True
13245,(QE_I04_E),(NT_GER_Q1),0.316062,0.492228,0.19171,0.606557,1.232269,0.036135,1.290587,1,1,True,True
13208,(QE_I04_D),(NT_GER_Q0),0.321244,0.507772,0.186528,0.580645,1.143515,0.02341,1.173774,1,1,True,True


#### Interesting rules according to max. support

In [24]:
#t1 = rules.sort_values(by=[ 'support'], ascending=False).head(10)

#### Interesting rules according to max. confidence

In [25]:
#t2 = rules.sort_values(by=[ 'confidence', 'support'], ascending=False).head(10)

#### Interesting rules according to max. lift

In [26]:
#t3 = rules.sort_values(by=[ 'lift', 'support'], ascending=False).head(10)

#### Interesting rules according to max. conviction

In [27]:
#t4 = rules.sort_values(by=['conviction', 'support'], ascending=False).head(10)

Concatenate all top rules and remove duplicates

In [28]:
#rules_all = pd.concat([t1, t2, t3, t4])

In [29]:
#len(rules_all)

In [30]:
#len(rules_all.drop_duplicates())

In [31]:
#rules_all.drop_duplicates()

## Analysis considering non-white students (dfb dataframe)

### Generate frequent itemsets

In [32]:
# Minimum support for the non-white analysis (reassigns `support` to the same
# value used for the white-student section).
# With only 49 rows in `dfb`, the lowest support reported below (0.061224)
# corresponds to 3 students, so even rules with confidence 1.0 in this section
# can rest on very few observations.
support=0.05

In [33]:
# Mine frequent itemsets from the non-white table with the same helper and
# settings as for `dfw`; the result is ordered by itemset length (longest
# first, as the head/tail displays below show).
iset_b = closed_freq_itemsets_sort(dfb, sort_by='length', min_support=support, use_colnames=True)

In [34]:
len(iset_b)

2818

In [35]:
iset_b.head()

Unnamed: 0,support,itemsets,isclosed,length
1039,0.061224,"(QE_I04_D, QE_I25_E, QE_I05_D, TP_SEXO_F, QE_I09_C, QE_I17_A, QE_I06_B)",True,7
1741,0.061224,"(QE_I04_D, QE_I05_D, QE_I07_C, QE_I23_B, QE_I09_C, QE_I17_A, QE_I06_B)",True,7
209,0.061224,"(QE_I04_D, QE_I05_D, TP_SEXO_F, QE_I23_B, QE_I09_C, QE_I17_A, QE_I06_B)",True,7
2046,0.061224,"(QE_I23_C, QE_I05_B, TP_SEXO_F, NT_GER_Q0, QE_I25_H, QE_I17_A, QE_I06_B)",True,7
225,0.061224,"(QE_I04_D, NT_GER_Q1, QE_I05_D, QE_I23_B, QE_I09_C, QE_I17_A, QE_I06_B)",True,7


In [36]:
iset_b.tail()

Unnamed: 0,support,itemsets,isclosed,length
17,0.102041,(QE_I04_E),True,1
16,0.142857,(QE_I07_B),False,1
44,0.102041,(QE_I07_F),False,1
45,0.081633,(QE_I09_E),True,1
0,0.734694,(TP_SEXO_F),True,1


### Discover association rules

In [37]:
# Rule-selection criterion for the non-white analysis (metric name and minimum
# value), identical to the values used for the white-student section.
use_metric_b='lift'
threshold_b=1.1

In [38]:
# Derive association rules from the non-white itemsets using the
# metric/threshold defined in the previous cell.
rules_b = association_rules_ext(iset_b, metric=use_metric_b, min_threshold=threshold_b)

#### Summary of the metrics of interest

In [39]:
# Descriptive statistics of the four rule-quality metrics, one metric per row.
# Rules with confidence 1.0 have infinite conviction, which is why the
# conviction row reports an `inf` mean/max and a NaN standard deviation.
summary_b = rules_b[['support', 'confidence', 'lift', 'conviction']].describe().transpose()
summary_b

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,32510.0,0.070284,0.0208,0.061224,0.061224,0.061224,0.081633,0.489796
confidence,32510.0,0.484336,0.275952,0.083333,0.25,0.428571,0.714286,1.0
lift,32510.0,2.740376,1.768976,1.1025,1.615385,2.153846,3.266667,16.333333
conviction,32510.0,inf,,1.016112,1.142857,1.392857,2.193878,inf


Generate the LaTeX code for the summary table above.

In [40]:
# Emit the summary as LaTeX: the `count` column is dropped and values are
# formatted to two decimals with a comma as the decimal separator. `print` is
# used so the LaTeX source is shown as raw text rather than as a quoted string.
print(summary_b.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &   max \\
\midrule
support    &  0,07 & 0,02 & 0,06 & 0,06 & 0,06 & 0,08 &  0,49 \\
confidence &  0,48 & 0,28 & 0,08 & 0,25 & 0,43 & 0,71 &  1,00 \\
lift       &  2,74 & 1,77 & 1,10 & 1,62 & 2,15 & 3,27 & 16,33 \\
conviction &   inf &  NaN & 1,02 & 1,14 & 1,39 & 2,19 &   inf \\
\bottomrule
\end{tabular}



In [41]:
rules_b.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
0,"(QE_I04_D, QE_I25_E, QE_I05_D, TP_SEXO_F, QE_I09_C, QE_I17_A)",(QE_I06_B),0.081633,0.44898,0.061224,0.75,1.670455,0.024573,2.204082,6,1,True,True
1,"(QE_I04_D, QE_I25_E, QE_I05_D, TP_SEXO_F, QE_I09_C, QE_I06_B)",(QE_I17_A),0.061224,0.571429,0.061224,1.0,1.75,0.026239,inf,6,1,False,True
2,"(QE_I04_D, QE_I25_E, QE_I05_D, TP_SEXO_F, QE_I17_A, QE_I06_B)",(QE_I09_C),0.061224,0.387755,0.061224,1.0,2.578947,0.037484,inf,6,1,False,True
3,"(QE_I04_D, QE_I25_E, QE_I05_D, QE_I09_C, QE_I17_A, QE_I06_B)",(TP_SEXO_F),0.061224,0.734694,0.061224,1.0,1.361111,0.016243,inf,6,1,False,True
4,"(QE_I04_D, QE_I25_E, TP_SEXO_F, QE_I09_C, QE_I17_A, QE_I06_B)",(QE_I05_D),0.061224,0.326531,0.061224,1.0,3.0625,0.041233,inf,6,1,False,True


Check the rules with the greatest support (at or above the 99th percentile of support), sorted by support.

In [42]:
# 99th percentile of rule support; the next cell uses it as the cut-off for
# high-support rules.
# NOTE(review): the white-student section computes the 0.8 quantile instead —
# confirm that the differing quantiles are intentional.
max_support_b = rules_b.support.quantile(0.99)
max_support_b

0.16326530612244897

In [43]:
# Keep the rules whose support is at or above `max_support_b`, order them by
# descending support and display the first ten.
# NOTE(review): the white-student counterpart (`filtered`) is built with
# `filter_rules` instead of a quantile cut-off — the two sections are not
# directly comparable as written.
filtered_b = rules_b.query('support >= @max_support_b').sort_values(by='support', ascending=False)
filtered_b.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
32258,(TP_SEXO_F),(QE_I17_A),0.734694,0.571429,0.489796,0.666667,1.166667,0.069971,1.285714,1,1,True,True
32259,(QE_I17_A),(TP_SEXO_F),0.571429,0.734694,0.489796,0.857143,1.166667,0.069971,1.857143,1,1,True,True
32328,(QE_I17_A),(NT_GER_Q0),0.571429,0.510204,0.346939,0.607143,1.19,0.055394,1.246753,1,1,True,True
32329,(NT_GER_Q0),(QE_I17_A),0.510204,0.571429,0.346939,0.68,1.19,0.055394,1.339286,1,1,True,True
28351,(TP_SEXO_F),"(QE_I17_A, NT_GER_Q0)",0.734694,0.346939,0.306122,0.416667,1.20098,0.051229,1.119534,1,2,True,True
28353,(NT_GER_Q0),"(TP_SEXO_F, QE_I17_A)",0.510204,0.489796,0.306122,0.6,1.225,0.056227,1.27551,1,2,True,True
32324,(QE_I17_A),(QE_I06_B),0.571429,0.44898,0.306122,0.535714,1.193182,0.049563,1.186813,1,1,True,True
32325,(QE_I06_B),(QE_I17_A),0.44898,0.571429,0.306122,0.681818,1.193182,0.049563,1.346939,1,1,True,True
28348,"(TP_SEXO_F, QE_I17_A)",(NT_GER_Q0),0.489796,0.510204,0.306122,0.625,1.225,0.056227,1.306122,2,1,True,True
28349,"(TP_SEXO_F, NT_GER_Q0)",(QE_I17_A),0.387755,0.571429,0.306122,0.789474,1.381579,0.084548,2.035714,2,1,True,True


Check the rules with the greatest conviction, excluding the ten rules shown above and keeping one rule per distinct consequent.

In [44]:
# Drop the ten rules shown in the previous cell (matched by index label), rank
# the rest by conviction, support and lift, and keep only the first rule for
# each distinct consequent (`drop_duplicates` retains the first occurrence by
# default); display up to 20.
# NOTE(review): the white-student counterpart (`filtered_conv`) does not
# deduplicate by consequent.
filtered_conv_b = filter_rules(rules_b.drop(index=filtered_b.head(10).index), by=['conviction', 'support', 'lift']).drop_duplicates(subset='consequents')
filtered_conv_b.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
27725,"(QE_I23_C, QE_I17_A)",(TP_SEXO_F),0.183673,0.734694,0.183673,1.0,1.361111,0.04873,inf,2,1,False,True
22949,"(TP_SEXO_F, QE_I09_C, QE_I06_B)",(QE_I17_A),0.163265,0.571429,0.163265,1.0,1.75,0.069971,inf,3,1,False,True
31013,"(QE_I07_D, QE_I23_B)",(QE_I08_B),0.142857,0.367347,0.142857,1.0,2.722222,0.090379,inf,2,1,False,True
27073,"(NT_GER_Q1, QE_I17_B, QE_I23_B)",(QE_I09_B),0.142857,0.387755,0.142857,1.0,2.578947,0.087464,inf,3,1,False,True
27500,"(QE_I09_C, QE_I06_C)",(NT_GER_Q0),0.142857,0.510204,0.142857,1.0,1.96,0.069971,inf,2,1,False,True
28450,"(QE_I07_A, NT_GER_Q1)",(QE_I06_D),0.122449,0.163265,0.122449,1.0,6.125,0.102457,inf,2,1,False,True
17277,"(NT_GER_Q0, QE_I08_B, QE_I23_B)",(QE_I07_D),0.122449,0.265306,0.122449,1.0,3.769231,0.089963,inf,3,1,False,True
31760,"(TP_SEXO_F, QE_I08_A)",(QE_I23_C),0.122449,0.285714,0.122449,1.0,3.5,0.087464,inf,2,1,False,True
30590,"(QE_I04_D, QE_I22_B)",(QE_I05_D),0.122449,0.326531,0.122449,1.0,3.0625,0.082466,inf,2,1,False,True
20674,"(QE_I25_H, QE_I06_B)","(TP_SEXO_F, QE_I17_A)",0.122449,0.489796,0.122449,1.0,2.041667,0.062474,inf,2,2,False,True


#### Associations containing grade information:

In [45]:
# Select rules whose consequents involve a grade item (NT_GER_Q0 or NT_GER_Q1)
# and show up to 30 of them in `filter_rules`' default ordering.
# NOTE(review): the default `by` of `filter_rules` is not visible here; the
# output below is consistent with conviction-first ordering — confirm.
grades_rules_b = find_itemsets_any(rules_b, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules_b).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
27500,"(QE_I09_C, QE_I06_C)",(NT_GER_Q0),0.142857,0.510204,0.142857,1.0,1.96,0.069971,inf,2,1,False,True
28218,"(QE_I05_B, QE_I23_C)",(NT_GER_Q0),0.142857,0.510204,0.142857,1.0,1.96,0.069971,inf,2,1,False,True
31882,"(QE_I05_B, QE_I06_C)",(NT_GER_Q0),0.142857,0.510204,0.142857,1.0,1.96,0.069971,inf,2,1,False,True
28464,"(QE_I09_B, QE_I06_D)",(NT_GER_Q1),0.122449,0.489796,0.122449,1.0,2.041667,0.062474,inf,2,1,False,True
25264,"(QE_I05_B, TP_SEXO_F, QE_I23_C)",(NT_GER_Q0),0.122449,0.510204,0.122449,1.0,1.96,0.059975,inf,3,1,False,True
29670,"(QE_I07_D, QE_I09_C)",(NT_GER_Q0),0.122449,0.510204,0.122449,1.0,1.96,0.059975,inf,2,1,False,True
24771,"(QE_I07_A, QE_I09_B)","(NT_GER_Q1, QE_I06_D)",0.102041,0.142857,0.102041,1.0,7.0,0.087464,inf,2,2,False,True
25867,"(QE_I08_A, QE_I06_C)","(QE_I23_C, NT_GER_Q0)",0.102041,0.204082,0.102041,1.0,4.9,0.081216,inf,2,2,False,True
28493,"(QE_I07_A, QE_I09_B)",(NT_GER_Q1),0.102041,0.489796,0.102041,1.0,2.041667,0.052062,inf,2,1,False,True
21079,"(QE_I05_B, QE_I17_A, QE_I04_B)",(NT_GER_Q0),0.102041,0.510204,0.102041,1.0,1.96,0.049979,inf,3,1,False,True


In [46]:
# Same grade-related subset as the previous cell (recomputed here with
# identical arguments), this time ranked by support and then conviction.
grades_rules_b = find_itemsets_any(rules_b, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules_b, by=['support', 'conviction']).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
32328,(QE_I17_A),(NT_GER_Q0),0.571429,0.510204,0.346939,0.607143,1.19,0.055394,1.246753,1,1,True,True
28352,(QE_I17_A),"(TP_SEXO_F, NT_GER_Q0)",0.571429,0.387755,0.306122,0.535714,1.381579,0.084548,1.318681,1,2,True,True
32184,(QE_I06_C),(NT_GER_Q0),0.346939,0.510204,0.265306,0.764706,1.498824,0.088297,2.081633,1,1,True,True
32017,(QE_I17_B),(NT_GER_Q1),0.306122,0.489796,0.244898,0.8,1.633333,0.09496,2.55102,1,1,True,True
32117,(QE_I09_B),(NT_GER_Q1),0.387755,0.489796,0.244898,0.631579,1.289474,0.054977,1.38484,1,1,True,True
32462,(QE_I09_C),(NT_GER_Q0),0.387755,0.510204,0.244898,0.631579,1.237895,0.047064,1.329446,1,1,True,True
32260,(QE_I04_D),(NT_GER_Q1),0.428571,0.489796,0.244898,0.571429,1.166667,0.034985,1.190476,1,1,True,True
32122,(QE_I05_B),(NT_GER_Q0),0.285714,0.510204,0.22449,0.785714,1.54,0.078717,2.285714,1,1,True,True
32391,(QE_I04_B),(NT_GER_Q0),0.244898,0.510204,0.204082,0.833333,1.633333,0.079134,2.938776,1,1,True,True
32008,(QE_I07_D),(NT_GER_Q0),0.265306,0.510204,0.204082,0.769231,1.507692,0.068721,2.122449,1,1,True,True
