In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option('display.max_colwidth', 1000)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort, find_itemsets_all, find_itemsets_any, \
closed_freq_itemsets_sort, association_rules_ext, filter_rules, find_itemsets_without

In [3]:
dfw = pd.read_csv('../data/preprocessed/enade_2016a2018_priv_onehot_white.csv')
dfb = pd.read_csv('../data/preprocessed/enade_2016a2018_priv_onehot_nowhite.csv')

In [4]:
dfw.shape

(1112, 64)

In [5]:
dfb.shape

(382, 63)

In [6]:
dfw.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I07_H,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I08_G,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I17_F,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_G,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [7]:
dfb.head()

Unnamed: 0,QE_I04_A,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_A,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I06_F,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I07_F,QE_I07_G,QE_I07_H,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I17_A,QE_I17_B,QE_I17_C,QE_I17_D,QE_I17_E,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_A,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_D,QE_I25_E,QE_I25_F,QE_I25_H,TP_SEXO_F,TP_SEXO_M,NT_GER_Q0,NT_GER_Q1
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## Analysis considering white students (dfw dataframe)

### Generate frequent itemsets

In [8]:
support=0.05

In [9]:
iset = closed_freq_itemsets_sort(dfw, sort_by='length', min_support=support, use_colnames=True)

In [10]:
len(iset)

1626

In [11]:
iset.head()

Unnamed: 0,support,itemsets,isclosed,length
560,0.052158,"(QE_I05_D, QE_I06_B, QE_I04_D, QE_I17_A, QE_I07_C)",True,5
825,0.058453,"(TP_SEXO_M, QE_I06_B, QE_I04_D, NT_GER_Q0, QE_I17_A)",True,5
1144,0.051259,"(TP_SEXO_F, QE_I06_B, QE_I17_A, QE_I08_B, NT_GER_Q1)",True,5
421,0.053957,"(TP_SEXO_M, QE_I06_B, QE_I25_A, NT_GER_Q0, QE_I17_A)",True,5
696,0.052158,"(TP_SEXO_M, QE_I06_B, NT_GER_Q0, QE_I17_A, QE_I07_D)",True,5


In [12]:
iset.tail()

Unnamed: 0,support,itemsets,isclosed,length
24,0.252698,(QE_I08_C),True,1
23,0.26259,(QE_I23_C),True,1
22,0.283273,(QE_I22_C),True,1
21,0.328237,(QE_I25_E),True,1
0,0.651978,(QE_I17_A),True,1


### Discover association rules

In [13]:
use_metric='lift'
threshold=1.1

In [14]:
rules = association_rules_ext(iset, metric=use_metric, min_threshold=threshold)

#### Summary of the metrics of interest

In [15]:
summary = rules.loc[:, ['support', 'confidence', 'lift', 'conviction']].describe().T
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,4576.0,0.069633,0.022779,0.05036,0.054856,0.06205,0.075764,0.279676
confidence,4576.0,0.347747,0.204322,0.066508,0.184418,0.301065,0.462566,0.96875
lift,4576.0,1.294537,0.398859,1.100043,1.145267,1.204925,1.325954,14.990852
conviction,4576.0,1.186292,0.36508,1.006702,1.03826,1.080515,1.175512,9.710731


Get latex code for the table.

In [16]:
print(summary.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &   max \\
\midrule
support    &  0,07 & 0,02 & 0,05 & 0,05 & 0,06 & 0,08 &  0,28 \\
confidence &  0,35 & 0,20 & 0,07 & 0,18 & 0,30 & 0,46 &  0,97 \\
lift       &  1,29 & 0,40 & 1,10 & 1,15 & 1,20 & 1,33 & 14,99 \\
conviction &  1,19 & 0,37 & 1,01 & 1,04 & 1,08 & 1,18 &  9,71 \\
\bottomrule
\end{tabular}



In [17]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
0,"(QE_I05_D, QE_I17_A, QE_I04_D, QE_I06_B)",(QE_I07_C),0.127698,0.285971,0.052158,0.408451,1.428293,0.01564,1.207049,4,1,True,True
1,"(QE_I05_D, QE_I04_D, QE_I07_C, QE_I06_B)",(QE_I17_A),0.068345,0.651978,0.052158,0.763158,1.170526,0.007599,1.469424,4,1,True,True
2,"(QE_I05_D, QE_I17_A, QE_I07_C, QE_I06_B)",(QE_I04_D),0.079137,0.418165,0.052158,0.659091,1.576149,0.019066,1.706715,4,1,True,True
3,"(QE_I17_A, QE_I04_D, QE_I07_C, QE_I06_B)",(QE_I05_D),0.077338,0.399281,0.052158,0.674419,1.689084,0.021279,1.845067,4,1,True,True
4,"(QE_I05_D, QE_I04_D, QE_I06_B)","(QE_I17_A, QE_I07_C)",0.194245,0.198741,0.052158,0.268519,1.351098,0.013554,1.095392,3,2,True,True


Check rules with greatest support, according to max conviction

In [18]:
max_support = rules.support.quantile(0.8)
max_support

0.07913669064748201

In [19]:
filtered = filter_rules(rules, by=['support', 'conviction', 'lift'])
filtered.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
4510,(QE_I07_D),(QE_I06_B),0.321043,0.757194,0.279676,0.871148,1.150495,0.036584,1.884384,1,1,True,True
4512,(QE_I09_B),(QE_I06_B),0.321043,0.757194,0.273381,0.851541,1.1246,0.030289,1.635503,1,1,True,True
4384,(QE_I05_D),(QE_I04_D),0.399281,0.418165,0.239209,0.599099,1.432684,0.072243,1.451318,1,1,True,True
4549,(QE_I08_B),(QE_I17_A),0.307554,0.651978,0.235612,0.766082,1.175011,0.035093,1.487792,1,1,True,True
4533,(QE_I25_A),(QE_I17_A),0.280576,0.651978,0.206835,0.737179,1.130681,0.023905,1.32418,1,1,True,True
4515,(QE_I09_B),(TP_SEXO_F),0.321043,0.548561,0.205935,0.641457,1.169344,0.029823,1.259091,1,1,True,True
4495,(QE_I25_E),(NT_GER_Q1),0.328237,0.495504,0.195144,0.594521,1.199831,0.032501,1.244197,1,1,True,True
3916,"(QE_I05_D, QE_I06_B)",(QE_I04_D),0.320144,0.418165,0.194245,0.606742,1.45096,0.060371,1.479522,2,1,True,True
3329,"(QE_I08_B, QE_I06_B)",(QE_I17_A),0.240108,0.651978,0.183453,0.764045,1.171887,0.026908,1.474949,2,1,True,True
3333,"(NT_GER_Q0, TP_SEXO_M)",(QE_I17_A),0.240108,0.651978,0.181655,0.756554,1.160398,0.025109,1.429566,2,1,True,True


Check rules with greatest conviction, excluding the previous

In [20]:
filtered_conv = filter_rules(rules.drop(index=filtered.head(10).index), by=['conviction', 'support', 'lift'])
find_itemsets_without(filtered_conv, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='antecedents').head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
4566,(QE_I06_A),(QE_I07_A),0.055755,0.060252,0.05036,0.903226,14.990852,0.047,9.710731,1,1,True,True
3895,"(QE_I04_C, QE_I08_B)",(QE_I17_A),0.057554,0.651978,0.053957,0.9375,1.437931,0.016433,5.568345,2,1,True,True
4213,"(QE_I23_B, QE_I05_B)",(QE_I17_A),0.064748,0.651978,0.060252,0.930556,1.42728,0.018037,5.011511,2,1,True,True
2366,"(QE_I05_D, QE_I09_B, QE_I07_D)",(QE_I06_B),0.053058,0.757194,0.05036,0.949153,1.253513,0.010185,4.77518,3,1,True,True
2274,"(QE_I09_B, TP_SEXO_M, QE_I17_A)",(QE_I06_B),0.070144,0.757194,0.066547,0.948718,1.252939,0.013434,4.734712,3,1,True,True
4226,"(TP_SEXO_M, QE_I05_B)",(QE_I17_A),0.081835,0.651978,0.07464,0.912088,1.398954,0.021286,3.958746,2,1,True,True
3137,"(QE_I07_D, QE_I04_E)",(QE_I06_B),0.057554,0.757194,0.053957,0.9375,1.238124,0.010377,3.884892,2,1,True,True
2630,"(TP_SEXO_F, QE_I09_C, QE_I07_D)",(QE_I06_B),0.054856,0.757194,0.051259,0.934426,1.234064,0.009722,3.702788,3,1,True,True
4201,"(QE_I05_B, QE_I06_B)",(QE_I17_A),0.093525,0.651978,0.084532,0.903846,1.386313,0.023556,3.619424,2,1,True,True
3462,"(QE_I07_D, QE_I17_B)",(QE_I06_B),0.080036,0.757194,0.07464,0.932584,1.231631,0.014038,3.601619,2,1,True,True


In [21]:
find_itemsets_without(filtered_conv, {"QE_I06_B", "QE_I17_A"}, col_name="consequents").head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
4566,(QE_I06_A),(QE_I07_A),0.055755,0.060252,0.05036,0.903226,14.990852,0.047,9.710731,1,1,True,True
3016,"(QE_I25_E, QE_I23_E)",(QE_I22_E),0.081835,0.230216,0.059353,0.725275,3.150412,0.040513,2.802014,2,1,True,True
2130,"(QE_I22_E, QE_I17_A, NT_GER_Q1)",(QE_I23_E),0.072842,0.215827,0.052158,0.716049,3.317695,0.036437,2.761652,3,1,True,True
2074,"(QE_I22_E, TP_SEXO_F, QE_I17_A)",(QE_I23_E),0.084532,0.215827,0.059353,0.702128,3.253191,0.041108,2.63258,3,1,True,True
2174,"(QE_I22_E, QE_I17_A, QE_I06_B)",(QE_I23_E),0.11241,0.215827,0.076439,0.68,3.150667,0.052178,2.45054,3,1,True,True
3024,"(QE_I22_E, QE_I17_A)",(QE_I23_E),0.148381,0.215827,0.09982,0.672727,3.11697,0.067795,2.396083,2,1,True,True
3249,"(QE_I05_D, QE_I23_E)",(QE_I22_E),0.085432,0.230216,0.056655,0.663158,2.880592,0.036987,2.285297,2,1,True,True
2963,"(NT_GER_Q1, QE_I23_E)",(QE_I22_E),0.114209,0.230216,0.07554,0.661417,2.873031,0.049247,2.273549,2,1,True,True
1552,"(QE_I05_D, NT_GER_Q0, QE_I25_A)",(QE_I04_D),0.076439,0.418165,0.056655,0.741176,1.772448,0.024691,2.247997,3,1,True,True
3263,"(TP_SEXO_M, QE_I23_E)",(QE_I22_E),0.083633,0.230216,0.054856,0.655914,2.849126,0.035602,2.237185,2,1,True,True


#### Associations containing grade information:

In [22]:
grades_rules = find_itemsets_any(rules, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(filter_rules(grades_rules).query('support >= .15'))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
4547,(QE_I25_A),(NT_GER_Q0),0.280576,0.504496,0.169065,0.602564,1.194387,0.027515,1.246751,1,1,True,True
4495,(QE_I25_E),(NT_GER_Q1),0.328237,0.495504,0.195144,0.594521,1.199831,0.032501,1.244197,1,1,True,True
3334,"(QE_I17_A, TP_SEXO_M)",(NT_GER_Q0),0.31205,0.504496,0.181655,0.582133,1.153888,0.024226,1.185791,2,1,True,True
4485,(QE_I07_C),(NT_GER_Q1),0.285971,0.495504,0.157374,0.550314,1.110616,0.015674,1.121887,1,1,True,True


In [23]:
grades_rules = find_itemsets_any(rules, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules, by=['support', 'conviction']).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
4495,(QE_I25_E),(NT_GER_Q1),0.328237,0.495504,0.195144,0.594521,1.199831,0.032501,1.244197,1,1,True,True
3334,"(QE_I17_A, TP_SEXO_M)",(NT_GER_Q0),0.31205,0.504496,0.181655,0.582133,1.153888,0.024226,1.185791,2,1,True,True
4547,(QE_I25_A),(NT_GER_Q0),0.280576,0.504496,0.169065,0.602564,1.194387,0.027515,1.246751,1,1,True,True
4485,(QE_I07_C),(NT_GER_Q1),0.285971,0.495504,0.157374,0.550314,1.110616,0.015674,1.121887,1,1,True,True
2900,"(QE_I17_A, QE_I04_D)",(NT_GER_Q0),0.266187,0.504496,0.149281,0.560811,1.111625,0.01499,1.128224,2,1,True,True
4122,(QE_I07_D),"(NT_GER_Q0, QE_I06_B)",0.321043,0.38759,0.148381,0.462185,1.192458,0.023948,1.1387,1,2,True,True
3815,(QE_I17_A),"(QE_I05_D, NT_GER_Q0)",0.651978,0.20054,0.144784,0.222069,1.107357,0.014037,1.027675,1,2,True,True
4539,(QE_I17_B),(NT_GER_Q1),0.242806,0.495504,0.140288,0.577778,1.166042,0.019977,1.19486,1,1,True,True
4239,(QE_I09_B),"(NT_GER_Q1, QE_I06_B)",0.321043,0.369604,0.139388,0.434174,1.174699,0.02073,1.114115,1,2,True,True
2733,"(QE_I25_E, QE_I06_B)",(NT_GER_Q1),0.239209,0.495504,0.136691,0.571429,1.153228,0.018162,1.177158,2,1,True,True


#### Interesting rules according to max. support

In [24]:
#t1 = rules.sort_values(by=[ 'support'], ascending=False).head(10)

#### Interesting rules according to max. confidence

In [25]:
#t2 = rules.sort_values(by=[ 'confidence', 'support'], ascending=False).head(10)

#### Interesting rules according to max. lift

In [26]:
#t3 = rules.sort_values(by=[ 'lift', 'support'], ascending=False).head(10)

#### Interesting rules according to max. conviction

In [27]:
#t4 = rules.sort_values(by=['conviction', 'support'], ascending=False).head(10)

Concatenate all top rules and remove duplicates

In [28]:
#rules_all = pd.concat([t1, t2, t3, t4])

In [29]:
#len(rules_all)

In [30]:
#len(rules_all.drop_duplicates())

In [31]:
#rules_all.drop_duplicates()

## Analysis considering no-white students (dfb dataframe)

### Generate frequent itemsets

In [32]:
support=0.05

In [33]:
iset_b = closed_freq_itemsets_sort(dfb, sort_by='length', min_support=support, use_colnames=True)

In [34]:
len(iset_b)

1920

In [35]:
iset_b.head()

Unnamed: 0,support,itemsets,isclosed,length
202,0.054974,"(TP_SEXO_M, QE_I06_B, NT_GER_Q0, QE_I17_A, QE_I07_D)",True,5
292,0.057592,"(TP_SEXO_F, QE_I05_D, QE_I06_B, QE_I25_A, QE_I17_A)",True,5
247,0.068063,"(TP_SEXO_M, QE_I06_B, QE_I17_A, QE_I07_D, NT_GER_Q1)",True,5
153,0.060209,"(TP_SEXO_M, QE_I06_B, QE_I17_A, QE_I08_B, NT_GER_Q1)",True,5
664,0.057592,"(TP_SEXO_M, QE_I06_B, QE_I04_D, NT_GER_Q0, QE_I17_A)",True,5


In [36]:
iset_b.tail()

Unnamed: 0,support,itemsets,isclosed,length
20,0.112565,(QE_I09_D),True,1
19,0.133508,(QE_I07_B),True,1
18,0.282723,(QE_I08_C),True,1
17,0.301047,(QE_I25_E),True,1
0,0.879581,(QE_I17_A),True,1


### Discover association rules

In [37]:
use_metric_b='lift'
threshold_b=1.1

In [38]:
rules_b = association_rules_ext(iset_b, metric=use_metric_b, min_threshold=threshold_b)

#### Summary of the metrics of interest

In [39]:
summary_b = rules_b.loc[:, ['support', 'confidence', 'lift', 'conviction']].describe().T
summary_b

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,7816.0,0.070861,0.02453,0.052356,0.054974,0.062827,0.075916,0.311518
confidence,7816.0,0.359533,0.214096,0.059524,0.189189,0.3114,0.482143,1.0
lift,7816.0,1.360773,0.412101,1.100038,1.162169,1.248952,1.401064,5.83371
conviction,7816.0,inf,,1.007621,1.044524,1.093416,1.218762,inf


Get latex code for the table.

In [40]:
print(summary_b.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,07 & 0,02 & 0,05 & 0,05 & 0,06 & 0,08 & 0,31 \\
confidence &  0,36 & 0,21 & 0,06 & 0,19 & 0,31 & 0,48 & 1,00 \\
lift       &  1,36 & 0,41 & 1,10 & 1,16 & 1,25 & 1,40 & 5,83 \\
conviction &   inf &  NaN & 1,01 & 1,04 & 1,09 & 1,22 &  inf \\
\bottomrule
\end{tabular}



In [41]:
rules_b.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
0,"(NT_GER_Q0, TP_SEXO_M, QE_I17_A, QE_I06_B)",(QE_I07_D),0.141361,0.335079,0.054974,0.388889,1.16059,0.007607,1.088053,4,1,True,True
1,"(QE_I07_D, NT_GER_Q0, TP_SEXO_M, QE_I17_A)",(QE_I06_B),0.065445,0.704188,0.054974,0.84,1.192862,0.008888,1.848822,4,1,True,True
2,"(QE_I17_A, TP_SEXO_M, QE_I06_B)","(QE_I07_D, NT_GER_Q0)",0.293194,0.146597,0.054974,0.1875,1.279018,0.011993,1.050342,3,2,True,True
3,"(QE_I07_D, NT_GER_Q0, TP_SEXO_M)","(QE_I17_A, QE_I06_B)",0.075916,0.60733,0.054974,0.724138,1.192331,0.008868,1.423429,3,2,True,True
4,"(QE_I07_D, NT_GER_Q0, QE_I17_A)","(TP_SEXO_M, QE_I06_B)",0.123037,0.335079,0.054974,0.446809,1.333444,0.013747,1.201973,3,2,True,True


Check rules with greatest support, according to max conviction

In [42]:
max_support_b = rules_b.support.quantile(0.99)
max_support_b

0.1649214659685864

In [43]:
filtered_b = rules_b.sort_values(by=['support', 'conviction'], ascending=False)
filtered_b.drop_duplicates(subset='consequents').head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
7792,(QE_I05_D),(QE_I06_B),0.387435,0.704188,0.311518,0.804054,1.141817,0.038691,1.509659,1,1,True,True
7793,(QE_I06_B),(QE_I05_D),0.704188,0.387435,0.311518,0.442379,1.141817,0.038691,1.098534,1,1,True,True
7810,(QE_I06_B),(QE_I04_D),0.704188,0.387435,0.306283,0.434944,1.122626,0.033456,1.08408,1,1,True,True
7639,(QE_I06_B),(QE_I07_D),0.704188,0.335079,0.267016,0.379182,1.131622,0.031057,1.071041,1,1,True,True
6778,(QE_I05_D),"(QE_I17_A, QE_I06_B)",0.387435,0.60733,0.264398,0.682432,1.12366,0.029097,1.236493,1,2,True,True
6779,(QE_I06_B),"(QE_I05_D, QE_I17_A)",0.704188,0.332461,0.264398,0.375465,1.12935,0.030283,1.068858,1,2,True,True
6579,(QE_I06_B),"(QE_I17_A, QE_I04_D)",0.704188,0.324607,0.256545,0.364312,1.122317,0.02796,1.06246,1,2,True,True
7633,(QE_I06_B),(QE_I25_A),0.704188,0.314136,0.253927,0.360595,1.147893,0.032716,1.072659,1,1,True,True
7073,(QE_I06_B),"(QE_I17_A, QE_I25_A)",0.704188,0.290576,0.235602,0.334572,1.151412,0.030982,1.066118,1,2,True,True
6745,(QE_I06_B),"(QE_I07_D, QE_I17_A)",0.704188,0.295812,0.232984,0.330855,1.118466,0.024677,1.052371,1,2,True,True


Check rules with greatest conviction, excluding the previous

In [44]:
filtered_conv_b = filter_rules(rules_b.drop(index=filtered_b.head(10).index), by=['conviction', 'support', 'lift']).drop_duplicates(subset='consequents')
filtered_conv_b.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
6192,"(QE_I04_B, NT_GER_Q1)",(QE_I17_A),0.133508,0.879581,0.133508,1.0,1.136905,0.016077,inf,2,1,False,True
2044,"(QE_I05_D, QE_I09_B, NT_GER_Q1)",(QE_I06_B),0.052356,0.704188,0.052356,1.0,1.420074,0.015488,inf,3,1,False,True
6108,"(QE_I09_F, TP_SEXO_M)",(QE_I06_C),0.08377,0.227749,0.075916,0.90625,3.979167,0.056838,8.237347,2,1,True,True
6146,"(QE_I09_F, NT_GER_Q1)",(TP_SEXO_M),0.062827,0.507853,0.057592,0.916667,1.804983,0.025685,5.905759,2,1,True,True
1775,"(QE_I22_E, QE_I25_A, QE_I06_B)",(QE_I23_E),0.062827,0.193717,0.052356,0.833333,4.301802,0.040185,4.837696,3,1,True,True
1057,"(NT_GER_Q0, QE_I25_A, QE_I17_A, QE_I04_D)",(QE_I05_D),0.062827,0.387435,0.052356,0.833333,2.150901,0.028015,3.675393,4,1,True,True
5869,"(QE_I23_B, QE_I25_H)",(TP_SEXO_F),0.073298,0.492147,0.062827,0.857143,1.741641,0.026754,3.554974,2,1,True,True
1903,"(TP_SEXO_F, QE_I23_E, QE_I06_B)",(QE_I22_E),0.094241,0.204188,0.068063,0.722222,3.537037,0.04882,2.864921,3,1,True,True
1370,"(QE_I05_D, QE_I17_A, QE_I22_C, QE_I06_B)",(QE_I04_D),0.070681,0.387435,0.054974,0.777778,2.007508,0.02759,2.756545,4,1,True,True
14,"(QE_I05_D, TP_SEXO_F, QE_I25_A)","(QE_I17_A, QE_I06_B)",0.068063,0.60733,0.057592,0.846154,1.393236,0.016255,2.552356,3,2,True,True


#### Associations containing grade information:

In [45]:
grades_rules_b = find_itemsets_any(rules_b, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='antecedents')
filter_rules(grades_rules_b).query('support > .07').head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
6192,"(QE_I04_B, NT_GER_Q1)",(QE_I17_A),0.133508,0.879581,0.133508,1.0,1.136905,0.016077,inf,2,1,False,True
7011,"(NT_GER_Q0, QE_I05_B)",(QE_I17_A),0.10733,0.879581,0.10733,1.0,1.136905,0.012925,inf,2,1,False,True
5907,"(QE_I25_H, NT_GER_Q1)",(QE_I17_A),0.091623,0.879581,0.091623,1.0,1.136905,0.011033,inf,2,1,False,True
3472,"(TP_SEXO_F, NT_GER_Q1, QE_I09_C)",(QE_I17_A),0.078534,0.879581,0.078534,1.0,1.136905,0.009457,inf,3,1,False,True
4624,"(QE_I04_B, NT_GER_Q1, QE_I06_B)",(QE_I17_A),0.073298,0.879581,0.073298,1.0,1.136905,0.008827,inf,3,1,False,True
4913,"(NT_GER_Q1, QE_I25_A, QE_I08_B)",(QE_I17_A),0.073298,0.879581,0.073298,1.0,1.136905,0.008827,inf,3,1,False,True
1690,"(QE_I04_B, NT_GER_Q0, QE_I05_B)",(QE_I17_A),0.070681,0.879581,0.070681,1.0,1.136905,0.008511,inf,3,1,False,True
4328,"(QE_I04_B, NT_GER_Q1, QE_I08_B)",(QE_I17_A),0.070681,0.879581,0.070681,1.0,1.136905,0.008511,inf,3,1,False,True
4612,"(QE_I04_B, NT_GER_Q1, TP_SEXO_M)",(QE_I17_A),0.070681,0.879581,0.070681,1.0,1.136905,0.008511,inf,3,1,False,True
6570,"(NT_GER_Q1, QE_I25_A)",(QE_I17_A),0.146597,0.879581,0.143979,0.982143,1.116603,0.015035,6.743455,2,1,True,True


In [46]:
grades_rules_b = find_itemsets_any(rules_b, {"NT_GER_Q0", "NT_GER_Q1"}, col_name='consequents')
filter_rules(grades_rules_b, by=['support', 'conviction']).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,A_length,C_length,A_isclosed,C_isclosed
7658,(QE_I23_B),(NT_GER_Q0),0.413613,0.5,0.227749,0.550633,1.101266,0.020942,1.112676,1,1,True,True
7409,"(QE_I17_A, QE_I08_B)",(NT_GER_Q1),0.342932,0.5,0.196335,0.572519,1.145038,0.024869,1.169643,2,1,True,True
7628,(QE_I07_D),(NT_GER_Q1),0.335079,0.5,0.188482,0.5625,1.125,0.020942,1.142857,1,1,True,True
7787,(QE_I22_C),(NT_GER_Q1),0.311518,0.5,0.175393,0.563025,1.12605,0.019634,1.144231,1,1,True,True
7657,(QE_I22_B),(NT_GER_Q0),0.293194,0.5,0.172775,0.589286,1.178571,0.026178,1.217391,1,1,True,True
6810,"(QE_I07_D, QE_I17_A)",(NT_GER_Q1),0.295812,0.5,0.172775,0.584071,1.168142,0.024869,1.202128,2,1,True,True
4758,"(TP_SEXO_F, QE_I17_A)","(NT_GER_Q1, QE_I06_B)",0.426702,0.348168,0.170157,0.398773,1.145348,0.021593,1.08417,2,2,True,True
6782,(QE_I05_D),"(NT_GER_Q1, QE_I06_B)",0.387435,0.348168,0.164921,0.425676,1.222617,0.030029,1.134955,1,2,True,True
6523,"(QE_I17_A, QE_I22_C)",(NT_GER_Q1),0.274869,0.5,0.162304,0.590476,1.180952,0.024869,1.22093,2,1,True,True
6096,"(QE_I23_B, QE_I06_B)",(NT_GER_Q0),0.28534,0.5,0.162304,0.568807,1.137615,0.019634,1.159574,2,1,True,True
