In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Draw matplotlib figures inline in the notebook and request the 'retina'
# (high-resolution) figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Apply the base theme first: sns.set() is an alias of sns.set_theme() and resets
# the plotting context to its default ("notebook"). Calling it *after*
# set_context("poster") silently discarded the poster scaling, so the context and
# style are now applied on top of the base theme instead.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_context("poster")
sns.set_style("whitegrid")

import pandas as pd
# Widen the pandas display limits so the wide one-hot frame and the long
# itemset tuples are shown without truncation.
for _option, _value in (
    ("display.max_rows", 120),
    ("display.max_columns", 120),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option, _value)

# Route log records of level INFO and above to stdout so they appear in the
# cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort, find_itemsets

In [3]:
# Load the preprocessed, one-hot encoded table that every later cell works on.
csv_path = '../data/preprocessed/enade_2016a2018_priv_onehot_sel.csv'
df = pd.read_csv(csv_path)

In [4]:
# Number of (rows, columns) in the loaded frame.
df.shape

(1727, 71)

In [5]:
# Preview the first rows; each column is a 0/1 indicator in the output below.
df.head()

Unnamed: 0,QE_I01_A,QE_I01_B,QE_I02_A,QE_I02_D,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I06_B,QE_I06_C,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I07_E,QE_I08_A,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I09_A,QE_I09_B,QE_I09_C,QE_I09_D,QE_I09_E,QE_I09_F,QE_I10_A,QE_I10_B,QE_I10_C,QE_I10_D,QE_I10_E,QE_I11_B,QE_I11_C,QE_I11_E,QE_I11_H,QE_I13_A,QE_I13_F,QE_I15_A,QE_I17_A,QE_I17_B,QE_I18_A,QE_I19_A,QE_I19_B,QE_I20_A,QE_I20_C,QE_I20_G,QE_I20_H,QE_I21_A,QE_I21_B,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_B,QE_I23_C,QE_I23_D,QE_I23_E,QE_I24_A,QE_I24_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_E,QE_I25_H,TP_SEXO_F,TP_SEXO_M
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Generate frequent itemsets

In [6]:
# Minimum support passed to the frequent-itemset search below.
support = 0.2

In [7]:
# Compute the frequent itemsets with support >= `support`, reported with column
# names and ordered by itemset length.
# NOTE(review): `max=False` is interpreted by enadepy's freq_itemsets_sort —
# presumably it disables the maximal-itemsets-only search; confirm in its docs.
iset = freq_itemsets_sort(
    df,
    max=False,
    sort_by='length',
    min_support=support,
    use_colnames=True,
)

In [8]:
# First rows of the itemset table (the longest itemsets in the output below).
iset.head()

Unnamed: 0,support,itemsets,length
1378,0.200926,"(QE_I01_A, QE_I17_A, QE_I18_A, QE_I02_A, QE_I06_B, QE_I13_A, QE_I19_B, QE_I15_A)",8
1507,0.208454,"(QE_I21_A, QE_I01_A, QE_I18_A, QE_I02_A, QE_I06_B, QE_I13_A, QE_I19_B, QE_I15_A)",8
336,0.212507,"(QE_I01_A, QE_I18_A, QE_I02_A, QE_I06_B, QE_I24_E, QE_I13_A, QE_I19_B, QE_I15_A)",8
401,0.206138,"(QE_I01_A, QE_I18_A, QE_I06_B, QE_I13_A, QE_I19_B, QE_I02_A, TP_SEXO_F)",7
321,0.232195,"(QE_I01_A, QE_I18_A, QE_I02_A, QE_I24_E, QE_I13_A, QE_I19_B, QE_I15_A)",7


In [9]:
# Last rows of the itemset table (single-item itemsets in the output below).
iset.tail()

Unnamed: 0,support,itemsets,length
25,0.312102,(QE_I25_E),1
24,0.402432,(QE_I04_D),1
23,0.294731,(QE_I09_B),1
22,0.323683,(QE_I07_D),1
0,0.929357,(QE_I18_A),1


In [10]:
# Search the frequent itemsets for QE_I14_A. No QE_I14_* column appears in the
# frame preview earlier in the notebook, so an empty result is expected.
# NOTE(review): the meaning of the third positional argument (False) is defined
# by enadepy.frequent.find_itemsets — confirm against its documentation.
query = {"QE_I14_A"}
find_itemsets(iset, query, False)

Unnamed: 0,support,itemsets,length


### Discover association rules

In [11]:
# Metric used to filter the association rules, and the minimum value a rule
# must reach on that metric to be kept.
use_metric, threshold = 'lift', 1.2

In [12]:
# Derive association rules from the frequent itemsets, keeping only those whose
# `use_metric` value is at least `threshold`.
rules = association_rules(
    iset,
    metric=use_metric,
    min_threshold=threshold,
)

#### Summary of the metrics of interest

In [13]:
# Descriptive statistics of the four metrics of interest, one metric per row.
metric_columns = ['support', 'confidence', 'lift', 'conviction']
summary = rules[metric_columns].describe().transpose()
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,9444.0,0.24508,0.051325,0.200347,0.211928,0.227562,0.259988,0.600463
confidence,9444.0,0.586379,0.201009,0.245564,0.412402,0.559146,0.751922,0.997468
lift,9444.0,1.264787,0.089483,1.200035,1.220207,1.245112,1.283167,2.355345
conviction,9444.0,1.884841,2.518418,1.054787,1.143349,1.272661,1.622513,72.733063


Get the LaTeX code for the summary table.

In [14]:
# Export the summary (without the `count` column) as a LaTeX tabular, using two
# decimals and a comma as the decimal separator.
latex_table = summary.drop(columns='count').to_latex(float_format="%.2f", decimal=",")
print(latex_table)

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &   max \\
\midrule
support    &  0,25 & 0,05 & 0,20 & 0,21 & 0,23 & 0,26 &  0,60 \\
confidence &  0,59 & 0,20 & 0,25 & 0,41 & 0,56 & 0,75 &  1,00 \\
lift       &  1,26 & 0,09 & 1,20 & 1,22 & 1,25 & 1,28 &  2,36 \\
conviction &  1,88 & 2,52 & 1,05 & 1,14 & 1,27 & 1,62 & 72,73 \\
\bottomrule
\end{tabular}



Extend the rules dataframe with the number of items in each rule's antecedent and consequent.

In [15]:
# Record how many items sit on each side of every rule; `len` is applied
# directly to each itemset, with no wrapper lambda.
rules['length ant.'] = rules['antecedents'].apply(len)
rules['length cons.'] = rules['consequents'].apply(len)

#### Interesting rules according to max. support

In [16]:
# rules[~rules.consequents.apply(lambda x: x.issuperset({"QE_I18_A"})) & ~rules.antecedents.apply(lambda x: x.issuperset({"QE_I18_A"}))].sort_values(by=[ 'support', 'confidence'], ascending=False).query('`length ant.` > 0 | `length cons.` > 0').head(20)

In [17]:
# Rules ranked by support, with confidence as the tie-breaker (both descending).
# NOTE(review): the filter below keeps rows where either side has at least one
# item, which looks true for every rule — confirm whether a stricter bound was
# intended.
(
    rules
    .sort_values(by=['support', 'confidence'], ascending=False)
    .query('`length ant.` > 0 | `length cons.` > 0')
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
9164,"(QE_I15_A, QE_I06_B)","(QE_I01_A, QE_I18_A)",0.63868,0.782281,0.600463,0.940163,1.201822,0.100836,3.638539,2,2
9166,"(QE_I06_B, QE_I18_A)","(QE_I15_A, QE_I01_A)",0.705269,0.706427,0.600463,0.851396,1.205213,0.102242,1.975533,2,2
9165,"(QE_I15_A, QE_I01_A)","(QE_I06_B, QE_I18_A)",0.706427,0.705269,0.600463,0.85,1.205213,0.102242,1.964872,2,2
9167,"(QE_I01_A, QE_I18_A)","(QE_I15_A, QE_I06_B)",0.782281,0.63868,0.600463,0.76758,1.201822,0.100836,1.554597,2,2
9390,"(QE_I19_B, QE_I06_B)",(QE_I01_A),0.577881,0.815866,0.570353,0.986974,1.209726,0.09888,14.135851,2,1


#### Interesting rules according to max. confidence

In [18]:
# Rules ranked by confidence (then support), leaving out every rule whose
# consequent contains QE_I01_A.
concludes_i01a = rules['consequents'].apply(lambda items: "QE_I01_A" in items)
(
    rules[~concludes_i01a]
    .sort_values(by=['confidence', 'support'], ascending=False)
    .query('`length cons.` > 0')
    .head(25)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
7408,"(QE_I19_B, QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.220035,0.737116,0.215402,0.978947,1.328077,0.053211,12.486972,4,1
7928,"(QE_I15_A, QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.239143,0.737116,0.233353,0.975787,1.323789,0.057076,10.857093,4,1
8277,"(QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.262884,0.737116,0.256514,0.975771,1.323768,0.062738,10.849924,3,1
8590,"(QE_I19_B, QE_I01_A, QE_I07_D)",(QE_I06_B),0.233353,0.737116,0.227562,0.975186,1.322974,0.055554,10.59421,3,1
7370,"(QE_I19_B, QE_I15_A, QE_I01_A, QE_I07_D)",(QE_I06_B),0.209612,0.737116,0.204401,0.975138,1.322909,0.049892,10.573763,4,1
8062,"(QE_I13_A, QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.224088,0.737116,0.218298,0.97416,1.321583,0.053119,10.173596,4,1
9307,"(QE_I01_A, QE_I07_D)",(QE_I06_B),0.277939,0.737116,0.270411,0.972917,1.319896,0.065538,9.706472,2,1
8432,"(QE_I15_A, QE_I01_A, QE_I07_D)",(QE_I06_B),0.250724,0.737116,0.243775,0.972286,1.319041,0.058963,9.485717,3,1
8451,"(QE_I01_A, QE_I07_D, QE_I13_A)",(QE_I06_B),0.236248,0.737116,0.229299,0.970588,1.316737,0.055157,8.938043,3,1
8046,"(QE_I15_A, QE_I01_A, QE_I07_D, QE_I13_A)",(QE_I06_B),0.214244,0.737116,0.207875,0.97027,1.316305,0.049952,8.842449,4,1


#### Interesting rules according to max. lift

In [19]:
# Rules with lift above 2.1, ranked by lift and then confidence (descending).
(
    rules
    .sort_values(by=['lift', 'confidence'], ascending=False)
    .query('lift >2.1')
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
8855,"(QE_I09_B, QE_I01_A)","(QE_I10_A, QE_I06_B)",0.269832,0.322525,0.20498,0.759657,2.355345,0.117952,2.818782,2,2
8852,"(QE_I10_A, QE_I06_B)","(QE_I09_B, QE_I01_A)",0.322525,0.269832,0.20498,0.635548,2.355345,0.117952,2.003466,2,2
8854,"(QE_I06_B, QE_I09_B)","(QE_I10_A, QE_I01_A)",0.251303,0.352635,0.20498,0.815668,2.313069,0.116362,3.511957,2,2
8853,"(QE_I10_A, QE_I01_A)","(QE_I06_B, QE_I09_B)",0.352635,0.251303,0.20498,0.581281,2.313069,0.116362,1.788065,2,2
8847,(QE_I09_B),"(QE_I10_A, QE_I15_A, QE_I01_A)",0.294731,0.303416,0.200926,0.681729,2.246843,0.1115,2.188649,1,3
8840,"(QE_I10_A, QE_I15_A, QE_I01_A)",(QE_I09_B),0.303416,0.294731,0.200926,0.662214,2.246843,0.1115,2.087916,3,1
8845,"(QE_I09_B, QE_I01_A)","(QE_I10_A, QE_I15_A)",0.269832,0.335263,0.200926,0.744635,2.221045,0.110462,2.603086,2,2
8842,"(QE_I10_A, QE_I15_A)","(QE_I09_B, QE_I01_A)",0.335263,0.269832,0.200926,0.599309,2.221045,0.110462,1.822273,2,2
8865,"(QE_I09_B, QE_I18_A)","(QE_I10_A, QE_I06_B)",0.282571,0.322525,0.202085,0.715164,2.217393,0.110948,2.378475,2,2
8862,"(QE_I10_A, QE_I06_B)","(QE_I09_B, QE_I18_A)",0.322525,0.282571,0.202085,0.626571,2.217393,0.110948,1.921192,2,2


#### Interesting rules according to max. conviction

In [20]:
# Highest-conviction rules whose consequent does not contain QE_I01_A; among
# equal convictions, shorter antecedents are listed first.
implies_i01a = rules['consequents'].apply(lambda items: "QE_I01_A" in items)
(
    rules[~implies_i01a]
    .sort_values(by=['conviction', 'length ant.'], ascending=[False, True])
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
7408,"(QE_I19_B, QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.220035,0.737116,0.215402,0.978947,1.328077,0.053211,12.486972,4,1
7928,"(QE_I15_A, QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.239143,0.737116,0.233353,0.975787,1.323789,0.057076,10.857093,4,1
8277,"(QE_I01_A, QE_I07_D, QE_I18_A)",(QE_I06_B),0.262884,0.737116,0.256514,0.975771,1.323768,0.062738,10.849924,3,1
8590,"(QE_I19_B, QE_I01_A, QE_I07_D)",(QE_I06_B),0.233353,0.737116,0.227562,0.975186,1.322974,0.055554,10.59421,3,1
7370,"(QE_I19_B, QE_I15_A, QE_I01_A, QE_I07_D)",(QE_I06_B),0.209612,0.737116,0.204401,0.975138,1.322909,0.049892,10.573763,4,1


In [21]:
# The twenty rules with the highest conviction, without any filtering.
(
    rules
    .sort_values(by=['conviction'], ascending=False)
    .head(20)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
7669,"(QE_I10_A, QE_I06_B, QE_I15_A, QE_I19_B)",(QE_I01_A),0.22872,0.815866,0.228141,0.997468,1.222589,0.041536,72.733063,4,1
5047,"(QE_I19_B, QE_I18_A, QE_I15_A, QE_I10_A, QE_I06_B)",(QE_I01_A),0.224667,0.815866,0.224088,0.997423,1.222533,0.04079,71.444123,5,1
8812,"(QE_I19_B, QE_I06_B, QE_I10_A)",(QE_I01_A),0.265779,0.815866,0.264042,0.993464,1.217681,0.047202,28.172554,3,1
6484,"(QE_I19_B, QE_I20_C, QE_I06_B, QE_I15_A)",(QE_I01_A),0.262884,0.815866,0.261146,0.993392,1.217593,0.046669,27.865663,4,1
7623,"(QE_I10_A, QE_I06_B, QE_I19_B, QE_I18_A)",(QE_I01_A),0.25883,0.815866,0.257093,0.993289,1.217466,0.045922,27.436016,4,1
5434,"(QE_I18_A, QE_I15_A, QE_I06_B, QE_I19_B, QE_I20_C)",(QE_I01_A),0.251303,0.815866,0.249566,0.993088,1.217219,0.044536,26.638101,5,1
5384,"(QE_I15_A, QE_I06_B, QE_I13_A, QE_I19_B, QE_I20_C)",(QE_I01_A),0.236827,0.815866,0.23509,0.992665,1.216702,0.041871,25.103648,5,1
7930,"(QE_I15_A, QE_I06_B, QE_I07_D, QE_I18_A)",(QE_I01_A),0.23509,0.815866,0.233353,0.992611,1.216635,0.041551,24.919514,4,1
5847,"(QE_I19_B, QE_I21_A, QE_I06_B, QE_I15_A)",(QE_I01_A),0.308049,0.815866,0.305732,0.992481,1.216476,0.054406,24.489867,4,1
7647,"(QE_I10_A, QE_I06_B, QE_I19_B, QE_I13_A)",(QE_I01_A),0.22872,0.815866,0.226983,0.992405,1.216383,0.040378,24.244354,4,1


In [22]:
# Search the rule antecedents for QE_I11_G. Only QE_I11_B/C/E/H appear among the
# columns in the frame preview earlier in the notebook, so an empty result is
# expected.
find_itemsets(rules, {"QE_I11_G"}, col_name='antecedents')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
