In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
sns.set_context("poster")
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style("whitegrid")

import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option('display.max_colwidth', 1000)

logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [2]:
from enadepy import *
from mlxtend.frequent_patterns import fpmax, fpgrowth, apriori, association_rules
from enadepy.frequent import freq_itemsets_sort, find_itemsets

In [3]:
df = pd.read_csv('../data/preprocessed/enade_2016a2018_pub_onehot_sel.csv')

In [4]:
df.shape

(266, 64)

In [5]:
df.head()

Unnamed: 0,QE_I01_A,QE_I01_B,QE_I02_A,QE_I02_D,QE_I04_B,QE_I04_C,QE_I04_D,QE_I04_E,QE_I04_F,QE_I05_B,QE_I05_C,QE_I05_D,QE_I05_E,QE_I05_F,QE_I06_A,QE_I06_B,QE_I06_C,QE_I06_D,QE_I07_A,QE_I07_B,QE_I07_C,QE_I07_D,QE_I08_B,QE_I08_C,QE_I08_D,QE_I08_E,QE_I08_F,QE_I09_B,QE_I09_C,QE_I10_A,QE_I10_E,QE_I11_A,QE_I11_B,QE_I12_A,QE_I13_A,QE_I13_B,QE_I13_C,QE_I17_A,QE_I17_B,QE_I18_A,QE_I19_B,QE_I19_C,QE_I20_A,QE_I20_C,QE_I20_G,QE_I21_A,QE_I21_B,QE_I22_A,QE_I22_B,QE_I22_C,QE_I22_D,QE_I22_E,QE_I23_B,QE_I23_C,QE_I23_D,QE_I24_A,QE_I24_E,QE_I25_A,QE_I25_B,QE_I25_C,QE_I25_E,QE_I25_H,TP_SEXO_F,TP_SEXO_M
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


### Generate frequent itemsets

In [6]:
support=0.2

In [7]:
iset = freq_itemsets_sort(df, max=False, sort_by='length', min_support=support, use_colnames=True)

In [8]:
iset.head()

Unnamed: 0,support,itemsets,length
3223,0.206767,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I09_B, QE_I19_B, QE_I12_A, QE_I21_A)",10
3207,0.206767,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I09_B, QE_I19_B, QE_I21_A, QE_I11_A)",10
3175,0.214286,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I09_B, QE_I19_B, QE_I12_A, QE_I11_A)",10
984,0.214286,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I02_A, QE_I10_A, QE_I09_B, QE_I19_B, QE_I12_A, QE_I21_A, QE_I11_A)",10
3235,0.203008,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I09_B, QE_I12_A, QE_I21_A, QE_I11_A)",10


In [9]:
iset.tail()

Unnamed: 0,support,itemsets,length
18,0.353383,(QE_I04_D),1
17,0.43609,(QE_I24_A),1
11,0.221805,(QE_I07_A),1
7,0.605263,(QE_I11_A),1
0,0.913534,(QE_I18_A),1


In [10]:
query = {"QE_I14_A"}
find_itemsets(iset, query, False)

Unnamed: 0,support,itemsets,length


### Discover association rules

In [11]:
use_metric='lift'
threshold=1.2

In [12]:
rules = association_rules(iset, metric=use_metric, min_threshold=threshold)

#### Summary of the metrics of interest

In [13]:
summary = rules.loc[:, ['support', 'confidence', 'lift', 'conviction']].describe().T
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
support,205360.0,0.245412,0.041169,0.203008,0.214286,0.233083,0.266917,0.575188
confidence,205360.0,0.636916,0.164525,0.245455,0.509934,0.633663,0.763889,1.0
lift,205360.0,1.575604,0.243778,1.2,1.376362,1.543479,1.745909,2.530655
conviction,205360.0,inf,,1.056255,1.337429,1.633727,2.222959,inf


Get latex code for the table.

In [14]:
print(summary.drop(columns='count').to_latex(float_format="%.2f", decimal=","))

\begin{tabular}{lrrrrrrr}
\toprule
{} &  mean &  std &  min &  25\% &  50\% &  75\% &  max \\
\midrule
support    &  0,25 & 0,04 & 0,20 & 0,21 & 0,23 & 0,27 & 0,58 \\
confidence &  0,64 & 0,16 & 0,25 & 0,51 & 0,63 & 0,76 & 1,00 \\
lift       &  1,58 & 0,24 & 1,20 & 1,38 & 1,54 & 1,75 & 2,53 \\
conviction &   inf &  NaN & 1,06 & 1,34 & 1,63 & 2,22 &  inf \\
\bottomrule
\end{tabular}



Extend dataframe to store itemsets length.

In [15]:
rules['length ant.'] = rules['antecedents'].apply(lambda x: len(x))
rules['length cons.'] = rules['consequents'].apply(lambda x: len(x))

#### Interesting rules according to max. support

In [16]:
rules.sort_values(by=[ 'support', 'confidence'], ascending=False).query('`length ant.` > 2 | `length cons.` > 2').head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
201985,"(QE_I01_A, QE_I18_A, QE_I09_B)",(QE_I10_A),0.541353,0.695489,0.507519,0.9375,1.347973,0.131014,4.87218,3,1
201984,"(QE_I01_A, QE_I18_A, QE_I10_A)",(QE_I09_B),0.605263,0.616541,0.507519,0.838509,1.360021,0.134349,2.374494,3,1
201991,(QE_I09_B),"(QE_I01_A, QE_I18_A, QE_I10_A)",0.616541,0.605263,0.507519,0.823171,1.360021,0.134349,2.232305,1,3
201990,(QE_I10_A),"(QE_I01_A, QE_I18_A, QE_I09_B)",0.695489,0.541353,0.507519,0.72973,1.347973,0.131014,1.696992,1,3
202441,(QE_I11_A),"(QE_I19_B, QE_I01_A, QE_I18_A)",0.605263,0.680451,0.503759,0.832298,1.223156,0.091907,1.905458,1,3
202434,"(QE_I19_B, QE_I01_A, QE_I18_A)",(QE_I11_A),0.680451,0.605263,0.503759,0.740331,1.223156,0.091907,1.520157,3,1
202445,(QE_I11_A),"(QE_I01_A, QE_I18_A, QE_I21_A)",0.605263,0.650376,0.496241,0.819876,1.260618,0.102592,1.941016,1,3
202442,"(QE_I01_A, QE_I18_A, QE_I21_A)",(QE_I11_A),0.650376,0.605263,0.496241,0.763006,1.260618,0.102592,1.665597,3,1
201952,"(QE_I01_A, QE_I18_A, QE_I09_B)",(QE_I19_B),0.541353,0.744361,0.484962,0.895833,1.203493,0.082,2.454135,3,1
202413,"(QE_I01_A, QE_I18_A, QE_I11_A)",(QE_I10_A),0.567669,0.695489,0.484962,0.854305,1.228352,0.090155,2.090055,3,1


#### Interesting rules according to max. confidence

In [17]:
rules.sort_values(by=[ 'confidence', 'length ant.'], ascending=False).query('`length cons.` > 2').head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
114065,"(QE_I24_E, QE_I12_A, QE_I17_B, QE_I09_B)","(QE_I01_A, QE_I18_A, QE_I10_A)",0.218045,0.605263,0.214286,0.982759,1.623688,0.082311,22.894737,4,3
53916,"(QE_I24_E, QE_I17_B, QE_I09_B, QE_I19_B, QE_I12_A)","(QE_I01_A, QE_I18_A, QE_I10_A)",0.210526,0.605263,0.206767,0.982143,1.622671,0.079343,22.105263,5,3
88171,"(QE_I11_A, TP_SEXO_F, QE_I17_B, QE_I09_B)","(QE_I19_B, QE_I18_A, QE_I10_A)",0.263158,0.548872,0.255639,0.971429,1.769863,0.111199,15.789474,4,3
69942,"(QE_I01_A, TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I11_A)","(QE_I19_B, QE_I18_A, QE_I10_A)",0.255639,0.548872,0.24812,0.970588,1.768332,0.107807,15.338346,5,3
68979,"(TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I12_A, QE_I11_A)","(QE_I19_B, QE_I18_A, QE_I10_A)",0.24812,0.548872,0.240602,0.969697,1.766708,0.104415,14.887218,5,3
53189,"(TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I11_A)","(QE_I19_B, QE_I01_A, QE_I18_A)",0.244361,0.680451,0.236842,0.969231,1.424394,0.070566,10.385338,5,3
23161,"(QE_I18_A, TP_SEXO_F, QE_I02_A, QE_I09_B, QE_I12_A, QE_I11_A)","(QE_I19_B, QE_I01_A, QE_I10_A)",0.240602,0.533835,0.233083,0.96875,1.814701,0.104641,14.917293,6,3
34089,"(QE_I01_A, TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I12_A, QE_I11_A)","(QE_I19_B, QE_I18_A, QE_I10_A)",0.240602,0.548872,0.233083,0.96875,1.764983,0.101023,14.43609,6,3
59038,"(TP_SEXO_F, QE_I17_B, QE_I02_A, QE_I10_A, QE_I12_A)","(QE_I19_B, QE_I01_A, QE_I18_A)",0.240602,0.680451,0.233083,0.96875,1.423688,0.069365,10.225564,5,3
70208,"(TP_SEXO_F, QE_I17_B, QE_I09_B, QE_I21_A, QE_I11_A)","(QE_I19_B, QE_I18_A, QE_I10_A)",0.240602,0.548872,0.233083,0.96875,1.764983,0.101023,14.43609,5,3


#### Interesting rules according to max. lift

In [18]:
rules.sort_values(by=[ 'lift', 'confidence'], ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
5293,"(QE_I18_A, TP_SEXO_F, QE_I09_B, QE_I12_A, QE_I21_A, QE_I11_A)","(QE_I19_B, QE_I17_B, QE_I10_A, QE_I02_A)",0.24812,0.323308,0.203008,0.818182,2.530655,0.122788,3.721805,6,4
5828,"(QE_I19_B, QE_I17_B, QE_I10_A, QE_I02_A)","(QE_I18_A, TP_SEXO_F, QE_I09_B, QE_I12_A, QE_I21_A, QE_I11_A)",0.323308,0.24812,0.203008,0.627907,2.530655,0.122788,2.020677,4,6
5485,"(QE_I18_A, TP_SEXO_F, QE_I09_B, QE_I12_A, QE_I11_A)","(QE_I17_B, QE_I02_A, QE_I10_A, QE_I19_B, QE_I21_A)",0.274436,0.293233,0.203008,0.739726,2.522655,0.122534,2.715473,5,5
5636,"(QE_I17_B, QE_I02_A, QE_I10_A, QE_I19_B, QE_I21_A)","(QE_I18_A, TP_SEXO_F, QE_I09_B, QE_I12_A, QE_I11_A)",0.293233,0.274436,0.203008,0.692308,2.522655,0.122534,2.358083,5,5
4126,"(QE_I01_A, QE_I18_A, TP_SEXO_F, QE_I09_B, QE_I12_A, QE_I21_A, QE_I11_A)","(QE_I17_B, QE_I10_A, QE_I02_A)",0.240602,0.334586,0.203008,0.84375,2.52177,0.122506,4.258647,7,3


#### Interesting rules according to max. conviction

In [19]:
rules[~rules['consequents'].apply(lambda x: x.issuperset({"QE_I18_A"}))].sort_values(by=['conviction', 'length ant.'], ascending=[False, True]).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length ant.,length cons.
205316,(QE_I11_B),(QE_I13_A),0.293233,0.590226,0.293233,1.0,1.694268,0.120159,inf,1,1
205318,(QE_I06_D),(QE_I01_A),0.210526,0.827068,0.210526,1.0,1.209091,0.036407,inf,1,1
204572,"(QE_I06_B, QE_I09_B)",(QE_I01_A),0.274436,0.827068,0.274436,1.0,1.209091,0.047459,inf,2,1
204624,"(QE_I06_B, QE_I10_A)",(QE_I01_A),0.31203,0.827068,0.31203,1.0,1.209091,0.05396,inf,2,1
204646,"(QE_I04_E, QE_I02_A)",(QE_I21_A),0.236842,0.800752,0.236842,1.0,1.248826,0.04719,inf,2,1
