# Playground for Association Rule Learning slides

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Example of converting lists into a dataframe

In [2]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df1 = pd.DataFrame(te_ary, columns=te.columns_)
df1

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


# Read in Data

In [3]:
df = pd.read_csv('data/groceries.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Columns: 169 entries, frankfurter to bags
dtypes: bool(169)
memory usage: 1.6 MB


In [4]:
df.head()

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
from mlxtend.frequent_patterns import apriori

%time frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)

Wall time: 18.5 s


In [6]:
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.058973,(frankfurter)
1,0.09395,(sausage)
2,0.005084,(liver loaf)
3,0.026029,(ham)
4,0.025826,(meat)
5,0.006507,(finished products)
6,0.002237,(organic sausage)
7,0.042908,(chicken)
8,0.008134,(turkey)
9,0.057651,(pork)


In [7]:
frequent_itemsets.tail(10)

Unnamed: 0,support,itemsets
13482,0.001118,"(whole milk, other vegetables, tropical fruit,..."
13483,0.001423,"(whole milk, other vegetables, tropical fruit,..."
13484,0.001017,"(whole milk, whipped/sour cream, other vegetab..."
13485,0.001322,"(whole milk, other vegetables, tropical fruit,..."
13486,0.001118,"(whole milk, yogurt, other vegetables, tropica..."
13487,0.001118,"(whole milk, whipped/sour cream, other vegetab..."
13488,0.001322,"(whole milk, other vegetables, tropical fruit,..."
13489,0.001017,"(whole milk, oil, other vegetables, tropical f..."
13490,0.001118,"(whole milk, bottled water, other vegetables, ..."
13491,0.001017,"(whole milk, domestic eggs, yogurt, other vege..."


In [8]:
frequent_itemsets.sort_values(by=['support'], ascending=False).head()

Unnamed: 0,support,itemsets
24,0.255516,(whole milk)
22,0.193493,(other vegetables)
53,0.183935,(rolls/buns)
99,0.174377,(soda)
29,0.139502,(yogurt)


In [9]:
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

In [10]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1


In [11]:
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.02) ]

Unnamed: 0,support,itemsets,length
173,0.020539,"(whole milk, frankfurter)",2
255,0.026945,"(sausage, other vegetables)",2
257,0.029893,"(whole milk, sausage)",2
281,0.030605,"(rolls/buns, sausage)",2
303,0.024301,"(sausage, soda)",2
551,0.021657,"(other vegetables, pork)",2
553,0.022166,"(whole milk, pork)",2
627,0.021251,"(whole milk, beef)",2
762,0.028876,"(other vegetables, citrus fruit)",2
764,0.030503,"(whole milk, citrus fruit)",2


In [12]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'pastry', 'soda'} ]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(soda, pastry)",2


In [13]:
from mlxtend.frequent_patterns import association_rules
%time rules = association_rules(frequent_itemsets, min_threshold=0.1)

Wall time: 307 ms


In [14]:
rules["antecedent_len"] = rules["antecedents"].apply(lambda x: len(x))
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(sausage),(frankfurter),0.093950,0.058973,0.010066,0.107143,1.816810,0.004526,1.053950,1
1,(frankfurter),(sausage),0.058973,0.093950,0.010066,0.170690,1.816810,0.004526,1.092534,1
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.076670,1
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256,1
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652,1
5,(frankfurter),(citrus fruit),0.058973,0.082766,0.006507,0.110345,1.333220,0.001626,1.031000,1
6,(frankfurter),(tropical fruit),0.058973,0.104931,0.009456,0.160345,1.528092,0.003268,1.065995,1
7,(frankfurter),(pip fruit),0.058973,0.075648,0.007219,0.122414,1.618198,0.002758,1.053289,1
8,(frankfurter),(root vegetables),0.058973,0.108998,0.010168,0.172414,1.581800,0.003740,1.076627,1
9,(onions),(frankfurter),0.031012,0.058973,0.003762,0.121311,2.057066,0.001933,1.070945,1


In [15]:
rules[ (rules['antecedent_len'] >= 2) &
       (rules['confidence'] > 0.75) &
       (rules['lift'] > 1.2) ].sort_values(by="support", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
24173,"(citrus fruit, tropical fruit, root vegetables)",(other vegetables),0.005694,0.193493,0.004474,0.785714,4.060694,0.003372,3.763701,3
27999,"(curd, yogurt, tropical fruit)",(whole milk),0.005287,0.255516,0.003965,0.750000,2.935237,0.002614,2.977936,3
38977,"(whole milk, citrus fruit, tropical fruit, roo...",(other vegetables),0.003559,0.193493,0.003152,0.885714,4.577509,0.002463,7.056940,4
30695,"(brown bread, other vegetables, root vegetables)",(whole milk),0.004067,0.255516,0.003152,0.775000,3.033078,0.002113,3.308818,3
31658,"(butter, yogurt, root vegetables)",(whole milk),0.003864,0.255516,0.003050,0.789474,3.089723,0.002063,3.536299,3
14198,"(butter milk, whipped/sour cream)",(whole milk),0.003864,0.255516,0.002949,0.763158,2.986732,0.001961,3.143377,2
33065,"(curd, domestic eggs, other vegetables)",(whole milk),0.003457,0.255516,0.002847,0.823529,3.223005,0.001964,4.218743,3
26605,"(domestic eggs, tropical fruit, root vegetables)",(whole milk),0.003559,0.255516,0.002745,0.771429,3.019101,0.001836,3.257117,3
20265,"(sausage, tropical fruit, root vegetables)",(whole milk),0.003559,0.255516,0.002745,0.771429,3.019101,0.001836,3.257117,3
31666,"(whipped/sour cream, butter, root vegetables)",(whole milk),0.003457,0.255516,0.002644,0.764706,2.992790,0.001760,3.164057,3


In [16]:
rules[ (rules['antecedents'].apply(lambda x: 'Eggs' in x)) & (rules['antecedent_len'] >=2) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
