In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
%matplotlib inline

In [2]:
#load the grocery transactions from a CSV file sitting next to the notebook
df = pd.read_csv('groceries.csv')

In [3]:
df.info() #nearly 10,000 transactions/baskets (9,835 rows) across 169 boolean item columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Columns: 169 entries, frankfurter to bags
dtypes: bool(169)
memory usage: 1.6 MB


In [4]:
#let's look at the first few transactions
df.head()
#observations: each transaction is a row and each item is a boolean column (True presumably means the item was in
#that basket) - this row-per-transaction, column-per-item layout is the format that mlxtend likes

Unnamed: 0,frankfurter,sausage,liver loaf,ham,meat,finished products,organic sausage,chicken,turkey,pork,...,candles,light bulbs,sound storage medium,newspapers,photo/film,pot plants,flower soil/fertilizer,flower (seeds),shopping bags,bags
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
#mine the frequent itemsets with the apriori algorithm from the mlxtend library
#(apriori is imported with the other libraries in the first cell)

#fair warning - this was initially very memory-intensive to run. Likely cause (to be confirmed): min_support=0.001 is
#very low - with 9,835 transactions an itemset only has to show up in ~10 baskets to count as "frequent" - so a huge
#number of candidate itemsets has to be generated and checked. If memory is a problem, raise min_support or see
#whether your mlxtend version offers a low_memory option for apriori.
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True) #use_colnames=True keeps item names instead of column indices

In [7]:
frequent_itemsets.info() #about 13,500 frequent itemsets (rows) with 2 columns: support and itemsets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13492 entries, 0 to 13491
Data columns (total 2 columns):
support     13492 non-null float64
itemsets    13492 non-null object
dtypes: float64(1), object(1)
memory usage: 210.9+ KB


In [8]:
frequent_itemsets.head()
#observations: support is the share of all transactions that contain the itemset - e.g., sausage has a support of
#about 9%, frankfurter about 6%, etc.

Unnamed: 0,support,itemsets
0,0.058973,(frankfurter)
1,0.09395,(sausage)
2,0.005084,(liver loaf)
3,0.026029,(ham)
4,0.025826,(meat)


In [10]:
#let's check the end of the frequent_itemsets
frequent_itemsets.tail()
#observations: notice that these itemsets have more items in them, and supports sitting just above our 0.001 minimum

Unnamed: 0,support,itemsets
13487,0.001118,"(whipped/sour cream, root vegetables, yogurt, ..."
13488,0.001322,"(root vegetables, yogurt, tropical fruit, othe..."
13489,0.001017,"(root vegetables, yogurt, oil, tropical fruit,..."
13490,0.001118,"(root vegetables, yogurt, tropical fruit, othe..."
13491,0.001017,"(domestic eggs, yogurt, tropical fruit, other ..."


In [11]:
#rank the itemsets from most to least frequent and show the top five
(
    frequent_itemsets
    .sort_values(by='support', ascending=False)
    .head(5)
)
#observations: whole milk has the highest support (i.e., it is the most prevalent itemset across all the transactions)

Unnamed: 0,support,itemsets
24,0.255516,(whole milk)
22,0.193493,(other vegetables)
53,0.183935,(rolls/buns)
99,0.174377,(soda)
29,0.139502,(yogurt)


In [15]:
#we are typically most interested in itemsets with multiple items in them (so we can find some sort of meaningful relation!)...

frequent_itemsets['itemsets'].iloc[0] #observe - this pulls the itemset stored in the first row of our df

frozenset({'frankfurter'})

In [16]:
len(frequent_itemsets['itemsets'].iloc[0]) #observe - this counts the items in that first-row itemset

1

In [17]:
#add a 'length' column holding the number of items in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

In [18]:
#preview our df to confirm the new length column is there
frequent_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.058973,(frankfurter),1
1,0.09395,(sausage),1
2,0.005084,(liver loaf),1
3,0.026029,(ham),1
4,0.025826,(meat),1


In [21]:
#now let's narrow our frequent_itemsets df down to the paired itemsets (i.e., length == 2)
#that were pretty prevalent (i.e., support >= 0.02)

is_pair = frequent_itemsets['length'] == 2
is_prevalent = frequent_itemsets['support'] >= 0.02
frequent_itemsets[is_pair & is_prevalent]

#observations: we've returned 61 rows - all of them relatively prevalent paired itemsets

Unnamed: 0,support,itemsets,length
173,0.020539,"(frankfurter, whole milk)",2
255,0.026945,"(sausage, other vegetables)",2
257,0.029893,"(sausage, whole milk)",2
281,0.030605,"(sausage, rolls/buns)",2
303,0.024301,"(sausage, soda)",2
...,...,...,...
2387,0.024199,"(rolls/buns, bottled water)",2
2388,0.038332,"(soda, rolls/buns)",2
2524,0.021047,"(soda, pastry)",2
2856,0.028978,"(soda, bottled water)",2


In [22]:
#a specific itemset can be looked up by comparing against a set of its items (item order does not matter)
wanted_items = {'pastry', 'soda'}
frequent_itemsets[frequent_itemsets['itemsets'] == wanted_items]

Unnamed: 0,support,itemsets,length
2524,0.021047,"(soda, pastry)",2


In [25]:
#now let's derive the association rules from the frequent itemsets
#(association_rules is imported with the other libraries in the first cell)

#note: metric='confidence' is mlxtend's default, spelled out here so it is clear what the 0.1 threshold applies to -
#only rules with a confidence of at least 0.1 are kept (see notes for sig)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.1)

In [26]:
rules.info()

#observations: found about 44,000 rules (43,732), each described by 9 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43732 entries, 0 to 43731
Data columns (total 9 columns):
antecedents           43732 non-null object
consequents           43732 non-null object
antecedent support    43732 non-null float64
consequent support    43732 non-null float64
support               43732 non-null float64
confidence            43732 non-null float64
lift                  43732 non-null float64
leverage              43732 non-null float64
conviction            43732 non-null float64
dtypes: float64(7), object(2)
memory usage: 3.0+ MB


In [27]:
#let's see the first few rules - each row reads "antecedents -> consequents" along with its metrics
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(frankfurter),(sausage),0.058973,0.09395,0.010066,0.17069,1.81681,0.004526,1.092534
1,(sausage),(frankfurter),0.09395,0.058973,0.010066,0.107143,1.81681,0.004526,1.05395
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.07667
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652


In [28]:
#now it's just a matter of digging through these rules and finding something of interest!

#add an 'antecedent_len' column: the number of items on the antecedent side of each rule
rules['antecedent_len'] = rules['antecedents'].map(len)

In [29]:
#preview the rules again to confirm the new antecedent_len column is there
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(frankfurter),(sausage),0.058973,0.09395,0.010066,0.17069,1.81681,0.004526,1.092534,1
1,(sausage),(frankfurter),0.09395,0.058973,0.010066,0.107143,1.81681,0.004526,1.05395,1
2,(meat),(frankfurter),0.025826,0.058973,0.003254,0.125984,2.136302,0.001731,1.07667,1
3,(pork),(frankfurter),0.057651,0.058973,0.005897,0.102293,1.734568,0.002497,1.048256,1
4,(hamburger meat),(frankfurter),0.033249,0.058973,0.003355,0.100917,1.711246,0.001395,1.046652,1


In [31]:
#show me all the rules whose antecedent has at least 2 items, with a confidence of at least 75% and a lift of
#at least 1.2 - ordered from highest to lowest support
multi_item_antecedent = rules['antecedent_len'] >= 2
high_confidence = rules['confidence'] >= 0.75
high_lift = rules['lift'] >= 1.2

strong_rules = rules[multi_item_antecedent & high_confidence & high_lift]
strong_rules.sort_values(by='support', ascending=False)

#observations: Has returned 783 rules that satisfy our criteria above

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
24174,"(citrus fruit, root vegetables, tropical fruit)",(other vegetables),0.005694,0.193493,0.004474,0.785714,4.060694,0.003372,3.763701,3
27997,"(curd, yogurt, tropical fruit)",(whole milk),0.005287,0.255516,0.003965,0.750000,2.935237,0.002614,2.977936,3
30692,"(brown bread, other vegetables, root vegetables)",(whole milk),0.004067,0.255516,0.003152,0.775000,3.033078,0.002113,3.308818,3
38975,"(whole milk, citrus fruit, root vegetables, tr...",(other vegetables),0.003559,0.193493,0.003152,0.885714,4.577509,0.002463,7.056940,4
12017,"(butter, onions)",(whole milk),0.004067,0.255516,0.003050,0.750000,2.935237,0.002011,2.977936,2
...,...,...,...,...,...,...,...,...,...,...
34389,"(other vegetables, long life bakery product, s...",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
34167,"(other vegetables, chocolate, margarine)",(whole milk),0.001322,0.255516,0.001017,0.769231,3.010499,0.000679,3.226097,3
33592,"(other vegetables, whipped/sour cream, detergent)",(whole milk),0.001220,0.255516,0.001017,0.833333,3.261374,0.000705,4.466904,3
32853,"(herbs, whole milk, fruit/vegetable juice)",(other vegetables),0.001118,0.193493,0.001017,0.909091,4.698323,0.000800,8.871581,3


In [32]:
#show me all the rules that have yogurt on the antecedent side
has_yogurt = rules['antecedents'].map(lambda items: 'yogurt' in items)
rules[has_yogurt]

#observations: returns 7322 rules that include yogurt in their antecedents

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
78,(yogurt),(sausage),0.139502,0.093950,0.019624,0.140671,1.497289,0.006518,1.054368,1
393,(yogurt),(citrus fruit),0.139502,0.082766,0.021657,0.155248,1.875752,0.010111,1.085803,1
486,(yogurt),(tropical fruit),0.139502,0.104931,0.029283,0.209913,2.000475,0.014645,1.132873,1
596,(yogurt),(pip fruit),0.139502,0.075648,0.017997,0.129009,1.705378,0.007444,1.061264,1
711,(yogurt),(root vegetables),0.139502,0.108998,0.025826,0.185131,1.698475,0.010621,1.093429,1
...,...,...,...,...,...,...,...,...,...,...
43718,"(yogurt, domestic eggs, butter)","(other vegetables, whole milk, tropical fruit)",0.002949,0.017082,0.001017,0.344828,20.186782,0.000966,1.500243,3
43719,"(yogurt, domestic eggs, whole milk)","(other vegetables, butter, tropical fruit)",0.007728,0.005491,0.001017,0.131579,23.964425,0.000974,1.145193,3
43725,"(yogurt, butter, tropical fruit)","(other vegetables, domestic eggs, whole milk)",0.004575,0.012303,0.001017,0.222222,18.062443,0.000960,1.269896,3
43726,"(other vegetables, yogurt, butter)","(domestic eggs, whole milk, tropical fruit)",0.006406,0.006914,0.001017,0.158730,22.957516,0.000972,1.180461,3
