### Association Rule Mining


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

Online Retail Market Basket Analysis
This dataset contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts

In [2]:
import pandas as pd
pd.set_option("max_colwidth", 150)

#f = "https://github.com/cs6220.spring2019/raw/master/data/Online%20Retail.xlsx"
f = "../../Downloads/Online Retail.xlsx"
df = pd.read_excel(f)
basket = (df[df["Country"] == "United Kingdom"]
         .groupby(["InvoiceNo", "Description"])["Quantity"]
         .sum().unstack().reset_index().fillna(0)
         .set_index("InvoiceNo")) # transform transactions into basket or items
basket_sets = basket.applymap(lambda x: True if x >=1 else False) #convert counts to booleans


In [3]:
basket_sets

Description,20713,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536366,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536367,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536368,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
536369,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C581484,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581490,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581568,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
basket_sets.shape

(22040, 4202)

Frequent Itemset Generation with Apriori algorithm

In [5]:
# Top 5 1-itemsets with highest support

itemsets_1 = apriori(basket_sets,use_colnames=True, max_len=2,low_memory=False, min_support=0.025)
itemsets_1['length'] = itemsets_1['itemsets'].apply(lambda x:len(x))
new_df = itemsets_1[(itemsets_1.length == 2)]
sorted_df1 = new_df.sort_values(by="support", ascending=False)
sorted_df1[:5]

Unnamed: 0,support,itemsets,length
132,0.035617,"(JUMBO BAG PINK POLKADOT, JUMBO BAG RED RETROSPOT)",2
130,0.031806,"(ROSES REGENCY TEACUP AND SAUCER , GREEN REGENCY TEACUP AND SAUCER)",2
134,0.03167,"(JUMBO STORAGE BAG SUKI, JUMBO BAG RED RETROSPOT)",2
133,0.029809,"(JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG RED RETROSPOT)",2
135,0.027541,"(LUNCH BAG RED RETROSPOT, LUNCH BAG BLACK SKULL.)",2


In [6]:
# Top 5 2-itemsets with the highest 

itemsets_2 = apriori(basket_sets,use_colnames=True, max_len=3, low_memory=False,min_support=0.016)
itemsets_2['length'] = itemsets_2['itemsets'].apply(lambda x:len(x))
new_df2 = itemsets_2[(itemsets_2.length == 3)]
sorted_df2 = new_df2.sort_values(by='support', ascending=False)
sorted_df2


Unnamed: 0,support,itemsets,length
413,0.022368,"(ROSES REGENCY TEACUP AND SAUCER , GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY TEACUP AND SAUCER)",3
416,0.018376,"(JUMBO BAG PINK POLKADOT, JUMBO STORAGE BAG SUKI, JUMBO BAG RED RETROSPOT)",3
417,0.017423,"(JUMBO STORAGE BAG SUKI, JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG RED RETROSPOT)",3
415,0.016606,"(JUMBO BAG PINK POLKADOT, JUMBO SHOPPER VINTAGE RED PAISLEY, JUMBO BAG RED RETROSPOT)",3
414,0.016289,"(ROSES REGENCY TEACUP AND SAUCER , GREEN REGENCY TEACUP AND SAUCER, REGENCY CAKESTAND 3 TIER)",3


In [7]:
# Highest support value for the 1-itemsets
print("The top highest support value", sorted_df1[:1])

The top highest support value       support                                            itemsets  length
132  0.035617  (JUMBO BAG PINK POLKADOT, JUMBO BAG RED RETROSPOT)       2


In [8]:
# Highest support value for the 2-itemsets
print("The top highest support value", sorted_df2[:1])

The top highest support value       support  \
413  0.022368   

                                                                                                itemsets  \
413  (ROSES REGENCY TEACUP AND SAUCER , GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY TEACUP AND SAUCER)   

     length  
413       3  


Associate Rule Generation

In [9]:
# Top 5 association rules itemset_1
association_rules(itemsets_1, metric="confidence", min_threshold=0.02)[:5]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),0.042196,0.039746,0.025544,0.605376,15.231158,0.023867,2.433341
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),0.039746,0.042196,0.025544,0.642694,15.231158,0.023867,2.680627
2,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),0.042377,0.031897,0.02618,0.617773,19.368019,0.024828,2.532797
3,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.031897,0.042377,0.02618,0.820768,19.368019,0.024828,5.342926
4,(ROSES REGENCY TEACUP AND SAUCER ),(GREEN REGENCY TEACUP AND SAUCER),0.043421,0.042377,0.031806,0.732497,17.285056,0.029966,3.579862


In [10]:
# Top 5 association rules itemset_1
association_rules(itemsets_2, metric="confidence", min_threshold=0.60).sort_values(by='confidence',ascending=False)[:6]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
29,"(ROSES REGENCY TEACUP AND SAUCER , PINK REGENCY TEACUP AND SAUCER)",(GREEN REGENCY TEACUP AND SAUCER),0.024773,0.042377,0.022368,0.90293,21.306837,0.021319,9.865319
30,"(GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY TEACUP AND SAUCER)",(ROSES REGENCY TEACUP AND SAUCER ),0.02618,0.043421,0.022368,0.854419,19.677538,0.021232,6.570786
10,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.031897,0.042377,0.02618,0.820768,19.368019,0.024828,5.342926
34,"(JUMBO BAG PINK POLKADOT, JUMBO SHOPPER VINTAGE RED PAISLEY)",(JUMBO BAG RED RETROSPOT),0.020599,0.087931,0.016606,0.806167,9.168178,0.014795,4.705447
35,"(JUMBO BAG PINK POLKADOT, JUMBO STORAGE BAG SUKI)",(JUMBO BAG RED RETROSPOT),0.022913,0.087931,0.018376,0.80198,9.120559,0.016361,4.605948
33,"(GREEN REGENCY TEACUP AND SAUCER, REGENCY CAKESTAND 3 TIER)",(ROSES REGENCY TEACUP AND SAUCER ),0.020599,0.043421,0.016289,0.790749,18.211187,0.015394,4.57144


In [11]:
# What items make up one of the top association rules? Do you think they are likely to be bought together?
Items that are the same are often purchased in different colors.

SyntaxError: invalid syntax (1953544074.py, line 2)

Association Rule Mining U.S Census Data
This dataset is an extraction from the 1994 U.S Census.

In [13]:
import numpy as np
import pandas as pd
import patsy 

path = "https://raw.githubusercontent.com/cs6220/cs6220.spring2019/master/data/adult/"

names = pd.read_table(path + "adult.names", delimiter= None, header=None)
parse_cols = lambda x: x.str.split(":", expand=True).iloc[:, 0]
columns = np.roll(parse_cols(names.iloc[92:108,0]), shift=-1)
df_adult = pd.read_csv(path + "adult.data", delimiter=None, header=None, index_col=False)
df_adult.columns = columns

In [14]:
df_adult[:10]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,">50K, <=50K."
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


Association Rule Mining

In [43]:
# Transform the raw dataset into a format appropriate for association rule mining by dropping all continous columns 
# and one-hot encoding the remaining columns. The values for each resulting column should be binary, reprsented by a 1 or 0.

df_adult_cont = df_adult.select_dtypes(include='number')
df_adult_cate =df_adult.select_dtypes(exclude='number')
#df_adult_cate
df_adult_encoding = pd.get_dummies(df_adult_cate, columns=['workclass','education','marital-status','occupation','relationship','race','sex','native-country','>50K, <=50K.'])

#df_adult['workclass'].value_counts() # check the counts of workclass values
df_adult_encoding = df_adult_encoding.drop('workclass_ ?', axis=1) # dropping workclass_?
df_adult_encoding


Unnamed: 0,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,...,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,">50K, <=50K._ <=50K",">50K, <=50K._ >50K"
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
32557,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
32558,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
32559,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [112]:
itemsets_bool = df_adult_encoding.applymap(lambda x: True if x >=1 else False) #boolen values
itemsets_bool

Unnamed: 0,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,education_ 11th,...,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,">50K, <=50K._ <=50K",">50K, <=50K._ >50K"
0,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
2,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
3,False,False,False,True,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
32557,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
32558,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
32559,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


In [137]:
# Use confidence for the rule interestingness (metric="confidence") and generate rules up to a 
# depth of 3(max_len). Generate rules and flag at least 5 examples that you find interesting. 
# Comment on your findings and share a few notes on what you learned from the rules you are 
# highlighting. Note that you can choose your support and confidence thresholds yourself in 
# this analysis.

itemsets_3 = apriori(itemsets_bool,use_colnames=True, max_len=3,low_memory=False, min_support=0.10) #min_support=0.35
min_3 = display(itemsets_3.loc[(itemsets_3['itemsets'].apply(len) >2)]) # gets a minimum of 3 itemsets
#min_3    
#itemsets_3
new_df3 = association_rules(itemsets_3, metric="confidence", min_threshold=0.1)
#new_df3
sorted_df3 = new_df3.sort_values(by='confidence', ascending=False)
sorted_df3[80:90]

Unnamed: 0,support,itemsets
102,0.101809,"(workclass_ Private, education_ HS-grad, marital-status_ Married-civ-spouse)"
103,0.203065,"(race_ White, education_ HS-grad, workclass_ Private)"
104,0.158441,"(workclass_ Private, education_ HS-grad, sex_ Male)"
105,0.219526,"(workclass_ Private, education_ HS-grad, native-country_ United-States)"
106,0.204570,"(workclass_ Private, education_ HS-grad, >50K, <=50K._ <=50K)"
...,...,...
232,0.580971,"(race_ White, >50K, <=50K._ <=50K, native-country_ United-States)"
233,0.205890,"(race_ White, >50K, <=50K._ >50K, native-country_ United-States)"
234,0.264427,"(sex_ Female, >50K, <=50K._ <=50K, native-country_ United-States)"
235,0.411197,"(>50K, <=50K._ <=50K, sex_ Male, native-country_ United-States)"


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
156,"(>50K, <=50K._ >50K)",(native-country_ United-States),0.24081,0.895857,0.220233,0.914552,1.020868,0.004502,1.218784
656,"(>50K, <=50K._ >50K, marital-status_ Married-civ-spouse)",(native-country_ United-States),0.205522,0.895857,0.187863,0.914077,1.020338,0.003745,1.212043
934,"(>50K, <=50K._ >50K, sex_ Male)",(race_ White),0.204601,0.854274,0.187003,0.91399,1.069903,0.012218,1.694293
906,(relationship_ Own-child),"(>50K, <=50K._ <=50K, native-country_ United-States)",0.155646,0.675624,0.142256,0.91397,1.352779,0.037098,3.770497
938,"(race_ White, >50K, <=50K._ <=50K)",(native-country_ United-States),0.635699,0.895857,0.580971,0.913909,1.02015,0.011476,1.209683
362,"(workclass_ Private, race_ White)",(native-country_ United-States),0.595928,0.895857,0.544455,0.913626,1.019835,0.010589,1.205722
375,"(workclass_ Private, >50K, <=50K._ >50K)",(race_ White),0.152422,0.854274,0.139185,0.913157,1.068929,0.008975,1.678052
628,"(>50K, <=50K._ >50K, marital-status_ Married-civ-spouse)",(race_ White),0.205522,0.854274,0.187494,0.912283,1.067905,0.011922,1.661332
674,"(relationship_ Not-in-family, marital-status_ Never-married)","(>50K, <=50K._ <=50K)",0.144529,0.75919,0.131446,0.909477,1.197957,0.021721,2.660211
754,"(occupation_ Craft-repair, sex_ Male)",(race_ White),0.119069,0.854274,0.108228,0.90895,1.064004,0.00651,1.600514


In [125]:
# Use lift for the rule interestingness (metric="lift") and generate rules up to a depth of 3 (max len=3). Generate rules 
# and flag at least 5 examples that you find interesting. Comment on your findings and share a few notes on what you learned 
# from the rules you are highlighting. Note that you can choose your support and confidence thresholds yourself in this 
# analysis.

new_df4 = association_rules(itemsets_3, metric="lift", min_threshold=0.02)
sorted_df4 = new_df4.sort_values(by='lift', ascending=False)

interestingness7 = new_df4[(new_df4['lift'] >= 1)]
interestingness
#sorted_df4


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(workclass_ Private),(race_ White),0.697030,0.854274,0.595928,0.854952,1.000795,0.000473,1.004681
1,(race_ White),(workclass_ Private),0.854274,0.697030,0.595928,0.697584,1.000795,0.000473,1.001832
6,(workclass_ Private),"(>50K, <=50K._ <=50K)",0.697030,0.759190,0.544609,0.781327,1.029158,0.015430,1.101232
7,"(>50K, <=50K._ <=50K)",(workclass_ Private),0.759190,0.697030,0.544609,0.717354,1.029158,0.015430,1.071907
8,(relationship_ Husband),(marital-status_ Married-civ-spouse),0.405178,0.459937,0.404902,0.999318,2.172729,0.218545,791.672741
...,...,...,...,...,...,...,...,...,...
121,(race_ White),"(>50K, <=50K._ <=50K, sex_ Male)",0.854274,0.464605,0.401861,0.470413,1.012501,0.004961,1.010967
124,"(race_ White, >50K, <=50K._ <=50K)",(native-country_ United-States),0.635699,0.895857,0.580971,0.913909,1.020150,0.011476,1.209683
126,"(>50K, <=50K._ <=50K, native-country_ United-States)",(race_ White),0.675624,0.854274,0.580971,0.859903,1.006589,0.003803,1.040181
127,(race_ White),"(>50K, <=50K._ <=50K, native-country_ United-States)",0.854274,0.675624,0.580971,0.680076,1.006589,0.003803,1.013916


In [103]:
# Compare the top rules using the two interestingness measures for the same levels of support 
# (use at least two different levels of support) and comment on your findings.


0.404871 	(relationship_ Husband, marital-status_ Married-civ-spouse, sex_ Male)
0.410553 	(marital-status_ Married-civ-spouse, native-country_ United-States)

# With 10% support and 94% confidence native white males are in executive positions
race_ White, occupation_ Exec-managerial) 	(native-country_ United-States) 	 0.105771 	0.944597

# With 10% support and 90% confidence race_white are in craft-repair occupations
(occupation_ Craft-repair, sex_ Male) 	(race_ White) 	 0.108228 	0.908950

SyntaxError: invalid syntax (3649040138.py, line 4)