# Question 5: Market Basket Analysis using Association Rule Mining

In [8]:
import pandas as pd

In [9]:
# Relative path of the raw grocery purchase log.
grocery_data_path = '../data/GroceryData/grocery-data.csv'

# Read the CSV and, in the same chained step, replace the 'Date' strings
# (day-month-year, e.g. 21-07-2015) with parsed datetimes.
grocery_data = pd.read_csv(grocery_data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%d-%m-%Y')
)

# Preview the first rows, then report the (rows, columns) shape of the frame.
print(grocery_data.head())
print(f"Transaction Info : {grocery_data.shape}")

   Member_number       Date   itemDescription
0           1808 2015-07-21    tropical fruit
1           2552 2015-01-05        whole milk
2           2300 2015-09-19         pip fruit
3           1187 2015-12-12  other vegetables
4           3037 2015-02-01        whole milk
Transaction Info : (38765, 3)


In [10]:
def individual_itemset(data):
    """Report and return the distinct values of the 'itemDescription' column.

    Prints the number of distinct items followed by a sorted listing of them.

    Parameters
    ----------
    data : mapping or pandas.DataFrame
        Any object where ``data['itemDescription']`` yields an iterable of
        item descriptions.

    Returns
    -------
    set
        The distinct item descriptions exactly as stored (no trimming or
        case-folding is applied).
    """
    unique_items = set(data['itemDescription'])
    print(f"Total Unique Items : {len(unique_items)}")
    # A set has no stable iteration order, so printing it directly produces a
    # different listing on every kernel run. Print a sorted view instead so the
    # cell output is reproducible; key=str keeps the sort from raising if a
    # non-string value (e.g. a float NaN) is mixed in with the strings.
    print(f"Unique Items : {sorted(unique_items, key=str)}")
    return unique_items

# Print the distinct-item summary for the loaded purchases and keep the returned set.
unique_items = individual_itemset(grocery_data)

Total Unique Items : 167
Unique Items : {'pudding powder', 'salad dressing', 'detergent', 'ketchup', 'oil', 'citrus fruit', 'dessert', 'meat', 'light bulbs', 'baby cosmetics', 'roll products ', 'cereals', 'make up remover', 'grapes', 'cleaner', 'liver loaf', 'rubbing alcohol', 'newspapers', 'cake bar', 'specialty bar', 'cooking chocolate', 'candles', 'preservation products', 'nut snack', 'cookware', 'pasta', 'whole milk', 'instant coffee', 'rolls/buns', 'chicken', 'berries', 'ham', 'artif. sweetener', 'napkins', 'flower (seeds)', 'abrasive cleaner', 'cream', 'pork', 'condensed milk', 'canned fruit', 'beef', 'frankfurter', 'sweet spreads', 'white bread', 'cream cheese ', 'rice', 'syrup', 'dishes', 'red/blush wine', 'organic sausage', 'frozen chicken', 'domestic eggs', 'kitchen utensil', 'sauces', 'cling film/bags', 'soft cheese', 'specialty vegetables', 'prosecco', 'house keeping products', 'beverages', 'hygiene articles', 'seasonal products', 'candy', 'canned fish', 'softener', 'hair s

In [11]:
# Twenty most frequently purchased items. value_counts() sorts by descending
# frequency, so the first 20 entries are the most common descriptions.
df = (
    grocery_data["itemDescription"]
    .value_counts()
    .head(20)
    .rename_axis("Category")      # item description becomes the "Category" column
    .reset_index(name="Count")    # occurrence count becomes the "Count" column
)
print(df)

              Category  Count
0           whole milk   2502
1     other vegetables   1898
2           rolls/buns   1716
3                 soda   1514
4               yogurt   1334
5      root vegetables   1071
6       tropical fruit   1032
7        bottled water    933
8              sausage    924
9         citrus fruit    812
10              pastry    785
11           pip fruit    744
12       shopping bags    731
13         canned beer    717
14        bottled beer    687
15  whipped/sour cream    662
16          newspapers    596
17         frankfurter    580
18         brown bread    571
19                pork    566


In [12]:
# Build a "<member>_<date>" key per row, so rows with the same member number
# and the same date share one identifier.
grocery_data["Single_transaction"] = (
    grocery_data["Member_number"]
    .astype(str)
    .str.cat(grocery_data["Date"].astype(str), sep="_")
)

print(grocery_data.head())

   Member_number       Date   itemDescription Single_transaction
0           1808 2015-07-21    tropical fruit    1808_2015-07-21
1           2552 2015-01-05        whole milk    2552_2015-01-05
2           2300 2015-09-19         pip fruit    2300_2015-09-19
3           1187 2015-12-12  other vegetables    1187_2015-12-12
4           3037 2015-02-01        whole milk    3037_2015-02-01


In [13]:
# One list of item descriptions per "Single_transaction" key, in the
# (sorted-key) order in which groupby yields its groups.
transaction_data = [
    basket.tolist()
    for _, basket in grocery_data.groupby("Single_transaction")["itemDescription"]
]
print(transaction_data[:5])

[['whole milk', 'pastry', 'salty snack'], ['sausage', 'whole milk', 'semi-finished bread', 'yogurt'], ['soda', 'pickled vegetables'], ['canned beer', 'misc. beverages'], ['sausage', 'hygiene articles']]


In [14]:
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

# Learn the item vocabulary from the baskets, then encode every basket as one
# row with a flag per known item.
te = TransactionEncoder()
te.fit(transaction_data)
te_ary = te.transform(transaction_data)

# Label the encoded array with the item names the encoder discovered.
df = pd.DataFrame(data=te_ary, columns=te.columns_)
print(df.head())
# Column sums of the flags: how many transactions contain each item.
print(df.sum())

   Instant food products  UHT-milk  abrasive cleaner  artif. sweetener  \
0                  False     False             False             False   
1                  False     False             False             False   
2                  False     False             False             False   
3                  False     False             False             False   
4                  False     False             False             False   

   baby cosmetics   bags  baking powder  bathroom cleaner   beef  berries  \
0           False  False          False             False  False    False   
1           False  False          False             False  False    False   
2           False  False          False             False  False    False   
3           False  False          False             False  False    False   
4           False  False          False             False  False    False   

   ...  turkey  vinegar  waffles  whipped/sour cream  whisky  white bread  \
0  ...   False 

In [15]:
from mlxtend.frequent_patterns import apriori

# Mine frequent itemsets from the one-hot encoded transactions.
apriori_data = apriori(
    df,
    min_support=0.01,    # minimum support an itemset needs to be reported
    use_colnames=True,   # report itemsets by column (item) name
)
print(apriori_data)

     support                        itemsets
0   0.021386                      (UHT-milk)
1   0.033950                          (beef)
2   0.021787                       (berries)
3   0.016574                     (beverages)
4   0.045312                  (bottled beer)
..       ...                             ...
64  0.010559  (other vegetables, rolls/buns)
65  0.014837  (other vegetables, whole milk)
66  0.013968        (whole milk, rolls/buns)
67  0.011629              (whole milk, soda)
68  0.011161            (yogurt, whole milk)

[69 rows x 2 columns]


In [16]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets. The confidence floor
# is set very low here; the stricter selection is applied in a later step.
association_data = association_rules(
    apriori_data,
    metric="confidence",
    min_threshold=0.01,
)
print(association_data)

          antecedents         consequents  antecedent support  \
0  (other vegetables)        (rolls/buns)            0.122101   
1        (rolls/buns)  (other vegetables)            0.110005   
2  (other vegetables)        (whole milk)            0.122101   
3        (whole milk)  (other vegetables)            0.157923   
4        (whole milk)        (rolls/buns)            0.157923   
5        (rolls/buns)        (whole milk)            0.110005   
6        (whole milk)              (soda)            0.157923   
7              (soda)        (whole milk)            0.097106   
8            (yogurt)        (whole milk)            0.085879   
9        (whole milk)            (yogurt)            0.157923   

   consequent support   support  confidence      lift  representativity  \
0            0.110005  0.010559    0.086481  0.786154               1.0   
1            0.122101  0.010559    0.095990  0.786154               1.0   
2            0.157923  0.014837    0.121511  0.769430      

In [17]:
# Keep only rules whose confidence exceeds 0.1 AND whose lift exceeds 0.8.
# NOTE(review): lift below 1 means antecedent and consequent co-occur less
# often than independence would predict, and every rule kept in the recorded
# output has lift < 1 -- confirm 0.8 (rather than 1.0) is the intended cut-off.
filtered_data = association_data.loc[
    lambda rules: rules['confidence'].gt(0.1) & rules['lift'].gt(0.8)
]
print(filtered_data)

    antecedents   consequents  antecedent support  consequent support  \
5  (rolls/buns)  (whole milk)            0.110005            0.157923   
8      (yogurt)  (whole milk)            0.085879            0.157923   

    support  confidence      lift  representativity  leverage  conviction  \
5  0.013968    0.126974  0.804028               1.0 -0.003404    0.964550   
8  0.011161    0.129961  0.822940               1.0 -0.002401    0.967861   

   zhangs_metric   jaccard  certainty  kulczynski  
5      -0.214986  0.055000  -0.036752    0.107711  
8      -0.190525  0.047975  -0.033206    0.100317  
