In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend



In [2]:
from mlxtend.frequent_patterns import apriori, association_rules

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [3]:
# load the data set ans show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,
...,...,...,...,...,...,...,...
310,Bread,Eggs,Cheese,,,,
311,Meat,Milk,Pencil,,,,
312,Bread,Cheese,Eggs,Meat,Pencil,Diaper,Wine
313,Meat,Cheese,,,,,


# Get the set of product that has been purchased


In [4]:
df.isna().sum()

0      0
1     30
2     70
3    128
4    182
5    244
6    274
dtype: int64

## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [5]:
#create an itemset based on the products
df = pd.get_dummies(df, columns = ['0', '1','2','3','4','5','6'])
print(df)
# encoding the feature

     0_Bagel  0_Bread  0_Cheese  0_Diaper  0_Eggs  0_Meat  0_Milk  0_Pencil  \
0          0        1         0         0       0       0       0         0   
1          0        1         0         0       0       0       0         0   
2          0        0         1         0       0       0       0         0   
3          0        0         1         0       0       0       0         0   
4          0        0         0         0       0       1       0         0   
..       ...      ...       ...       ...     ...     ...     ...       ...   
310        0        1         0         0       0       0       0         0   
311        0        0         0         0       0       1       0         0   
312        0        1         0         0       0       0       0         0   
313        0        0         0         0       0       1       0         0   
314        0        0         0         0       1       0       0         0   

     0_Wine  1_Bagel  ...  5_Wine  6_Bagel  6_Bread

In [6]:
df.iloc[:,0:5]

Unnamed: 0,0_Bagel,0_Bread,0_Cheese,0_Diaper,0_Eggs
0,0,1,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,0,0,0
...,...,...,...,...,...
310,0,1,0,0,0
311,0,0,0,0,0
312,0,1,0,0,0
313,0,0,0,0,0


In [7]:
df.sum().sort_values()

5_Bread      1
6_Bagel      1
6_Meat       2
6_Eggs       2
5_Wine       3
            ..
1_Meat      47
1_Bagel     47
2_Eggs      52
0_Cheese    56
0_Bread     74
Length: 63, dtype: int64

In [8]:
  # create new dataframe from the encoded features

  # show the new dataframe


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [9]:
# Building the model
frq_items = apriori(df, min_support = 0.02, use_colnames = True)



The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [10]:
# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 0.6)
print(rules)

    antecedents                         consequents  antecedent support  \
0     (1_Bagel)                           (0_Bread)            0.149206   
1     (0_Bread)                           (1_Bagel)            0.234921   
2     (0_Bread)                          (1_Cheese)            0.234921   
3    (1_Cheese)                           (0_Bread)            0.111111   
4    (1_Diaper)                           (0_Bread)            0.095238   
..          ...                                 ...                 ...   
237    (3_Milk)  (4_Wine, 0_Cheese, 2_Eggs, 1_Meat)            0.142857   
238  (0_Cheese)    (3_Milk, 4_Wine, 2_Eggs, 1_Meat)            0.177778   
239    (4_Wine)  (3_Milk, 0_Cheese, 2_Eggs, 1_Meat)            0.114286   
240    (2_Eggs)  (3_Milk, 0_Cheese, 1_Meat, 4_Wine)            0.165079   
241    (1_Meat)  (3_Milk, 0_Cheese, 2_Eggs, 4_Wine)            0.149206   

     consequent support   support  confidence      lift  leverage  conviction  
0              0.23

In [13]:
rules[(rules['lift'] >= 3) & (rules['confidence'] >= 0.3)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(2_Milk),(0_Bread),0.142857,0.234921,0.104762,0.733333,3.121622,0.071202,2.869048
9,(0_Bread),(2_Milk),0.234921,0.142857,0.104762,0.445946,3.121622,0.071202,1.547038
18,(0_Cheese),(1_Meat),0.177778,0.149206,0.098413,0.553571,3.710106,0.071887,1.905778
19,(1_Meat),(0_Cheese),0.149206,0.177778,0.098413,0.659574,3.710106,0.071887,2.415278
20,(0_Cheese),(2_Eggs),0.177778,0.165079,0.101587,0.571429,3.461538,0.072240,1.948148
...,...,...,...,...,...,...,...,...,...
233,"(0_Cheese, 1_Meat)","(3_Milk, 4_Wine, 2_Eggs)",0.098413,0.038095,0.038095,0.387097,10.161290,0.034346,1.569424
234,"(4_Wine, 2_Eggs)","(3_Milk, 0_Cheese, 1_Meat)",0.047619,0.098413,0.038095,0.800000,8.129032,0.033409,4.507937
235,"(4_Wine, 1_Meat)","(3_Milk, 0_Cheese, 2_Eggs)",0.053968,0.101587,0.038095,0.705882,6.948529,0.032613,3.054603
236,"(2_Eggs, 1_Meat)","(3_Milk, 0_Cheese, 4_Wine)",0.098413,0.038095,0.038095,0.387097,10.161290,0.034346,1.569424
