In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 86 kB/s eta 0:00:012
Collecting scikit-learn>=1.0.2
  Downloading scikit_learn-1.1.3-cp39-cp39-macosx_10_9_x86_64.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 146 kB/s eta 0:00:01
Installing collected packages: scikit-learn, mlxtend
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed mlxtend-0.21.0 scikit-learn-1.1.3


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

## Load Data

We will use a dataset of the transactions recorded in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [2]:
# Load the data set and show the first five transactions.
# Each row is one transaction; the CSV is fetched over the network.
DATA_URL = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"

df = pd.read_csv(DATA_URL)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [8]:
# Collect every distinct value that occurs in any column of the transactions.
#
# The former guard `df[col].unique != " "` compared the bound method itself
# (it was never called) with a string, so it was always True and filtered
# nothing. It is removed here; the resulting set is unchanged.
#
# NOTE: NaN (the filler for baskets with fewer items than columns) is still a
# member of the set. The encoding step below turns it into a NaN column that
# is dropped afterwards, so it must not be removed here in isolation.
items = set()
for col in df:
    items.update(df[col].unique())
print(items)

{nan, 'Meat', 'Wine', 'Diaper', 'Eggs', 'Milk', 'Cheese', 'Bread', 'Bagel', 'Pencil'}


## Preprocess Data

In this step, we will transform our dataset into a one-hot encoding based on the purchased products.

In [9]:
# Work on a separate copy of the product set collected earlier.
itemset = set(items)

# One-hot encode every transaction: each row becomes a dict that maps every
# known item to 1 when the row contains it and to 0 otherwise.
encoded = []
for _, basket in df.iterrows():
    purchased = set(basket)
    absent = itemset - purchased
    present = itemset.intersection(purchased)
    # Absent items are inserted first, then present ones; the key order of
    # the first dict decides the column order of the DataFrame built later.
    encoded.append({**dict.fromkeys(absent, 0), **dict.fromkeys(present, 1)})
encoded[0]

{nan: 0,
 'Milk': 0,
 'Bagel': 0,
 'Meat': 1,
 'Wine': 1,
 'Diaper': 1,
 'Eggs': 1,
 'Cheese': 1,
 'Bread': 1,
 'Pencil': 1}

In [10]:
  # create new dataframe from the encoded features
encoded_df = pd.DataFrame(data=encoded)
# Preview the first five rows of the one-hot encoded frame.
encoded_df.head(n=5)

Unnamed: 0,NaN,Milk,Bagel,Meat,Wine,Diaper,Eggs,Cheese,Bread,Pencil
0,0,0,0,1,1,1,1,1,1,1
1,0,1,0,1,1,1,0,1,1,1
2,1,1,0,1,1,0,1,1,0,0
3,1,1,0,1,1,0,1,1,0,0
4,1,0,0,1,1,0,0,0,0,1


Since the encoded dataframe contains a column for the empty (NaN) entries, we will drop that NaN column (here it is the first column) and keep all the other columns.

In [13]:
# Remove the column whose label is NaN: it marks empty basket slots, not a product.
#
# The column is selected by label rather than by position because the column
# order comes from set iteration order in the encoding step, so NaN is not
# guaranteed to be the first column. Rebinding instead of dropping in place
# also makes the cell safe to re-run: the positional in-place drop removed a
# real product column on every additional execution.
encoded_df = encoded_df.loc[:, encoded_df.columns.notna()]
encoded_df.head()

Unnamed: 0,Milk,Bagel,Meat,Wine,Diaper,Eggs,Cheese,Bread,Pencil
0,0,0,1,1,1,1,1,1,1
1,1,0,1,1,1,0,1,1,1
2,1,0,1,1,0,1,1,0,0
3,1,0,1,1,0,1,1,0,0
4,0,0,1,1,0,0,0,0,1


## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [15]:
from mlxtend.frequent_patterns import apriori, association_rules

# Minimum fraction of transactions an itemset must appear in to be kept.
MIN_SUPPORT = 0.2

freq_product = apriori(
    encoded_df,
    min_support=MIN_SUPPORT,
    use_colnames=True,  # report item names instead of column indices
    verbose=1,
)
freq_product

Processing 72 combinations | Sampling itemset size 2Processing 156 combinations | Sampling itemset size 3




Unnamed: 0,support,itemsets
0,0.501587,(Milk)
1,0.425397,(Bagel)
2,0.47619,(Meat)
3,0.438095,(Wine)
4,0.406349,(Diaper)
5,0.438095,(Eggs)
6,0.501587,(Cheese)
7,0.504762,(Bread)
8,0.361905,(Pencil)
9,0.225397,"(Milk, Bagel)"


Then we will generate association rules from the frequent itemsets, using confidence as the metric with a threshold of 0.6.

In [16]:
# Only rules whose confidence reaches this threshold are generated.
MIN_CONFIDENCE = 0.6

rules = association_rules(
    freq_product,
    metric='confidence',
    min_threshold=MIN_CONFIDENCE,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
2,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
3,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
4,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754


In [17]:
# Display the full rule table rather than only the first five rows.
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
2,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
3,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
4,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
5,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
8,"(Milk, Meat)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137
9,"(Milk, Cheese)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429
