In [26]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

  and should_run_async(code)




# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze which items are usually purchased together.

You can refer to these articles to learn about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of the transactions recorded in a store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [28]:
# Load the data set and show the first five transactions.
DATASET_URL = 'https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv'
df = pd.read_csv(DATASET_URL)
df.head()

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [29]:
# Collect every distinct product name that appears anywhere in the transactions.
# Missing cells are dropped *before* the set is built, so this neither raises a
# KeyError when the data happens to contain no missing value nor depends on the
# identity of the NaN object stored in the frame.
products = set(pd.Series(df.to_numpy().ravel()).dropna().unique())
products

  and should_run_async(code)


{'Bagel',
 'Bread',
 'Cheese',
 'Diaper',
 'Eggs',
 'Meat',
 'Milk',
 'Pencil',
 'Wine'}

## Preprocess Data

In this step, we will transform the dataset into a one-hot encoding in which each column is a product and each row marks which products were purchased in that transaction.

In [30]:
# Build an independent copy of the product names to iterate over.
itemset = set(products)

# Products that receive the flag 1; any other product receives 0.
# (These are the seven items listed in the first previewed transaction.)
flagged_products = ['Wine', 'Cheese', 'Diaper', 'Meat', 'Eggs', 'Bread', 'Pencil']

# Map every item to its 0/1 flag. A missing value, if one is present, is stored
# under the string key 'nan' with flag 0.
encoded_dict = {
    ('nan' if pd.isnull(item) else item): (0 if pd.isnull(item) else int(item in flagged_products))
    for item in itemset
}

print(encoded_dict)

{'Milk': 0, 'Cheese': 1, 'Meat': 1, 'Eggs': 1, 'Wine': 1, 'Bagel': 0, 'Bread': 1, 'Pencil': 1, 'Diaper': 1}


  and should_run_async(code)


In [34]:
# Replace missing item slots with the sentinel string 'NaN' so that every cell
# holds a string and "no product" becomes one more category.
df_with_nan = df.fillna('NaN')

# Flatten all cells of the frame into a single 1-D array of item names
# (the 'NaN' sentinel is kept as a category).
flattened_data = df_with_nan.values.flatten()

# Step 1: fit a OneHotEncoder on the individual cells to discover the categories.
# `sparse_output` is used here in place of the older `sparse` argument;
# unknown values seen at transform time are ignored.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(flattened_data.reshape(-1, 1))

# Step 2: one-hot frame of the flattened cells. Note that this has one row per
# *cell*, not per transaction, so it is not the basket matrix used below.
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0])

# Step 3: build the per-transaction 0/1 matrix: a column is 1 when that category
# (including the 'NaN' sentinel) occurs anywhere in the row. This is computed
# column-wise instead of assigning one scalar at a time inside an iterrows() loop.
product_data = pd.DataFrame(
    {
        category: df_with_nan.eq(category).any(axis=1).astype('int64')
        for category in encoder.categories_[0]
    },
    index=df.index,
)

# Display the first rows of the encoded transactions.
product_data.head()

  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,NaN,Pencil,Wine
0,0,1,1,1,1,1,0,0,1,1
1,0,1,1,1,0,1,1,0,1,1
2,0,0,1,0,1,1,1,1,0,1
3,0,0,1,0,1,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1,1


In [35]:
# The encoded frame contains a 'NaN' column that only marks empty item slots and
# is not a real product, so it is removed here.
# `errors='ignore'` keeps the cell safe to re-run once the column is gone, and
# reassigning the result avoids mutating the frame in place.
product_data = product_data.drop(columns=['NaN'], errors='ignore')

product_data.head()


  and should_run_async(code)


Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


Because the encoded DataFrame contains a column for the empty (`NaN`) placeholder, we drop that column (equivalently, we select every column except `NaN`) so that only real products remain.

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [38]:
# Set the minimum support threshold used when mining frequent itemsets.
from mlxtend.frequent_patterns import apriori, association_rules

# apriori works on a boolean one-hot frame; casting the 0/1 integers to bool
# avoids mlxtend's non-bool DeprecationWarning without changing the supports.
frequent_itemsets = apriori(product_data.astype(bool), min_support=0.2, use_colnames=True)
frequent_itemsets

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


Then we will generate association rules from the frequent itemsets, keeping only the rules whose confidence is at least 0.6.

In [39]:
# Mine the frequent itemsets (support >= 0.2) from the boolean one-hot frame;
# the bool cast avoids mlxtend's non-bool DeprecationWarning.
frequent_itemsets = apriori(product_data.astype(bool), min_support=0.2, use_colnames=True)

# Keep only the rules whose confidence is at least 0.6.
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
2,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
3,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
8,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
9,"(Meat, Cheese)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714,0.507042


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, and __conviction__, and interpret them for the case above (please use a text section).

1. Antecedent Support
Mengukur seberapa sering produk pada bagian awal (antecedent, bagian "jika") dari aturan muncul dalam transaksi.

- Dari case di atas:
 Pada baris pertama support antecedent untuk Bagel adalah 0.425397, yang artinya Bagel muncul di sekitar 42.54% dari semua transaksi.


2. Consequent Support
Mengukur seberapa sering produk yang ada di consequent (bagian "maka") muncul dalam transaksi.

- Dari case di atas:
Support consequent untuk Bread adalah 0.504762, yang artinya Bread muncul di sekitar 50.48% dari semua transaksi.


3. Support
Support mengukur frekuensi sebuah aturan dalam dataset, yaitu seberapa sering item muncul bersama-sama. Ini menunjukkan proporsi transaksi di mana baik antecedent (A) maupun consequent (B) muncul, sehingga membantu menemukan pola umum.

- Dari case di atas:
support untuk aturan {Bagel -> Bread} adalah 0.279365. Ini berarti bahwa 27.94% dari semua transaksi mengandung Bagel dan Bread dibeli bersama.


4. Confidence
Confidence mengukur seberapa besar kemungkinan consequent (B) muncul ketika antecedent (A) ada. Confidence menghitung probabilitas bahwa transaksi yang mengandung A juga akan mengandung B.

- Dari case di atas:
baris pertama, confidence untuk aturan {Bagel -> Bread} adalah 0.656716. Ini berarti, jika seseorang membeli Bagel, ada kemungkinan sekitar 65.67% bahwa mereka juga akan membeli Bread.


5. Lift
Mengevaluasi kekuatan suatu asosiasi dengan membandingkan confidence dari aturan tersebut dengan confidence yang diharapkan jika item-item tersebut independen. Nilai lift yang lebih besar dari 1 menunjukkan adanya korelasi positif, yang berarti kejadian A meningkatkan kemungkinan terjadinya B.

- Dari case di atas:
lift untuk aturan {Bagel -> Bread} adalah 1.301042. Ini berarti bahwa membeli Bagel meningkatkan kemungkinan membeli Bread sekitar 30% dibandingkan pembelian Bread secara acak.


6. Leverage
Leverage mengukur perbedaan antara support yang diobservasi untuk antecedent dan consequent terjadi bersama-sama dengan support yang diharapkan jika mereka independen. Dengan kata lain, menunjukkan apakah produk dibeli bersama-sama lebih sering (atau lebih jarang) daripada yang diharapkan.

- Dari case di atas:
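leverage untuk aturan {Bagel -> Bread} adalah 0.064641. Support yang diharapkan jika keduanya independen adalah 0.425397 × 0.504762 ≈ 0.214724, sedangkan support yang teramati adalah 0.279365, sehingga selisihnya 0.064641. Ini berarti Bagel dan Bread dibeli bersama sekitar 6.46 poin persentase lebih sering daripada yang diharapkan jika keduanya independen.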


7. Conviction
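Conviction membandingkan seberapa sering antecedent (A) diharapkan muncul tanpa consequent (B) jika keduanya independen dengan seberapa sering A benar-benar muncul tanpa B (yaitu saat aturan tersebut salah). Rumusnya: $\text{conviction}(A \rightarrow B) = \dfrac{1 - \text{support}(B)}{1 - \text{confidence}(A \rightarrow B)}$. Untuk {Bagel -> Bread}: $(1 - 0.504762) / (1 - 0.656716) \approx 1.4427$.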
- Conviction > 1: Semakin besar conviction, semakin jarang aturan dilanggar (lebih dapat diandalkan).
- Conviction = 1: Tidak ada kekuatan aturan, karena antecedent dan consequent independen.

- Dari case di atas:
Pada baris pertama conviction untuk aturan {Bagel -> Bread} adalah 1.442650. Ini menunjukkan bahwa aturan tersebut cukup andal, meskipun tidak sempurna (karena ada beberapa pelanggaran di mana Bagel dibeli tanpa Bread).


Referensi:
- https://herovired.com/learning-hub/topics/association-rules-in-data-mining/
- https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
- https://jurnal.amikom.ac.id/index.php/infos/article/download/561/235
