# **Market Basket Analysis**

**Market basket analysis is a technique used by businesses to understand customer buying habits. It analyzes past purchase data, specifically focusing on what items are frequently purchased together in a single transaction**

In [None]:
import numpy as np
import pandas as pd
import glob

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read only the columns used by this analysis.
# NOTE: 'location of the file' is a placeholder; replace it with the real CSV path.
df = pd.read_csv(
    'location of the file',
    usecols=[
        'Customer id',
        'Product name',
        'Product type',
        'Order',
        'Total gross sales',
        'Total orders',
        'Date',
    ],
)

In [None]:
# Keep only rows that actually name a product.
df = df[df['Product name'].notna()]

In [None]:
# Keep only rows whose 'Total orders' value is strictly positive.
df = df.loc[df['Total orders'].gt(0)]

In [None]:
# Show every column and the full width of each cell when frames are displayed.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Total number of distinct transactions (orders) in the data.
len(df['Order'].dropna().unique())

778217

In [None]:
# Total number of distinct products.
len(df['Product name'].dropna().unique())

2240

In [None]:
# Total number of distinct product categories.
len(df['Product type'].dropna().unique())

49

In [None]:
import matplotlib.pyplot as plt

# For every product, count the distinct orders it appears in and keep the ten largest.
top_10_products = (
    df.groupby('Product name')['Order']
    .nunique()
    .nlargest(10)
)

# Bar chart of the top products by order count.
fig, ax = plt.subplots(figsize=(12, 8))
top_10_products.plot.bar(ax=ax)
ax.set_xlabel('Product Name')
ax.set_ylabel('Order count')
ax.set_title('Top 10 Products by Number of Orders')
plt.show()

In [None]:
# One itemset per order: the distinct product types bought in that order.
transaction_category = df['Product type'].groupby(df['Order']).unique()
transaction_category

In [None]:
# One transaction per order: the distinct product names bought in that order.
transactions = df['Product name'].groupby(df['Order']).unique()
transactions

In [None]:
# Each transaction is a numpy array, which is unhashable, so value_counts()
# cannot count the raw values. Convert every basket to a sorted tuple first;
# sorting makes the same set of products count as one basket regardless of
# the order in which the items were listed.
(
    transactions
    .apply(lambda items: tuple(sorted(items)))
    .value_counts()
    .head(50)
    .plot(kind='bar', figsize=(15,5))
)

In [None]:
import matplotlib.pyplot as plt

# Basket size (number of distinct products) of every order, largest first.
transactions_count = transactions.map(len).sort_values(ascending=False)

# NOTE(review): this draws one horizontal bar per order; with the ~778k orders
# counted earlier the chart is slow to render and hard to read. Consider
# plotting only the largest N orders or a histogram of basket sizes.
fig, ax = plt.subplots(figsize=(12, 8))
transactions_count.plot.barh(ax=ax)
ax.set_xlabel('Number of Products')
ax.set_ylabel('Order')
ax.set_title('Number of Products per Order')
plt.show()

In [None]:
# The baskets are numpy arrays (unhashable), so they are converted to sorted
# tuples before counting; the ten most frequent baskets are then plotted.
(
    transactions
    .apply(lambda items: tuple(sorted(items)))
    .value_counts()
    .head(10)
    .plot(kind='bar', figsize=(15,5))
)

In [None]:
from itertools import permutations

# Flatten every transaction into one list of purchased items, then
# de-duplicate it to obtain the product catalogue.
flattened = [i for t in transactions for i in t]
product = list(set(flattened))

# Generate all possible one-to-one rules: every ordered pair (antecedent, consequent).
rules = list(permutations(product, 2))

# Show a small sample only; the full list has len(product) * (len(product) - 1) entries.
rules[:10]

In [None]:
# Number of candidate rules currently held in `rules`.
len(rules)

In [None]:
# First five orders with the products bought together in each.
transactions.head(5)

In [None]:
# Convert the per-order collection into a plain Python list of baskets.
transactions = list(transactions)

In [None]:
# Number of items in each transaction.
counts = list(map(len, transactions))


In [None]:
# Median number of items in a transaction.
np.median(np.asarray(counts))

In [None]:
# Maximum number of items in a transaction.
np.asarray(counts).max()

**Association Rule & Metrics**

Association rule: an "if-then" relationship between two itemsets.

Metric: a measure of the strength of association between two itemsets.

In [None]:
from mlxtend.preprocessing import TransactionEncoder

# Learn the item vocabulary from the list of transactions and one-hot encode
# it in one step, then wrap the result in a DataFrame with one column per
# item and one row per transaction.
encoder = TransactionEncoder()
onehot = pd.DataFrame(
    encoder.fit_transform(transactions),
    columns=encoder.columns_,
)

In [None]:
# Preview the first rows of the one-hot encoded transactions.
onehot.head()

**Support Metric :  frequency with which an itemset appears in a database of transactions**

In [None]:
# Support of each single item: the column-wise mean of the one-hot frame,
# i.e. the share of transactions that contain the item.
onehot.mean()

In [None]:
# The 60 items with the highest support, in descending order.
(
    onehot.mean()
    .sort_values(ascending=False)
    .head(60)
)

In [None]:
# Support per item, computed here so the cell does not rely on a variable
# that was never defined, then restricted to the 20 best-supported items.
top_20 = onehot.mean(axis=0).sort_values(ascending=False).head(20)

# Bar chart of the top 20 products by support.
top_20.plot(kind='bar', figsize=(12, 6))
plt.xlabel('Product')
plt.ylabel('Support Value')
plt.title('Support Metric for top 20')
plt.show()

**Even the most popular products are present in fewer than 3% of transactions (the top item has a support of about 2.7%).**

In [None]:
# True for transactions that contain BOTH the black and the slate-grey zip
# trench coat. The flag is kept in its own Series instead of being written
# into `onehot`, so it is not treated as an extra product by the later
# row sums and apriori runs.
zip_trench_black_and_grey = (
    onehot['Zip Trench Coat for Tall Men in Black']
    & onehot['Zip Trench Coat for Tall Men in Slate Grey']
)

# Support of the two-item itemset: share of transactions containing both coats.
zip_trench_black_and_grey.mean()

**A high support value indicates that items in an itemset are purchased together frequently**

In [None]:
# Row sums give the number of items flagged in each transaction;
# value_counts() then tallies how many transactions have each size.
count = onehot.sum(axis="columns").value_counts()
count

In [None]:
import matplotlib.pyplot as plt

# Basket-size distribution taken directly from `count` (size -> number of
# transactions), ordered by size, so the chart always reflects the current
# data instead of figures copied by hand from an earlier output.
data = count.sort_index().to_dict()

bins = list(data.keys())
values = list(data.values())

plt.figure(figsize=(12, 6))
plt.bar(bins, values)
plt.xlabel('Bins')
plt.ylabel('Count')
plt.title('Item Transaction Distribution Count')
plt.show()

**Confidence Metric  : probability of the consequent, given the antecedent.**

In [None]:
# Per-transaction flags for the two coats.
coat_black = onehot['Zip Trench Coat for Tall Men in Black']
coat_grey = onehot['Zip Trench Coat for Tall Men in Slate Grey']

# Joint support: share of transactions containing both coats.
joint_support = (coat_black & coat_grey).mean()

# Confidence of Black -> Slate Grey = joint support / support(Black).
joint_support / coat_black.mean()

In [None]:
# Per-transaction flags for the two coats.
coat_black = onehot['Zip Trench Coat for Tall Men in Black']
coat_grey = onehot['Zip Trench Coat for Tall Men in Slate Grey']

# Joint support: share of transactions containing both coats.
joint_support = (coat_black & coat_grey).mean()

# Confidence of Slate Grey -> Black = joint support / support(Slate Grey).
joint_support / coat_grey.mean()

0.03125

**A customer who buys the grey coat can be recommended the black one, because the rule grey → black has the higher confidence of the two directions.**

In [None]:
#Apriori Algorithm

**Apriori Algorithm:  identifies frequent itemsets using the Apriori principle**

In [None]:
from mlxtend.frequent_patterns import apriori

# Run apriori with a minimum support threshold of 0.01 (1% of transactions),
# reporting itemsets by column name rather than column index.
frequent_itemsets = apriori(
    onehot,
    min_support=0.01,
    use_colnames=True,
)

# Display the frequent itemsets.
frequent_itemsets

**We will apply the apriori algorithm to find the most frequently purchased items in the dataset. For our minimum support value, we will specify 1%, which means only items whose frequency of occurrence in the data is at least 1% will be returned**

In [None]:
from mlxtend.frequent_patterns import apriori

# Re-run apriori with a lower minimum support threshold of 0.005 (0.5%).
frequent_itemsets = apriori(
    onehot,
    min_support=0.005,
    use_colnames=True,
)

# Display the frequent itemsets.
frequent_itemsets

 **Lowering the support threshold increased the number of itemsets returned and even yielded itemsets with more than one item**

In [None]:
# Frequent itemsets ordered from highest to lowest support.
frequent_itemsets.sort_values(by='support', ascending=False)

**From the results, we can see that the “Wearever Fleece Joggers for Tall Men in Black” is the most frequently purchased item with a support value of 0.027239, meaning it was purchased in 2.7% of all transactions.**

In [None]:
# Recompute the frequent itemsets at the 0.005 support threshold so the
# rule-mining cells below start from this result.
frequent_itemsets = apriori(
    onehot,
    min_support=0.005,
    use_colnames=True,
)
frequent_itemsets

**Association Rule from apriori algorithm**

In [None]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, keeping rules whose
# support is at least 0.0001.
rules = association_rules(
    frequent_itemsets,
    metric='support',
    min_threshold=0.0001,
)

# Display the rules.
rules

In [None]:
# Derive association rules, keeping those with confidence of at least 0.005.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.005,
)

# Display the rules.
rules

In [None]:
# Derive association rules, keeping those with confidence of at least 0.01.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.01,
)

# Display the rules.
rules

In [None]:
# Keep only the rules whose consequent support is above 0.02.
rules = rules.loc[rules['consequent support'].gt(0.02)]

rules

In [None]:
# Keep only the rules with strictly positive leverage.
rules = rules.loc[rules['leverage'].gt(0.0)]

rules

In [None]:
# Rules with lift of at least 1, ordered from strongest to weakest lift,
# with a fresh 0..n-1 index.
assoc_rules = (
    association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    .sort_values("lift", ascending=False)
    .reset_index(drop=True)
)
assoc_rules

**Based on the results from implementing association rules, we can see that “A.T. Performance Engineered Joggers for Tall Men in Charcoal Mix” and “A.T. Performance Engineered Joggers for Tall Men in Black” have the highest “lift” value, and therefore the highest association of any two products. With a combined support of 0.005114, it means both items were purchased together in 0.5114% of all transactions.**

In [None]:
#Reference Articles

In [None]:
# https://select-statistics.co.uk/blog/market-basket-analysis-understanding-customer-behaviour/

In [None]:
# https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [None]:
# https://deepnote.com/app/code-along-tutorials/Market-Basket-Analysis-in-Python-An-Implementation-with-Online-Retail-Data-6231620b-cba3-4935-bde8-8ce1490868bf