In [None]:
# The Objective of this assignment is to introduce students to rule mining techniques, 
# particularly focusing on market basket analysis and provide hands on experience.
# Dataset: Use the Online retail dataset to apply the association rules.

In [None]:
# Calculate parameters for all possible combination of items
# Support - Support(itemB) = (Transactions containing itemB)/(Total transactions in dataset)
# Confidence - Confidence(A->B) = (Transactions containing itemA and itemB) / No of A transactions
# Lift - Lift(A->B) = (Confidence(A->B))/Support B
# Lift(A->B) refers to the increase in the ratio of sale of B when A is sold

In [1]:
# Installing apyori package

!pip install apyori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py): started
  Building wheel for apyori (setup.py): finished with status 'done'
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5975 sha256=e9c0a17b6dbca6cf054ca9a83d68137122127b7cc7d4735fb34c810d7a84583e
  Stored in directory: c:\users\mihir\appdata\local\pip\cache\wheels\84\45\a4\8ade6576f75410d8162c6da1de0aa9df56c16c711acab5a813
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [3]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from apyori import apriori

In [5]:
# Loading the dataset
# The dataset given in assignment had only one column with comma separated values 
# So the text had to be converted into columns based on comma

df = pd.read_csv("Online retail.csv")
df

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7496,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7497,chicken,,,,,,,,,,,,,,,,,,,
7498,escalope,green tea,,,,,,,,,,,,,,,,,,


In [6]:
# Make header as None

df = pd.read_csv("Online retail.csv", header = None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,


In [7]:
# Exploratory analysis

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
 1   1       5747 non-null   object
 2   2       4389 non-null   object
 3   3       3345 non-null   object
 4   4       2529 non-null   object
 5   5       1864 non-null   object
 6   6       1369 non-null   object
 7   7       981 non-null    object
 8   8       654 non-null    object
 9   9       395 non-null    object
 10  10      256 non-null    object
 11  11      154 non-null    object
 12  12      87 non-null     object
 13  13      47 non-null     object
 14  14      25 non-null     object
 15  15      8 non-null      object
 16  16      4 non-null      object
 17  17      4 non-null      object
 18  18      3 non-null      object
 19  19      1 non-null      object
dtypes: object(20)
memory usage: 1.1+ MB


In [8]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,7501,5747,4389,3345,2529,1864,1369,981,654,395,256,154,87,47,25,8,4,4,3,1
unique,115,117,115,114,110,106,102,98,88,80,66,50,43,28,19,8,3,3,3,1
top,mineral water,mineral water,mineral water,mineral water,green tea,french fries,green tea,green tea,green tea,green tea,low fat yogurt,green tea,green tea,green tea,magazines,salmon,frozen smoothie,protein bar,spinach,olive oil
freq,577,484,375,201,153,107,96,67,57,31,22,15,8,4,3,1,2,2,1,1


In [9]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [10]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,
7500,eggs,frozen smoothie,yogurt cake,low fat yogurt,,,,,,,,,,,,,,,,


In [11]:
df.shape

(7501, 20)

In [None]:
# Preprocessing

# Apriori algorithm requires dataset as list of lists
# The dataset is a big list and each transaction is an inner list with outer big list
# Here we convert pandas dataframe into list of lists

In [12]:
records = []
for i in range(0,7501):
    records.append([str(df.values[i,j]) for j in range(0,20)])

In [None]:
# Applying Apriori algorithm

# This algorithm is used to speed up the process of lift calculations in large datasets
# Set minimum value for support and confidence i.e 
# finding rules for items that have certain default existence and a minimum value for co-occurrence
# with other items
# Extract all the subsets having higher value of support than minimum threshold
# Select all the rules from subsets with confidence value higher than minimum threshold
# Order the rules by descending order of lift

# first parameter - list of list you want to extract the rules
# second parameter - min support for items with support > min support
# third parameter - min confidence for rules having confidence > confidence threshold
# fourth parameter - min lift specifying min lift value for short listed rules
# fifth parameter - min length i.e min no of items to be there in association rules

In [16]:
association_rules = apriori(records,min_support=0.0045,min_confidence=0.2,min_lift=3,min_length=2)
association_results = list(association_rules)

In [17]:
print(len(association_results))

48


In [18]:
print(association_results[0])

RelationRecord(items=frozenset({'light cream', 'chicken'}), support=0.004532728969470737, ordered_statistics=[OrderedStatistic(items_base=frozenset({'light cream'}), items_add=frozenset({'chicken'}), confidence=0.29059829059829057, lift=4.84395061728395)])


In [None]:
# Support - 35/7500 = 0.0045
# No of transactions - 35 among 7500 transactions
# Confidence level - 0.2905 all transactions containing light cream, 29.05% also contain chicken
# Lift - 4.84 Chicken is 4.84 times more likely to be brought by customers buying light cream

In [None]:
# Calculating support, confidence and lift for each rule

In [21]:
for item in association_results:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])
    
    # second index of the inner list
    print("Support: " + str(item[1]))
    
    # third index of the list located at 0th
    # of the third index of the inner list
    
    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: light cream -> chicken
Support: 0.004532728969470737
Confidence: 0.29059829059829057
Lift: 4.84395061728395
Rule: mushroom cream sauce -> escalope
Support: 0.005732568990801226
Confidence: 0.3006993006993007
Lift: 3.790832696715049
Rule: pasta -> escalope
Support: 0.005865884548726837
Confidence: 0.3728813559322034
Lift: 4.700811850163794
Rule: ground beef -> herb & pepper
Support: 0.015997866951073192
Confidence: 0.3234501347708895
Lift: 3.2919938411349285
Rule: ground beef -> tomato sauce
Support: 0.005332622317024397
Confidence: 0.3773584905660377
Lift: 3.840659481324083
Rule: whole wheat pasta -> olive oil
Support: 0.007998933475536596
Confidence: 0.2714932126696833
Lift: 4.122410097642296
Rule: shrimp -> pasta
Support: 0.005065991201173177
Confidence: 0.3220338983050847
Lift: 4.506672147735896
Rule: light cream -> chicken
Support: 0.004532728969470737
Confidence: 0.29059829059829057
Lift: 4.84395061728395
Rule: shrimp -> chocolate
Support: 0.005332622317024397
Confidence: 0.

In [None]:
# =====================================
# Rule: mushroom cream sauce -> escalope
# Support: 0.005732568990801226
# Confidence: 0.3006993006993007
# Lift: 3.790832696715049
# =====================================

# Mushroom cream sauce and escalope are brought frequently
# Support for mushroom cream sauce is 0.0057
# Confidence is 0.3006 showing all transactions containing mushroom, 36.06% contain escalope mostly
# Lift is 3.79 shows escalope is 3.79 more likely to be brought by customers 
# who buy mushroom cream sauce

In [None]:
# Interview questions

# 1.What is lift and why is it important in Association rules?

# Lift is a measure of the strength of the association between two items, 
# taking into account the frequency of both items in the dataset. 
# It is calculated as the confidence of the association divided by the support of the second item.
# A lift value of 1 indicates that the two items are independent, a value greater than 1 
# indicates a positive association, while a value less than 1 indicates a negative association.

In [None]:
# 2.What is support and Confidence. How do you calculate them?

# In data mining, support refers to the relative frequency of an item set in a dataset. 
# For example, if an itemset occurs in 5% of the transactions in a dataset, it has a support of 5%. 
# Support is often used as a threshold for identifying frequent item sets in a dataset, 
# which can be used to generate association rules. For example, if we set the 
# support threshold to 5%, then any itemset that occurs in more than 5% of the transactions in the 
# dataset will be considered a frequent itemset.
# The support of an itemset is the number of transactions in which the itemset appears, 
# divided by the total number of transactions

# In data mining, confidence is a measure of the reliability or support for a given association rule. 
# It is defined as the proportion of cases in which the association rule holds true, 
# or in other words, the percentage of times that the items in the antecedent 
# (the “if” part of the rule) appear in the same transaction as the items in the consequent 
# (the “then” part of the rule).

# Confidence is a measure of the likelihood that an itemset will appear if another itemset appears.

In [None]:
# 3.What are some limitations or challenges of Association rules mining?

# One of the main challenges of association rule mining is the potential to generate an 
# overwhelming number of rules from a large dataset, which can be costly and complex to analyze. 
# To address this issue, you can use techniques to reduce the search space and filter out 
# irrelevant or redundant rules.

# Some of the main drawbacks of association rule algorithms in e-learning are: 
# the used algorithms have too many parameters for somebody non expert in data mining and the 
# obtained rules are far too many, most of them non-interesting and with low comprehensibility.