### PART A: Frequent Itemset Generation

In [19]:
# Importing the libraries and the dataset

import numpy as np 
import pandas as pd

Market_Data = pd.read_csv('Market_Basket_Optimisation.csv',index_col=None, header = None ) # Use your local path here

Market_Data.head(10)
Market_Data.shape
7501*0.014

105.014

In [2]:
# Converting the Market Dataset into a nested list
dataSet = []

for index, transaction in Market_Data.iterrows():
    cleaned_transaction = transaction[~transaction.isnull()].tolist()
    dataSet.append(cleaned_transaction)

#### 1. The createItem function

In [3]:
# For the given dataset writing a function to return the list of distinct items in the dataset

def createItem(dataSet):
    
    itemList = []
    
    for transaction in dataSet:
        for item in transaction:
            if not [item] in itemList:
                
                 # creating unique single lists in Itemlist. ie list of all items
                itemList.append([item])
                
    itemList.sort()
    
    return list(map(frozenset, itemList))

#### 2. The scanData function

In [4]:
# Returns an Itemset and dictionary with Support values

def scanData(data, itemsetk, minSupport):
    
    tempDict = {}
    
    for transaction in data:
        for item in itemsetk:
            if item.issubset(transaction):
                if not item in tempDict: 
                    tempDict[item]=1 # tempDict contains number of all items
                else: 
                    tempDict[item] += 1
    
    numItems = float(len(data))
    freqItemset = []
    supportDict = {}
    
    for key in tempDict:
        support = tempDict[key]/numItems 
        
        if support >= minSupport:
            freqItemset.insert(0,key) # freqItemset contains all frequent items
        supportDict[key] = support # contains support of all items
    
    return freqItemset, supportDict

#### 3. The itemSetGenerator function

In [5]:
# Creating Higher order Itemsets

def itemSetGenerator(itemsetk, k):
 
    higherOrderitemset = []
    lenitemsetk = len(itemsetk)
    
    
    for i in range(lenitemsetk):
        for j in range(i+1, lenitemsetk): 
            L1 = list(itemsetk[i])[:k-2] 
            L2 = list(itemsetk[j])[:k-2]
            L1.sort() 
            L2.sort()
            
            # Two frequent itemsets of order k are merged only if their k-1 itemsets are identical
            if L1 == L2:
                higherOrderitemset.append(itemsetk[i] | itemsetk[j]) # Performing set union creates itemset with n+1 items
               
    return higherOrderitemset

#### 4. Frequent Itemsets Generation

In [6]:
def apriori(dataSet, minSupport):
    
    itemList = createItem(dataSet) # Creating frozenset of items
    
    
    # Generating all the frequent 1-itemsets and the support of those items
    freqItemset1, supportDict = scanData(dataSet, itemList, minSupport)
    freqItemsets = [freqItemset1]
    k = 2 
    
    while (len(freqItemsets[k-2]) > 0): # Incrementing k until we no longer find any kth order itemsets
        
        itemsetk = itemSetGenerator(freqItemsets[k-2], k) # Generating itemsets of order k
        
        # Generating the frequent itemset for the kth order and support for each of these itemsets
        freqItemsetk, supportDictk = scanData(dataSet, itemsetk, minSupport) 
        
        supportDict.update(supportDictk)
        freqItemsets.append(freqItemsetk)
        
        k += 1
    return freqItemsets, supportDict

Calculate the maximum possible two itemsets.
- 140
- 120
- 7140
- 5240

In [13]:
# Identify the distinct items involved in the dataset

itemList = createItem(dataSet)
freqItemset1, supportDict = scanData(dataSet, itemList, minSupport = 0.05)

freqItemsets = [freqItemset1]
itemsetk = itemSetGenerator(freqItemsets[k-2], k)
#itemsetk
freqItemsetk, supportDictk = scanData(dataSet, itemsetk, minSupport = 0)
supportDict.update(supportDictk)
freqItemsets.append(freqItemsetk)
supportDict

{frozenset({'almonds'}): 0.020397280362618318,
 frozenset({'antioxydant juice'}): 0.008932142381015865,
 frozenset({'avocado'}): 0.03332888948140248,
 frozenset({'cottage cheese'}): 0.03186241834422077,
 frozenset({'energy drink'}): 0.026663111585121985,
 frozenset({'frozen smoothie'}): 0.06332489001466471,
 frozenset({'green grapes'}): 0.009065457938941474,
 frozenset({'green tea'}): 0.13211571790427942,
 frozenset({'honey'}): 0.047460338621517134,
 frozenset({'low fat yogurt'}): 0.07652313024930009,
 frozenset({'mineral water'}): 0.23836821757099053,
 frozenset({'olive oil'}): 0.0658578856152513,
 frozenset({'salad'}): 0.004932675643247567,
 frozenset({'salmon'}): 0.04252766297826956,
 frozenset({'shrimp'}): 0.07145713904812692,
 frozenset({'spinach'}): 0.007065724570057326,
 frozenset({'tomato juice'}): 0.030395947207039063,
 frozenset({'vegetables mix'}): 0.025729902679642713,
 frozenset({'whole weat flour'}): 0.009332089054792695,
 frozenset({'yams'}): 0.011465137981602452,
 froze

In [17]:
# Calculate the number of possible two itemsets
k = 3
itemsetk = itemSetGenerator(freqItemsets[k-2], k)
freqItemsetk, supportDictk = scanData(dataSet, itemsetk, minSupport = 0)


In [None]:
supportDictk

 Identify the support of itemset {'eggs','mineral water','spaghetti'} [ Round the value upto three decimal points]

- 0.022
- 0.034
- 0.014
- 0.052

Identify the number of transaction where items eggs, mineral water and spaghetti are bought together

- 165
- 255
- 392
- 107

In [11]:
# Find the support of the itemset {'eggs', 'mineral water', 'spaghetti'}



In [42]:
# Identify the total number of Transactions in the dataset



Identify the number of frequent itemsets involving both chocolate and mineral water if minSupport is defined as 0.001

- 244
- 502
- 343
- 156

In [20]:
# Generate the frequent itemsets when minSupport = 0.001

freqItemsets, supportDict = apriori(dataSet, minSupport = 0.001)



In [40]:
# Write a code to find the number of frequent itemset containing both chocolate and mineral water
#len(freqItemsets[1]) #20162
#frozenset({'body spray', 'honey'}) in freqItemsets[1]
#frozenset({'low fat yogurt', 'milk'}).issubset(freqItemsets[2][0])
#freqItemsets[3]
#freqItemsets[5]
#range(2, 5)


s = 1
for i in range(2, 5):
    for item in freqItemsets[i]: 
        if frozenset({'mineral water', 'chocolate'}).issubset(item):
            s += 1

s

244

### PART B: Rule Generation

#### 5. The ‘calcConf’ function

In [57]:
def calcConf(freqSet, H, supportDict, bigRuleList, minConf, minLift):

    prunedH = []
    
    for conseq in H:
        
        conf = supportDict[freqSet]/supportDict[freqSet - conseq] # calculate confidence
        lift = conf / supportDict[conseq]
        if conf >= minConf and lift > minLift:
            bigRuleList.append((freqSet-conseq, conseq, conf))
            print(freqSet-conseq, '--->', conseq, 'confidence = ', conf, 'lift = ', lift)
            prunedH.append(conseq)
#             
    return prunedH

#### 6. The rulesFromConseq function


In [54]:
def rulesFromConseq(freqSet, H, supportDict, bigRuleList, minConf, minLift):

    m = len(H[0]) # Order of the consequent while generating the rules
         
    H = calcConf(freqSet, H, supportDict, bigRuleList, minConf, minLift)
    if len(H)>1: # For len(H)<=1, you cannot generate higher order cadnidates
        
        # creating higher order candidates
        Hmp1 = itemSetGenerator(H, m+1) 
        
        if Hmp1 == []: # This will happen if higher order consequent itemsets are not possible
            # Hmp1 will be an empty list if the itemsets in H don't satisfy the condition for merging
            return 0
        
        if (len(Hmp1[0]) < len(freqSet)):
            # Generate rules while the order of the itemsets in Hmp1 is less than the number of items in the frequent itemset
            rulesFromConseq(freqSet, Hmp1, supportDict, bigRuleList, minConf, minLift)

#### 7. The generateRules function


In [55]:
def generateRules(freqItemsets, supportDict, minConf, minLift):  #supportDict is a dictionary coming from scanData
    bigRuleList = []
    for i in range(1, len(freqItemsets)): # Only get the sets with two or more items
        for freqSet in freqItemsets[i]:
            H1 = [frozenset([item]) for item in freqSet]  
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportDict, bigRuleList, minConf, minLift)
            else:
                calcConf(freqSet, H1, supportDict, bigRuleList, minConf, minLift)
    return bigRuleList 

In order to  identify interesting rules from the dataset, you use constraints on metrics such as support, confidence, lift. Implement the measure lift and find out the rules with minSupport=0.05, minConf = 0.2 and lift > 1.2

Ref Link - https://en.wikipedia.org/wiki/Lift_(data_mining)

- 4
- 6
- 18
- 0

In [59]:
def frquentItemsetGeneration(dataSet, minSupport):

    itemset1 = createItem(dataSet) # Creating frozenset of items
    
    
    # Generating all the frequent 1-itemsets and the support of those items
    freqItemset1, supportDict = scanData(dataSet, itemset1, minSupport)
    
    freqItemsets = [freqItemset1]
    k = 2 
    
    while (len(freqItemsets[k-2]) > 0): # Incrementing k until we no longer find any kth order itemsets
        
        itemsetk = itemSetGenerator(freqItemsets[k-2], k) # Generating itemsets of order k
        
        # Generating the frequent itemset for the kth order and support for each of these itemsets
        freqItemsetk, supportDictk = scanData(dataSet, itemsetk, minSupport) 
        
        supportDict.update(supportDictk)
        freqItemsets.append(freqItemsetk)
        
        k += 1
    return freqItemsets, supportDict

In [67]:
freqItemsets, supportDict = frquentItemsetGeneration(dataSet, minSupport = 0.01)

In [68]:
# In the code for generating rules, implement the computation of Lift
final_rules = generateRules(freqItemsets, supportDict, 0.01, -9999)


frozenset({'milk'}) ---> frozenset({'chicken'}) confidence =  0.11419753086419754 lift =  1.9035459533607684
frozenset({'chicken'}) ---> frozenset({'milk'}) confidence =  0.24666666666666667 lift =  1.9035459533607684
frozenset({'cake'}) ---> frozenset({'pancakes'}) confidence =  0.14638157894736845 lift =  1.5399834834280655
frozenset({'pancakes'}) ---> frozenset({'cake'}) confidence =  0.12482468443197757 lift =  1.5399834834280655
frozenset({'cake'}) ---> frozenset({'frozen vegetables'}) confidence =  0.12664473684210528 lift =  1.3286184210526317
frozenset({'frozen vegetables'}) ---> frozenset({'cake'}) confidence =  0.1076923076923077 lift =  1.3286184210526317
frozenset({'tomatoes'}) ---> frozenset({'green tea'}) confidence =  0.1793372319688109 lift =  1.3574254056488906
frozenset({'green tea'}) ---> frozenset({'tomatoes'}) confidence =  0.09283551967709386 lift =  1.3574254056488908
frozenset({'french fries'}) ---> frozenset({'whole wheat rice'}) confidence =  0.061622464898595

frozenset({'milk'}) ---> frozenset({'spaghetti'}) confidence =  0.27366255144032925 lift =  1.5717785592296398
frozenset({'spaghetti'}) ---> frozenset({'milk'}) confidence =  0.20367534456355285 lift =  1.57177855922964
frozenset({'soup'}) ---> frozenset({'spaghetti'}) confidence =  0.2823218997361478 lift =  1.6215134532318871
frozenset({'spaghetti'}) ---> frozenset({'soup'}) confidence =  0.08192955589586524 lift =  1.6215134532318871
frozenset({'pancakes'}) ---> frozenset({'green tea'}) confidence =  0.17251051893408134 lift =  1.3057531811549385
frozenset({'green tea'}) ---> frozenset({'pancakes'}) confidence =  0.124117053481332 lift =  1.3057531811549388
frozenset({'mineral water'}) ---> frozenset({'pancakes'}) confidence =  0.14149888143176736 lift =  1.4886158620191963
frozenset({'pancakes'}) ---> frozenset({'mineral water'}) confidence =  0.3548387096774194 lift =  1.4886158620191963
frozenset({'spaghetti'}) ---> frozenset({'pancakes'}) confidence =  0.1447166921898928 lift = 

In [69]:
# Generate the rules when minSupport = 0.05, minConfidence = 0.2 and lift >1.2 

len(final_rules)

432

In [74]:
# count the number of rules
[item for item in final_rules if item[0] == frozenset({'chocolate'}) and item[1] == frozenset({'eggs', 'spaghetti'})]


[(frozenset({'chocolate'}),
  frozenset({'eggs', 'spaghetti'}),
  0.06427990235964198)]

Imagine that for some threshold support value, the itemset  {'eggs','chocolate','spaghetti'} is found to be a frequent itemset. Now for the rule {'chocolate'} --> {'spaghetti', 'eggs'} to be valid what can be the max value of minConfidence? [More than one answer type question]

- 0.08
- 0.04
- 0.06
- 0.01

In [44]:
#Itendify the support of {'eggs', 'mineral water', 'chocolate'}




In [31]:
# What will be the confidence of the rule {'chocolate'} --> {'spaghetti', 'eggs'}


