<a href="https://colab.research.google.com/github/nsubbaian/FrequentistML/blob/master/Project7/FML_Assignment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Market Basket Assignment**

Select a dataset of interest to you and perform a market basket analysis, including finding frequent itemsets and mining association rules.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset from https://www.kaggle.com/irfanasrullah/groceries
groceries = pd.read_csv('https://raw.githubusercontent.com/nsubbaian/FrequentistML/master/Project7/groceries.csv', names=['item' + str(i) for i in range(12)])

# Get a unique list of all the possible grocery items
items = []
for col in groceries.columns: 
  items.extend(groceries[col].unique())
items = set(items)

# Create a mapping of grocery items to an integer ID and vice versa
d = {}
d1= {}
key = 0
for i in items:
  key +=1
  d[i] = key
  d1[key] = i

# Apply mapping to dataset
groceries = groceries.applymap(lambda s: d.get(s) if s in d else s)

# Create list of transactions from dataset
transactions = []
for i in range((groceries.shape[0])):
  transaction = [x for x in list(groceries.iloc[i, :]) if x != 1]
  transactions.append(transaction) 

# print(transactions)
print("Number of Transactions", len(transactions))

Number of Transactions 9836


In [None]:
#---- A Priori algorithm and supporting functions
# https://github.com/pbharrin/machinelearninginaction3x/blob/master/Ch11/apriori.py

def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
                
    C1.sort()
    return list(map(frozenset, C1))#use frozen set so we
                            #can use it as a key in a dict    

def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt: ssCnt[can]=1
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0,key)
        supportData[key] = support
    return retList, supportData

def aprioriGen(Lk, k): #creates Ck
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

def apriori(dataSet, minSupport):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    bigRuleList = []
    for i in range(1, len(L)):#only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList         

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] #create new list to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf: 
            # print(freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
            
def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
          antecedent = item
        for item in ruleTup[1]:
          consequent = item
        print("confidence: %f" % ruleTup[2]," ", itemMeaning[antecedent], "-------->", itemMeaning[consequent] )
        print()       #print a blank line

In [None]:
 # Generate a set of frequent itemsets with a specified minSupport
 L,suppData=apriori(transactions, .04)

# L contains some lists of frequent itemsets that meet the specified minSupport
# suppData is a dictionary with the support values of our itemsets
print("Length of L:", len(L))
print("Frequent Itemsets:", L)


 rules=generateRules(L,suppData, minConf=0.1)
print()
 print("Number of rules:", len(rules))
 pntRules(rules, d1)

Length of L: 3
Frequent Itemsets: [[frozenset({158}), frozenset({128}), frozenset({161}), frozenset({159}), frozenset({176}), frozenset({165}), frozenset({29}), frozenset({120}), frozenset({113}), frozenset({55}), frozenset({50}), frozenset({137}), frozenset({36}), frozenset({2}), frozenset({79}), frozenset({69}), frozenset({88}), frozenset({84}), frozenset({147}), frozenset({97}), frozenset({53}), frozenset({152}), frozenset({6}), frozenset({143}), frozenset({71}), frozenset({3}), frozenset({47}), frozenset({75}), frozenset({40}), frozenset({28}), frozenset({156}), frozenset({80})], [frozenset({40, 71}), frozenset({6, 47}), frozenset({47, 55}), frozenset({71, 55}), frozenset({75, 47}), frozenset({6, 71}), frozenset({40, 47}), frozenset({47, 71})], []]

Number of rules: 16
confidence: 0.222806   other vegetables --------> yogurt

confidence: 0.309715   yogurt --------> other vegetables

confidence: 0.217668   whole milk --------> rolls/buns

confidence: 0.304396   rolls/buns --------> 