<a href="https://colab.research.google.com/github/nsubbaian/FrequentistML/blob/master/Project7/FML_Assignment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Market Basket Assignment**

Select a dataset of interest to you and perform a market basket analysis, including finding frequent itemsets and mining association rules.


In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset from https://www.kaggle.com/irfanasrullah/groceries
groceries = pd.read_csv('https://raw.githubusercontent.com/nsubbaian/FrequentistML/master/Project7/groceries.csv', names=['item' + str(i) for i in range(12)])

# Get a unique list of all the possible grocery items
items = []
for col in groceries.columns: 
  items.extend(groceries[col].unique())
items = set(items)

# Create a mapping of grocery items to an integer ID and vice versa
d = {}
d1= {}
key = 0
for i in items:
  key +=1
  d[i] = key
  d1[key] = i

# Apply mapping to dataset
groceries = groceries.applymap(lambda s: d.get(s) if s in d else s)

# Create list of transactions from dataset
transactions = []
for i in range((groceries.shape[0])):
  transaction = [x for x in list(groceries.iloc[i, :]) if x != 1]
  transactions.append(transaction) 

print(transactions)

[[27, 51, 8, 34, 136, 106, 179, 98, 146, 164, 89], [123, 99, 140, 141], [159, 156, 20], [62], [11, 156, 29, 178], [22, 62, 32, 31], [62, 23, 156, 125, 111], [124], [22, 28, 124, 60, 72], [50], [62, 130], [159, 22, 14, 180, 79], [123, 159, 62, 23, 94, 156, 150, 180, 151], [68], [122, 124, 93], [134, 159], [23, 84, 148, 115], [148], [171], [79], [168], [22], [119, 153], [62], [159, 29, 127, 52, 115], [159, 37, 22, 117, 124, 150, 87, 138, 100, 101, 152], [180, 59], [156], [104, 124, 93, 79], [22], [16, 93, 148, 59, 115, 81], [156, 92, 180, 168], [88, 22, 124, 77, 180, 165, 120], [37, 22, 62, 92, 84], [25, 55, 22, 62, 49, 54, 93, 111], [68, 58, 52], [153, 93], [148], [59], [37, 22, 62, 36], [123, 167, 115], [104, 124, 93, 59, 168, 81], [159, 37, 62, 156, 26, 16, 153, 84, 130, 20, 93, 100], [55, 156], [59], [119, 156, 29, 17, 124, 180, 93, 115], [20], [153, 180], [124], [147], [37, 22, 23, 94, 49, 28, 154, 124, 180, 31], [104, 124, 132, 115], [59], [131, 58, 22, 62], [118, 159, 22, 94, 26, 

In [105]:
#---- A Priori algorithm and supporting functions
# https://github.com/pbharrin/machinelearninginaction3x/blob/master/Ch11/apriori.py

def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
                
    C1.sort()
    return list(map(frozenset, C1))#use frozen set so we
                            #can use it as a key in a dict    

def scanD(D, Ck, minSupport):
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt: ssCnt[can]=1
                else: ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key]/numItems
        if support >= minSupport:
            retList.insert(0,key)
        supportData[key] = support
    return retList, supportData

def aprioriGen(Lk, k): #creates Ck
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
            L1.sort(); L2.sort()
            if L1==L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) #set union
    return retList

def apriori(dataSet, minSupport):
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriGen(L[k-2], k)
        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD
    bigRuleList = []
    for i in range(1, len(L)):#only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList         

def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] #create new list to return
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf: 
            # print(freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
            
def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
          antecedent = item
        for item in ruleTup[1]:
          consequent = item
        print(itemMeaning[antecedent], "-------->", itemMeaning[consequent])
        print("confidence: %f" % ruleTup[2])
        print()       #print a blank line

In [111]:
 
 L,suppData=apriori(transactions, 0.05)

# L contains some lists of frequent itemsets that met a minimum support of 0.5. 
print(len(L))
print(L)

 rules=generateRules(L,suppData, minConf=0.1)
 print(len(rules))
 pntRules(rules, d1)

3
[[frozenset({26}), frozenset({49}), frozenset({25}), frozenset({81}), frozenset({16}), frozenset({104}), frozenset({59}), frozenset({37}), frozenset({153}), frozenset({148}), frozenset({115}), frozenset({122}), frozenset({93}), frozenset({68}), frozenset({94}), frozenset({180}), frozenset({60}), frozenset({124}), frozenset({23}), frozenset({22}), frozenset({11}), frozenset({62}), frozenset({159}), frozenset({156}), frozenset({20}), frozenset({140}), frozenset({123})], [frozenset({124, 62}), frozenset({156, 62}), frozenset({62, 22})], []]
6
whole milk --------> rolls/buns
confidence: 0.217668

rolls/buns --------> whole milk
confidence: 0.304396

whole milk --------> yogurt
confidence: 0.218066

yogurt --------> whole milk
confidence: 0.400292

other vegetables --------> whole milk
confidence: 0.386758

whole milk --------> other vegetables
confidence: 0.292877

