In [3]:
import pandas as pd
import itertools
import requests

In [4]:
# Minimum relative support (fraction of transactions that must contain an
# itemset); the bakery runs further down use this same 0.03 value.
MIN_SUP = 0.03

In [5]:
# URLs of the item catalogues and basket files that the cells below download.
bakery_items_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/goods.csv"
bakery_dataset5000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/5000/5000-out1.csv"
bakery_dataset20000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/20000/20000-out1.csv"
bakery_dataset75000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/75000/75000-out1.csv"
authors_list_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/authorlist.psv"
bingo_dataset_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/bingoBaskets.csv"
# Local file (relative to the working directory) read with open() for the
# small test run.
test_dataset = "out1.csv"

In [6]:
def from_url(url):
    """Download `url` and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    # The timeout keeps an unresponsive server from hanging the notebook, and
    # raise_for_status() stops an HTML error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text

def parse_csv(csv_raw):
    """Parse raw basket CSV text into a DataFrame.

    Every non-blank line has the form ``id,item,item,...`` where each field
    is an integer. The first field becomes the row's index label and the
    remaining fields become the row's values. Rows shorter than the longest
    row are right-padded with -1 so the frame is rectangular.

    Blank lines are skipped and empty input yields an empty DataFrame.

    Raises:
        ValueError: if a field of a non-blank line is not an integer.
    """
    index = []
    rows = []
    for line in csv_raw.split("\n"):
        line = line.strip()
        if not line:
            # Tolerate trailing/intermediate blank lines instead of failing
            # on int('').
            continue
        parts = [int(field) for field in line.split(",")]
        index.append(parts[0])
        rows.append(parts[1:])

    # Pad every row to the width of the widest one with the -1 sentinel.
    max_len = max(map(len, rows), default=0)
    for row in rows:
        row.extend([-1] * (max_len - len(row)))

    return pd.DataFrame(rows, index=index)

In [7]:
def parse_psv(psv_raw):
    """Parse pipe-separated ``id | name`` text into a one-column DataFrame.

    For every non-blank line the first field (an integer) becomes the index
    label and the second field becomes the value of the "Name" column;
    surrounding whitespace of both fields is stripped. Any further fields on
    the line are ignored.

    Blank lines are skipped and empty input yields an empty DataFrame.

    Raises:
        ValueError: if the first field is not an integer.
        IndexError: if a non-blank line contains no "|" separator.
    """
    index = []
    names = []
    for line in psv_raw.split("\n"):
        if not line.strip():
            # Skip blank lines rather than failing on int('').
            continue
        parts = line.split("|")
        index.append(int(parts[0].strip()))
        names.append(parts[1].strip())
    return pd.DataFrame(names, columns=["Name"], index=index)

In [8]:
# Item catalogue for the bakery data, read straight from the goods CSV URL.
bakery_items_table = pd.read_csv(bakery_items_url)
# Universe of item ids; passed to apriori() as its item set.
bakery_items = set(bakery_items_table["Id"])

# Build a display name from Flavor + Food. The [1:-1] slice drops the first
# and last character of each field (presumably surrounding quotes -- TODO
# confirm against the file).
bakery_items_table["Name"] = bakery_items_table["Flavor"].str[1:-1] + " " + bakery_items_table["Food"].str[1:-1]
# Keep only the Name column. NOTE(review): read_csv was called without
# index_col, so later item_table.loc[item_id] lookups assume each Id equals
# its row position -- confirm.
bakery_items_table = bakery_items_table.filter(["Name"])

# Author catalogue for the bingo data; its index holds the author ids.
bingo_items_table = parse_psv(from_url(authors_list_url))
bingo_items = set(bingo_items_table.index)

In [9]:
def is_valid_candidate(F, u):
    """Return True when every subset of `u` obtained by removing exactly one
    element is a member of `F`.

    `F` is a collection of frequent itemsets and `u` a candidate itemset of
    size k, so this checks that all (k-1)-sized subsets of `u` are in `F`.
    """
    return all((u - {member}) in F for member in u)

def candidate_gen(F, k):
    """Build the (k+1)-sized candidate itemsets from the k-sized sets in `F`.

    Every unordered pair of k-sized itemsets is joined; a join is kept when
    it has exactly k+1 elements and passes is_valid_candidate against `F`.
    Returns a set of frozensets.
    """
    same_size = [itemset for itemset in F if len(itemset) == k]
    wanted_size = k + 1

    candidates = set()
    for left, right in itertools.combinations(same_size, 2):
        merged = left.union(right)
        if len(merged) != wanted_size:
            continue
        if is_valid_candidate(F, merged):
            candidates.add(frozenset(merged))
    return candidates

def check_subset(row, s):
    """Return True when every element of the set `s` occurs among the values
    of `row`."""
    row_items = set(row)
    return s.issubset(row_items)

In [10]:
def support(T, iset):
    """Count the rows of `T` whose values contain every item of `iset`.

    Args:
        T: DataFrame with one transaction per row.
        iset: set (or frozenset) of items to look for.

    Returns:
        The absolute number of matching rows (not a fraction).
    """
    items = set(iset)
    # Walk the rows positionally. Looking rows up with T.loc[label] returns a
    # whole DataFrame when an index label repeats, so set(...) would then
    # yield column labels instead of the row's items and the count would be
    # wrong. It is also far slower than a single array conversion.
    return sum(1 for row in T.to_numpy().tolist() if items.issubset(row))

In [11]:
def apriori(T, I, minSup):
    """Mine frequent itemsets from `T` level by level (Apriori).

    Args:
        T: DataFrame with one transaction per row; each cell is treated as an
            item (presumably item ids padded with -1 by parse_csv -- confirm
            for other sources).
        I: iterable of the single items to consider.
        minSup: minimum relative support (row count / number of rows),
            inclusive.

    Returns:
        A tuple ``(itemsets, supports)``:
        * ``itemsets`` -- the frequent itemsets (frozensets) that are not
          contained in any frequent itemset one size larger.
        * ``supports`` -- dict mapping every itemset that was counted (all
          single items plus every generated candidate) to its absolute row
          count.
    """
    n_rows = len(T.index)

    if n_rows == 0:
        # Relative support is undefined without transactions; report zero
        # counts instead of dividing by zero.
        return (set(), {frozenset({i}): 0 for i in I})

    supports = {}

    # Level 1: frequent single items.
    F_cur = set()
    for i in I:
        single = frozenset({i})
        supp = support(T, single)
        supports[single] = supp
        if supp / n_rows >= minSup:
            F_cur.add(single)

    # Build each row's item set once; it is reused for every candidate at
    # every level. Rows are taken positionally so repeated index labels
    # cannot merge transactions.
    row_sets = [set(row) for row in T.to_numpy().tolist()]

    maximal = set()
    k = 1  # size of the itemsets currently held in F_cur
    while F_cur:
        candidates = candidate_gen(F_cur, k)

        counts = dict.fromkeys(candidates, 0)
        for row_set in row_sets:
            for c in candidates:
                if c.issubset(row_set):
                    counts[c] += 1
        supports.update(counts)

        F_next = {c for c, count in counts.items() if count / n_rows >= minSup}

        # Keep an itemset only if no frequent itemset of the next size
        # contains it.
        maximal.update(
            s for s in F_cur if not any(s.issubset(bigger) for bigger in F_next)
        )

        F_cur = F_next
        k += 1

    return (maximal, supports)

In [12]:
def confidence(T, supports, rule):
    """Confidence of `rule` = support(antecedent + consequent) / support(antecedent).

    `rule` is indexed as ``(antecedent_set, consequent_item)`` and `supports`
    maps itemsets to counts. `T` is accepted but not used.
    """
    antecedent = rule[0]
    consequent = rule[1]
    whole = antecedent.union({consequent})
    return supports[whole] / supports[antecedent]

def association_rules(T, iset, supports, minConf):
    """List the single-consequent rules of `iset` whose confidence is at
    least `minConf`.

    Each rule is a tuple ``(iset - {item}, item)``. Itemsets with fewer than
    two elements produce no rules.
    """
    # A one-element itemset cannot be split into antecedent and consequent.
    if len(iset) < 2:
        return []

    possible_rules = ((iset - {item}, item) for item in iset)
    return [
        candidate
        for candidate in possible_rules
        if confidence(T, supports, candidate) >= minConf
    ]

In [56]:
def format_rule(rule, item_table):
    """Render `rule` as "<set of antecedent names> -> <consequent name>".

    Names are looked up by label in the "Name" column of `item_table`.
    """
    antecedent_names = {item_table.loc[item]["Name"] for item in rule[0]}
    consequent_name = str(item_table.loc[rule[1]]["Name"])
    return "{} -> {}".format(antecedent_names, consequent_name)

def format_iset(iset, item_table):
    """Render `iset` as the string form of the set of its items' names,
    looked up by label in the "Name" column of `item_table`."""
    names = {item_table.loc[item]["Name"] for item in iset}
    return str(names)

class Results:
    """Holds the outcome of one apriori run and prints it in readable form."""

    def __init__(self, T, isets, supports, minConf):
        """Store the run's data.

        Args:
            T: DataFrame of transactions the itemsets were mined from.
            isets: iterable of itemsets (frozensets) to report.
            supports: dict mapping itemsets to absolute row counts.
            minConf: minimum confidence a rule needs to be reported.
        """
        self.isets = isets
        self.T = T
        self.supports = supports
        self.minConf = minConf
        # Filled in by find_association_rules(); defined here so the
        # attribute always exists.
        self.rules = []

    def find_association_rules(self):
        """Compute the rules of every stored itemset and keep them in
        ``self.rules``."""
        rules = []
        # Use this instance's itemsets, not whatever a notebook-level
        # `isets` variable happens to hold when the method runs.
        for iset in self.isets:
            rules += association_rules(self.T, iset, self.supports, self.minConf)

        self.rules = rules

    def print_output(self, item_table):
        """Print every itemset with its relative support, then every rule
        with its relative support and confidence.

        `item_table` supplies the display names (looked up by label in its
        "Name" column).
        """
        self.find_association_rules()

        n_rows = float(len(self.T.index))

        print("Item Sets: \n")
        for iset in self.isets:
            # Read from self.supports so the numbers always belong to the
            # run this object was built from.
            print("itemset={} [supp={}]".format(format_iset(iset, item_table),
                                                self.supports[iset] / n_rows))

        print("Assocation Rules: \n")
        for rule in self.rules:
            rule_iset = rule[0].union({rule[1]})
            print("{} [supp={} conf={}]".format(format_rule(rule, item_table),
                                                float(self.supports[rule_iset]) / n_rows,
                                                confidence(self.T, self.supports, rule)))

# Run on the local test file. The context manager closes the file handle as
# soon as its contents have been read.
with open(test_dataset) as test_file:
    df_test = parse_csv(test_file.read())
(isets, supports) = apriori(df_test, bakery_items, 0.1)
Results(df_test, isets, supports, 0.75).print_output(bakery_items_table)

Item Sets: 

itemset={'Blackberry Tart', 'Apple Danish'} [supp=0.139]
itemset={'Lemon Cake', 'Single Espresso'} [supp=0.127]
itemset={'Apple Tart', 'Berry Tart', 'Blueberry Tart'} [supp=0.257]
itemset={'Gongolais Cookie', 'Napoleon Cake'} [supp=0.181]
Assocation Rules: 

{'Blackberry Tart'} -> Apple Danish [supp=0.139 conf=0.7513513513513513]
{'Apple Danish'} -> Blackberry Tart [supp=0.139 conf=0.7988505747126436]
{'Lemon Cake'} -> Single Espresso [supp=0.127 conf=0.8141025641025641]
{'Single Espresso'} -> Lemon Cake [supp=0.127 conf=0.7839506172839507]
{'Apple Tart', 'Berry Tart'} -> Blueberry Tart [supp=0.257 conf=0.9589552238805971]
{'Berry Tart', 'Blueberry Tart'} -> Apple Tart [supp=0.257 conf=0.9961240310077519]
{'Apple Tart', 'Blueberry Tart'} -> Berry Tart [supp=0.257 conf=0.9922779922779923]
{'Gongolais Cookie'} -> Napoleon Cake [supp=0.181 conf=0.8418604651162791]
{'Napoleon Cake'} -> Gongolais Cookie [supp=0.181 conf=0.8044444444444444]


In [52]:
# Mine the file behind bakery_dataset5000_url with the notebook-wide MIN_SUP
# threshold instead of repeating the literal 0.03.
df_bakery = parse_csv(from_url(bakery_dataset5000_url))
(isets, supports) = apriori(df_bakery, bakery_items, MIN_SUP)

In [53]:
# Report the itemsets mined from df_bakery and the rules that reach a
# confidence of at least 0.9.
r = Results(df_bakery, isets, supports, 0.9)
r.print_output(bakery_items_table)

Item Sets: 

itemset={'Lemon Lemonade'} [supp=0.0648]
itemset={'Casino Cake', 'Chocolate Cake', 'Chocolate Coffee'} [supp=0.0312]
itemset={'Green Tea'} [supp=0.062]
itemset={'Almond Tart'} [supp=0.0386]
itemset={'Chocolate Meringue'} [supp=0.0452]
itemset={'Almond Croissant'} [supp=0.0456]
itemset={'Chocolate Tart', 'Vanilla Frappuccino'} [supp=0.0348]
itemset={'Napoleon Cake', 'Strawberry Cake'} [supp=0.0422]
itemset={'Apricot Danish', 'Cherry Tart', 'Opera Cake'} [supp=0.0408]
itemset={'Raspberry Cookie'} [supp=0.064]
itemset={'Blueberry Danish'} [supp=0.04]
itemset={'Lemon Cookie'} [supp=0.0642]
itemset={'Apple Tart', 'Apple Croissant'} [supp=0.0316]
itemset={'Vanilla Eclair'} [supp=0.046]
itemset={'Lemon Tart', 'Lemon Cake'} [supp=0.0336]
itemset={'Tuile Cookie', 'Marzipan Cookie'} [supp=0.0496]
itemset={'Apricot Tart'} [supp=0.0422]
itemset={'Raspberry Lemonade'} [supp=0.0678]
itemset={'Almond Twist', 'Hot Coffee', 'Apple Pie', 'Coffee Eclair'} [supp=0.0308]
itemset={'Cherry Soda'

In [54]:
# Mine the file behind bakery_dataset20000_url with the notebook-wide MIN_SUP
# threshold instead of repeating the literal 0.03.
df_bakery_20000 = parse_csv(from_url(bakery_dataset20000_url))
(isets, supports) = apriori(df_bakery_20000, bakery_items, MIN_SUP)

In [55]:
# Report the itemsets mined from df_bakery_20000 and the rules that reach a
# confidence of at least 0.9.
Results(df_bakery_20000, isets, supports, 0.9).print_output(bakery_items_table)

Item Sets: 

itemset={'Lemon Lemonade'} [supp=0.06655]
itemset={'Casino Cake', 'Chocolate Cake', 'Chocolate Coffee'} [supp=0.0339]
itemset={'Hot Coffee', 'Coffee Eclair'} [supp=0.0317]
itemset={'Green Tea'} [supp=0.06215]
itemset={'Almond Tart'} [supp=0.04055]
itemset={'Chocolate Meringue'} [supp=0.0445]
itemset={'Apple Danish'} [supp=0.06755]
itemset={'Almond Croissant'} [supp=0.04205]
itemset={'Chocolate Tart', 'Vanilla Frappuccino'} [supp=0.03675]
itemset={'Napoleon Cake', 'Strawberry Cake'} [supp=0.04455]
itemset={'Apricot Danish', 'Cherry Tart', 'Opera Cake'} [supp=0.041]
itemset={'Raspberry Cookie'} [supp=0.06945]
itemset={'Blueberry Danish'} [supp=0.04115]
itemset={'Lemon Cookie'} [supp=0.06825]
itemset={'Hot Coffee', 'Apple Pie'} [supp=0.03085]
itemset={'Vanilla Eclair'} [supp=0.0427]
itemset={'Lemon Tart', 'Lemon Cake'} [supp=0.037]
itemset={'Tuile Cookie', 'Marzipan Cookie'} [supp=0.04855]
itemset={'Apricot Tart'} [supp=0.04275]
itemset={'Apple Croissant'} [supp=0.071]
itemse

In [None]:
# Reference rules for the test dataset, written with raw numbers (presumably
# item ids -- confirm). The supp/conf values match the rules printed for the
# out1.csv run earlier in this notebook.
"""
Test Dataset
15 -> 36 [supp=0.139 conf=0.7513513513513513]
36 -> 15 [supp=0.139 conf=0.7988505747126436]
1 -> 49 [supp=0.127 conf=0.8141025641025641]
49 -> 1 [supp=0.127 conf=0.7839506172839507]
12,14 -> 16 [supp=0.257 conf=0.9589552238805971]
16,14 -> 12 [supp=0.257 conf=0.9961240310077519]
16,12 -> 14 [supp=0.257 conf=0.9922779922779923]
22 -> 9 [supp=0.181 conf=0.8418604651162791]
9 -> 22 [supp=0.181 conf=0.8044444444444444]
"""

In [50]:
# Mine the file behind bakery_dataset75000_url with the notebook-wide MIN_SUP
# threshold instead of repeating the literal 0.03.
df_bakery_75000 = parse_csv(from_url(bakery_dataset75000_url))
(isets, supports) = apriori(df_bakery_75000, bakery_items, MIN_SUP)

In [51]:
# Report the itemsets mined from df_bakery_75000 and the rules that reach a
# confidence of at least 0.9.
Results(df_bakery_75000, isets, supports, 0.9).print_output(bakery_items_table)

Item Sets: 

itemset={'Lemon Lemonade'} [supp=0.06824]
itemset={'Casino Cake', 'Chocolate Cake', 'Chocolate Coffee'} [supp=0.03338666666666667]
itemset={'Hot Coffee', 'Coffee Eclair'} [supp=0.03156]
itemset={'Single Espresso'} [supp=0.06797333333333333]
itemset={'Green Tea'} [supp=0.06246666666666666]
itemset={'Almond Tart'} [supp=0.04204]
itemset={'Chocolate Meringue'} [supp=0.041933333333333336]
itemset={'Apple Danish'} [supp=0.06769333333333333]
itemset={'Almond Croissant'} [supp=0.04273333333333333]
itemset={'Chocolate Tart', 'Vanilla Frappuccino'} [supp=0.03596]
itemset={'Napoleon Cake', 'Strawberry Cake'} [supp=0.043146666666666667]
itemset={'Apricot Danish', 'Cherry Tart', 'Opera Cake'} [supp=0.041106666666666666]
itemset={'Raspberry Cookie'} [supp=0.06764]
itemset={'Blueberry Danish'} [supp=0.04409333333333333]
itemset={'Lemon Cookie'} [supp=0.06801333333333333]
itemset={'Hot Coffee', 'Apple Pie'} [supp=0.031026666666666668]
itemset={'Vanilla Eclair'} [supp=0.04252]
itemset={'L

In [86]:
# Mine the bingo baskets over the author ids in bingo_items, using a minimum
# relative support of 0.05.
df_bingo = parse_csv(from_url(bingo_dataset_url))
(isets, supports) = apriori(df_bingo, bingo_items, 0.05)

In [87]:
# Report the itemsets mined from df_bingo and the rules that reach a
# confidence of at least 0.8, naming items via the author table.
Results(df_bingo, isets, supports, 0.8).print_output(bingo_items_table)

Item Sets: 

itemset={'Herbert, Frank', 'Abercrombie, Joe'} [supp=0.06172839506172839]
itemset={'Sanderson, Brandon', 'Herbert, Frank'} [supp=0.07407407407407407]
itemset={'Bancroft, Josiah', 'Sanderson, Brandon', 'Sullivan, Michael J.'} [supp=0.06995884773662552]
itemset={'Brennan, Marie', 'Aaron, Rachel / Bach, Rachel'} [supp=0.05761316872427984]
itemset={'Sanderson, Brandon', 'Lawrence, Mark', 'Abercrombie, Joe'} [supp=0.06172839506172839]
itemset={'Bancroft, Josiah', 'Anders, Charlie Jane'} [supp=0.05761316872427984]
itemset={'Pratchett, Terry', 'Sanderson, Brandon', 'Sullivan, Michael J.'} [supp=0.053497942386831275]
itemset={'Bancroft, Josiah', 'Gaiman, Neil', 'Sullivan, Michael J.'} [supp=0.05761316872427984]
itemset={'Ball, Krista D. / Ball, K.', 'Eames, Nicholas'} [supp=0.053497942386831275]
itemset={'Bancroft, Josiah', 'Mieville, China', 'Eames, Nicholas'} [supp=0.053497942386831275]
itemset={'Olson, Quenby'} [supp=0.05761316872427984]
itemset={'Bancroft, Josiah', 'Abercrombi