In [1]:
import pandas as pd
import itertools
import requests

In [2]:
# Minimum-support threshold constant.
# NOTE(review): no cell visible in this notebook reads MIN_SUP; the apriori()
# calls below pass literal thresholds (0.1, 0.02, 0.04) instead. Presumably this
# is a relative support (fraction of transactions) -- confirm or remove.
MIN_SUP = 0.03

In [3]:
# Item catalogue for the bakery data; read below with pd.read_csv (column "Id").
bakery_items_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/goods.csv"
# Bakery transaction files; each one is downloaded with from_url() and parsed with parse_csv().
bakery_dataset5000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/5000/5000-out1.csv"
bakery_dataset20000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/20000/20000-out1.csv"
bakery_dataset75000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/75000/75000-out1.csv"
# Pipe-separated author list (parsed with parse_psv) and the matching basket file.
authors_list_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/authorlist.psv"
bingo_dataset_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/bingoBaskets.csv"
# Local file (relative path) opened directly with open() in the test cell below.
test_dataset = "out1.csv"

In [4]:
def from_url(url, timeout=30):
    """Download `url` and return the response body decoded as text.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server to connect / send data before giving
        up. Without a timeout a stalled server would hang the notebook.

    Returns
    -------
    str
        The body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here keeps an
        HTML error page from being handed to the CSV/PSV parsers.
    requests.Timeout
        If the server does not respond within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def parse_csv(csv_raw):
    """Parse ragged, integer-only CSV text into a rectangular DataFrame.

    Every non-blank line has the form ``<row id>,<value>,<value>,...``. The
    first field becomes the row's index label; the remaining fields become the
    row's cells. Rows shorter than the longest row are right-padded with -1 so
    that all rows have the same number of columns.

    Blank lines (including a completely empty input) are skipped instead of
    raising ``ValueError`` from ``int('')``.

    Parameters
    ----------
    csv_raw : str
        Raw CSV text.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank input line, indexed by the first field.

    Raises
    ------
    ValueError
        If a field of a non-blank line is not an integer.
    """
    index = []
    rows = []
    for line in csv_raw.splitlines():
        line = line.strip()
        if not line:
            # Tolerate empty input and stray blank lines.
            continue
        parts = [int(field) for field in line.split(",")]
        index.append(parts[0])
        rows.append(parts[1:])

    # Pad every row to the width of the longest one.
    max_len = max(map(len, rows), default=0)
    padded = [row + [-1] * (max_len - len(row)) for row in rows]

    return pd.DataFrame(padded, index=index)

In [5]:
def parse_psv(psv_raw):
    """Build a one-column ("Name") DataFrame from pipe-separated text.

    Each line is split on "|": the first field, stripped and converted to int,
    becomes the index label; the second field, stripped, becomes the "Name"
    value. Any further fields on the line are ignored.
    """
    ids = []
    names = []
    for record in psv_raw.strip().split("\n"):
        fields = record.split("|")
        ids.append(int(fields[0].strip()))
        names.append(fields[1].strip())
    return pd.DataFrame(names, index=ids, columns=["Name"])

In [6]:
# Item universes handed to apriori() as its `I` argument.
# Both lines hit the network: pandas reads the goods CSV straight from the URL,
# and the author list is downloaded with from_url() and parsed with parse_psv().
bakery_items = set(pd.read_csv(bakery_items_url)["Id"])  # values of the "Id" column
bingo_items = set(parse_psv(from_url(authors_list_url)).index)  # integer ids from the PSV's first field

In [7]:
# Pruning test for a candidate itemset u against the known frequent
# itemsets F: u is kept only if every subset obtained by dropping
# exactly one element of u is itself a member of F.
def is_valid_candidate(F, u):
    return all((u - {member}) in F for member in u)

# Join step: pair up the size-k itemsets in F, keep each union that has
# exactly k+1 elements and passes is_valid_candidate, and return the
# resulting set of frozensets.
def candidate_gen(F, k):
    level_k = [itemset for itemset in F if len(itemset) == k]

    candidates = set()
    for left, right in itertools.combinations(level_k, 2):
        merged = left.union(right)
        if len(merged) != k + 1:
            # The two itemsets differ in more than one element.
            continue
        if is_valid_candidate(F, merged):
            candidates.add(frozenset(merged))

    return candidates

def check_subset(row, s):
    """Return True when every element of the set `s` occurs among the values of `row`."""
    row_items = set(row)
    return s <= row_items

In [8]:
def support(T, iset):
    """Count the rows of `T` that contain every item of `iset`.

    Rows are walked positionally rather than looked up with ``T.loc[label]``.
    With a non-unique index, ``T.loc[label]`` returns a DataFrame (all rows
    sharing the label), and ``set()`` of a DataFrame yields its *column
    labels*, which silently produced wrong counts.

    Parameters
    ----------
    T : pandas.DataFrame
        One transaction per row; every cell value of a row counts as an item.
    iset : set or frozenset
        Items that must all be present in a row.

    Returns
    -------
    int
        Absolute number of matching rows (not a fraction).
    """
    return sum(1 for row in T.to_numpy().tolist() if iset.issubset(row))

In [9]:
def apriori(T, I, minSup):
    """Mine frequent itemsets from `T` level by level (Apriori).

    Parameters
    ----------
    T : pandas.DataFrame
        One transaction per row; every cell value of a row counts as an item
        of that transaction.
    I : iterable
        Items considered when building the size-1 itemsets.
    minSup : float
        Minimum relative support: an itemset is frequent when
        ``count / number_of_rows >= minSup``.

    Returns
    -------
    tuple
        ``(maximal, supports)`` where ``maximal`` is the set of frequent
        itemsets (frozensets) that are not a subset of any frequent itemset
        one size larger, and ``supports`` maps every itemset that was counted
        (all singletons from `I` plus every generated candidate, frequent or
        not) to its absolute row count.

    Notes
    -----
    * Rows are read positionally, once, into sets. Looking rows up with
      ``T.loc[label]`` miscounts when index labels repeat, and rebuilding
      ``set(row)`` for every candidate is wasted work.
    * A `T` with no rows yields no frequent itemsets instead of raising
      ``ZeroDivisionError``.
    """
    # Materialise each transaction as a set exactly once.
    transactions = [frozenset(row) for row in T.to_numpy().tolist()]
    n_rows = len(transactions)

    def is_frequent(count):
        # Guard the division so an empty dataset simply has nothing frequent.
        return n_rows > 0 and count / n_rows >= minSup

    supports = {}

    # Level 1: count every single item.
    F_cur = set()
    for i in I:
        single = frozenset({i})
        supp = sum(1 for transaction in transactions if i in transaction)
        supports[single] = supp
        if is_frequent(supp):
            F_cur.add(single)

    F = set(F_cur)
    is_maximal = {}
    k = 2  # size of the itemsets generated in the next iteration

    while len(F_cur) > 0:
        # Assume maximal until a frequent superset shows up one level higher.
        for iset in F_cur:
            is_maximal[iset] = True

        candidates = candidate_gen(F_cur, k - 1)
        counts = dict.fromkeys(candidates, 0)
        for transaction in transactions:
            for c in candidates:
                if c.issubset(transaction):
                    counts[c] += 1

        F_next = set()
        for c in candidates:
            supp = counts[c]
            supports[c] = supp
            if is_frequent(supp):
                F_next.add(c)

        # Anything contained in a frequent itemset of the next level is not maximal.
        for s1 in F_cur:
            if any(s1.issubset(s2) for s2 in F_next):
                is_maximal[s1] = False

        F_cur = F_next
        F = F.union(F_cur)
        k += 1

    return ({iset for iset in F if is_maximal[iset]}, supports)

In [14]:
def confidence(T, supports, rule):
    """Confidence of `rule` = support(antecedent plus consequent) / support(antecedent).

    `rule` is indexed as ``(antecedent_itemset, consequent_item)`` and both
    supports are looked up in the `supports` mapping. `T` is accepted but not
    used.
    """
    antecedent = rule[0]
    whole_itemset = antecedent.union({rule[1]})
    joint_count = supports[whole_itemset]
    return joint_count / supports[antecedent]

def association_rules(T, iset, supports, minConf):
    """List the single-consequent rules of `iset` whose confidence is at least `minConf`.

    For each element e of `iset` the rule ``(iset - {e}, e)`` is formed and
    scored with confidence(); rules reaching `minConf` are returned in
    iteration order of `iset`.
    """
    # Itemsets with fewer than two elements cannot yield an interesting rule
    # (a -> a at 100% confidence says nothing), so they produce no rules.
    if len(iset) < 2:
        return []

    possible_rules = ((iset - {consequent}, consequent) for consequent in iset)
    return [
        candidate
        for candidate in possible_rules
        if confidence(T, supports, candidate) >= minConf
    ]

In [19]:
def format_rule(rule):
    """Render a rule as "a,b -> c": comma-joined ``rule[0]`` items, then ``rule[1]``."""
    antecedent_text = ",".join(str(item) for item in rule[0])
    consequent_text = str(rule[1])
    return "{} -> {}".format(antecedent_text, consequent_text)

class Results:
    """Derive and print association rules for one apriori() run.

    Parameters
    ----------
    T : pandas.DataFrame
        Transactions the itemsets were mined from (forwarded to the rule helpers).
    isets : iterable of frozenset
        Itemsets to derive rules from.
    supports : dict
        Maps itemsets (frozensets) to absolute support counts.
    minConf : float
        Minimum confidence a rule must reach to be reported.
    """

    def __init__(self, T, isets, supports, minConf):
        self.isets = isets
        self.T = T
        self.supports = supports
        self.minConf = minConf
        self.rules = []

    def find_association_rules(self):
        """Compute the rules for this instance's itemsets and store them in ``self.rules``."""
        rules = []
        # Use the itemsets given to this instance, not whatever the notebook
        # global `isets` happens to hold after a later apriori() call.
        for iset in self.isets:
            rules += association_rules(self.T, iset, self.supports, self.minConf)

        self.rules = rules

    def print_output(self):
        """Recompute the rules and print one "<rule> [supp=<count> conf=<value>]" line per rule."""
        self.find_association_rules()
        lines = []
        for rule in self.rules:
            rule_iset = rule[0].union({rule[1]})
            # Support comes from this instance's table, not the global `supports`.
            lines.append("{} [supp={} conf={}]\n".format(
                format_rule(rule),
                self.supports[rule_iset],
                confidence(self.T, self.supports, rule),
            ))

        print("".join(lines))

# Read the local sample file inside a context manager so the handle is closed
# as soon as its contents have been parsed.
with open(test_dataset) as test_file:
    df_test = parse_csv(test_file.read())
(isets, supports) = apriori(df_test, bakery_items, 0.1)
Results(df_test, isets, supports, 0.75).print_output()

15 -> 36 [supp=139 conf=0.7513513513513513]
36 -> 15 [supp=139 conf=0.7988505747126436]
1 -> 49 [supp=127 conf=0.8141025641025641]
49 -> 1 [supp=127 conf=0.7839506172839507]
12,14 -> 16 [supp=257 conf=0.9589552238805971]
16,14 -> 12 [supp=257 conf=0.9961240310077519]
16,12 -> 14 [supp=257 conf=0.9922779922779923]
22 -> 9 [supp=181 conf=0.8418604651162791]
9 -> 22 [supp=181 conf=0.8044444444444444]



In [20]:
# Download and parse the bakery "5000" file, then mine it with minSup = 0.02.
# The trailing `; isets` makes the returned itemsets the cell's displayed value.
# Note: this rebinds the notebook-level names `isets` and `supports`.
df_bakery = parse_csv(from_url(bakery_dataset5000_url))
(isets, supports) = apriori(df_bakery, bakery_items, 0.02); isets

{frozenset({0, 2, 46}),
 frozenset({10}),
 frozenset({25}),
 frozenset({30}),
 frozenset({4, 9}),
 frozenset({3, 18, 35}),
 frozenset({12, 31, 36, 48}),
 frozenset({39}),
 frozenset({8}),
 frozenset({27, 28}),
 frozenset({1, 19}),
 frozenset({13}),
 frozenset({7, 11, 37, 45}),
 frozenset({26}),
 frozenset({5, 22}),
 frozenset({17, 29, 47}),
 frozenset({38}),
 frozenset({33, 42}),
 frozenset({34}),
 frozenset({7, 15, 49}),
 frozenset({20}),
 frozenset({14, 44}),
 frozenset({6}),
 frozenset({23, 24, 40, 41, 43}),
 frozenset({21}),
 frozenset({16, 32, 45})}

In [29]:
# Print rules with confidence >= 0.9 for the bakery "5000" data.
# Uses whatever `isets` / `supports` currently hold, so the apriori() cell for
# df_bakery must have been the most recent one executed.
Results(df_bakery, isets, supports, 0.9).print_output()

2,46 -> 0 [supp=156 conf=0.9017341040462428]
0,2 -> 46 [supp=156 conf=0.9122807017543859]
35,3 -> 18 [supp=204 conf=0.9444444444444444]
18,3 -> 35 [supp=204 conf=0.9357798165137615]
48,12,31 -> 36 [supp=114 conf=0.991304347826087]
48,36,31 -> 12 [supp=114 conf=0.991304347826087]
48,36,12 -> 31 [supp=114 conf=1.0]
11,45,7 -> 37 [supp=154 conf=1.0]
45,11,37 -> 7 [supp=154 conf=1.0]
45,37,7 -> 11 [supp=154 conf=1.0]
17,29 -> 47 [supp=133 conf=0.9300699300699301]
49,7 -> 15 [supp=143 conf=0.9662162162162162]
49,15 -> 7 [supp=143 conf=0.910828025477707]
24,41,43,23 -> 40 [supp=106 conf=1.0]
40,24,43,23 -> 41 [supp=106 conf=1.0]
40,41,43,24 -> 23 [supp=106 conf=1.0]
40,41,43,23 -> 24 [supp=106 conf=1.0]
16,45 -> 32 [supp=164 conf=0.9371428571428572]
32,45 -> 16 [supp=164 conf=0.9425287356321839]



In [33]:
# Download and parse the bakery "20000" file, then mine it with minSup = 0.02.
# Rebinds the notebook-level names `isets` and `supports`.
df_bakery_20000 = parse_csv(from_url(bakery_dataset20000_url))
(isets, supports) = apriori(df_bakery_20000, bakery_items, 0.02)

In [34]:
# Show the itemsets from the latest apriori() run, then the rules with
# confidence >= 0.9 for the bakery "20000" data.
print(isets)
Results(df_bakery_20000, isets, supports, 0.9).print_output()

{frozenset({0, 2, 46}), frozenset({10}), frozenset({25}), frozenset({30}), frozenset({9, 4}), frozenset({3, 18, 35}), frozenset({48, 36, 12, 31}), frozenset({39}), frozenset({8}), frozenset({27, 28}), frozenset({1, 19}), frozenset({13}), frozenset({37, 7, 11, 45}), frozenset({26}), frozenset({5, 22}), frozenset({17, 29, 47}), frozenset({38}), frozenset({33, 42}), frozenset({34}), frozenset({49, 15, 7}), frozenset({20}), frozenset({44, 14}), frozenset({6}), frozenset({40, 41, 43, 23, 24}), frozenset({21}), frozenset({32, 16, 45})}
2,46 -> 0 [supp=678 conf=0.9495798319327731]
0,2 -> 46 [supp=678 conf=0.9456066945606695]
35,3 -> 18 [supp=820 conf=0.9457900807381776]
18,3 -> 35 [supp=820 conf=0.9392898052691867]
48,12,31 -> 36 [supp=420 conf=0.9929078014184397]
48,36,31 -> 12 [supp=420 conf=0.9882352941176471]
48,36,12 -> 31 [supp=420 conf=0.995260663507109]
11,45,7 -> 37 [supp=562 conf=0.9964539007092199]
45,11,37 -> 7 [supp=562 conf=0.9946902654867257]
45,37,7 -> 11 [supp=562 conf=0.9982

In [None]:
# Reference note kept as a bare string (the cell has no execution count).
# It lists the same rules and confidences as the output printed by the test
# dataset cell above, but with `supp` written as a fraction instead of a count.
"""
Test Dataset
15 -> 36 [supp=0.139 conf=0.7513513513513513]
36 -> 15 [supp=0.139 conf=0.7988505747126436]
1 -> 49 [supp=0.127 conf=0.8141025641025641]
49 -> 1 [supp=0.127 conf=0.7839506172839507]
12,14 -> 16 [supp=0.257 conf=0.9589552238805971]
16,14 -> 12 [supp=0.257 conf=0.9961240310077519]
16,12 -> 14 [supp=0.257 conf=0.9922779922779923]
22 -> 9 [supp=0.181 conf=0.8418604651162791]
9 -> 22 [supp=0.181 conf=0.8044444444444444]
"""

In [35]:
# Download and parse the bakery "75000" file, then mine it with minSup = 0.02.
# Rebinds the notebook-level names `isets` and `supports`.
df_bakery_75000 = parse_csv(from_url(bakery_dataset75000_url))
(isets, supports) = apriori(df_bakery_75000, bakery_items, 0.02)

In [36]:
# Show the itemsets from the latest apriori() run, then the rules with
# confidence >= 0.9 for the bakery "75000" data.
print(isets)
Results(df_bakery_75000, isets, supports, 0.9).print_output()

{frozenset({0, 2, 46}), frozenset({10}), frozenset({25}), frozenset({30}), frozenset({9, 4}), frozenset({3, 18, 35}), frozenset({48, 36, 12, 31}), frozenset({39}), frozenset({8}), frozenset({27, 28}), frozenset({1, 19}), frozenset({13}), frozenset({37, 7, 11, 45}), frozenset({26}), frozenset({5, 22}), frozenset({17, 29, 47}), frozenset({38}), frozenset({33, 42}), frozenset({34}), frozenset({49, 15, 7}), frozenset({20}), frozenset({44, 14}), frozenset({6}), frozenset({40, 41, 43, 23, 24}), frozenset({21}), frozenset({32, 16, 45})}
2,46 -> 0 [supp=2504 conf=0.9474082482027999]
0,2 -> 46 [supp=2504 conf=0.9395872420262664]
35,3 -> 18 [supp=3083 conf=0.9553765106910443]
18,3 -> 35 [supp=3083 conf=0.9477405471872118]
48,12,31 -> 36 [supp=1544 conf=0.9910141206675225]
48,36,31 -> 12 [supp=1544 conf=0.9897435897435898]
48,36,12 -> 31 [supp=1544 conf=0.9929260450160772]
11,45,7 -> 37 [supp=2094 conf=0.9938300901756051]
45,11,37 -> 7 [supp=2094 conf=0.9952471482889734]
45,37,7 -> 11 [supp=2094 

In [55]:
# Download and parse the bingo baskets, then mine them with minSup = 0.04 using
# the ids from the author list as the item universe.
# Rebinds the notebook-level names `isets` and `supports`.
df_bingo = parse_csv(from_url(bingo_dataset_url))
(isets, supports) = apriori(df_bingo, bingo_items, 0.04)

In [57]:
# Print rules with confidence >= 0.9 for the bingo data, using the `isets` /
# `supports` left by the most recent apriori() call.
Results(df_bingo, isets, supports, 0.9).print_output()

413 -> 306 [supp=13 conf=1.0]
306 -> 413 [supp=13 conf=1.0]
564,13 -> 91 [supp=12 conf=0.9230769230769231]
1217,1029,743 -> 91 [supp=11 conf=0.9166666666666666]
13,1029,743 -> 445 [supp=10 conf=0.9090909090909091]
433 -> 8 [supp=14 conf=1.0]
240,743 -> 1109 [supp=11 conf=1.0]
802,564 -> 91 [supp=10 conf=0.9090909090909091]



In [44]:
# Display the parsed bingo baskets.
# NOTE(review): this cell's execution count (44) is lower than that of the cell
# defining df_bingo (55), so the shown output may come from an earlier run --
# re-run top to bottom to confirm.
df_bingo

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,6,13,63,70,91,178,218,347,368,445,...,1279,1387,-1,-1,-1,-1,-1,-1,-1,-1
1,3,88,91,95,117,136,166,188,358,449,...,1271,1291,1311,1329,-1,-1,-1,-1,-1,-1
2,32,91,113,123,166,188,240,259,357,436,...,1405,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,6,18,48,88,166,194,197,281,292,315,...,1314,1321,1337,1362,1377,1406,-1,-1,-1,-1
4,1,3,91,95,166,235,249,358,453,576,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,1,6,42,49,91,169,199,305,368,422,...,1291,1387,1403,-1,-1,-1,-1,-1,-1,-1
239,2,13,48,68,88,109,195,226,240,387,...,1279,-1,-1,-1,-1,-1,-1,-1,-1,-1
240,27,29,48,75,172,195,251,265,465,495,...,1366,-1,-1,-1,-1,-1,-1,-1,-1,-1
241,143,258,260,340,397,475,553,628,687,727,...,1405,-1,-1,-1,-1,-1,-1,-1,-1,-1
