In [160]:
import pandas as pd
import itertools
import requests

In [161]:
# Default minimum support threshold, expressed as a fraction of rows.
# NOTE(review): not referenced by any of the visible cells -- the apriori()
# calls below pass literal thresholds instead.
MIN_SUP = 0.03

In [162]:
# Remote locations of the input data (fetched over plain HTTP at run time).
# BAKERY: the goods list plus three basket files of different sizes.
bakery_items_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/goods.csv"
bakery_dataset5000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/5000/5000-out1.csv"
bakery_dataset20000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/20000/20000-out1.csv"
bakery_dataset75000_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/75000/75000-out1.csv"
# BINGO: a pipe-separated author list plus the basket file.
authors_list_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/authorlist.psv"
bingo_dataset_url = "http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/bingoBaskets.csv"

In [163]:
def from_url(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: address of the resource to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        str: the decoded response body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to the
    # parsers, where it would surface as a confusing int() failure.
    response.raise_for_status()
    return response.text

def parse_csv(csv_raw):
    """Parse ragged, integer-valued CSV text into a rectangular DataFrame.

    Every non-blank line has the form ``<id>,<value>,<value>,...``. The first
    field becomes the row's index label and the remaining fields become the
    row's cells. Rows shorter than the longest one are right-padded with -1
    so that all rows have the same width.

    Args:
        csv_raw: the complete CSV content as a single string.

    Returns:
        pandas.DataFrame: one row per non-blank input line, indexed by the
        first field. Empty or whitespace-only input gives an empty DataFrame.

    Raises:
        ValueError: if any field cannot be converted to ``int``.
    """
    index = []
    rows = []
    for line in csv_raw.strip().split("\n"):
        line = line.strip()
        if not line:
            # Blank lines (and therefore empty input) carry no record; without
            # this guard int('') would raise.
            continue
        fields = [int(field) for field in line.split(",")]
        index.append(fields[0])
        rows.append(fields[1:])

    width = max(map(len, rows), default=0)
    for row in rows:
        row.extend([-1] * (width - len(row)))

    return pd.DataFrame(rows, index=index)

In [164]:
def parse_psv(psv_raw):
    """Parse pipe-separated ``<id>|<name>`` text into a one-column DataFrame.

    The first field of each line is converted to ``int`` and used as the index
    label; the second field, stripped of surrounding whitespace, is stored in
    the ``"Name"`` column. Fields after the second are ignored.
    """
    records = [
        (int(fields[0].strip()), fields[1].strip())
        for fields in (line.split("|") for line in psv_raw.strip().split("\n"))
    ]
    ids = [ident for ident, _ in records]
    names = [name for _, name in records]
    return pd.DataFrame(names, columns=["Name"], index=ids)

In [165]:
# Item universes passed to apriori() as its candidate item set.
# Bakery ids are the values of the "Id" column of goods.csv; bingo ids are the
# index labels produced by parse_psv() for the author list.
# Both lines perform a network download when the cell runs.
bakery_items = set(pd.read_csv(bakery_items_url)["Id"])
bingo_items = set(parse_psv(from_url(authors_list_url)).index)

In [166]:
def is_valid_candidate(F, u):
    """Check the Apriori pruning condition for candidate itemset ``u``.

    Given a collection of frequent itemsets ``F`` and a candidate ``u`` of
    size k, return True only if every (k-1)-sized subset of ``u`` (``u`` with
    one element removed) is a member of ``F``.
    """
    return all((u - {dropped}) in F for dropped in u)

def candidate_gen(F, k):
    """Build the (k+1)-sized candidate itemsets from the k-sized sets in ``F``.

    Every unordered pair of k-sized itemsets in ``F`` is joined; a join is kept
    only when it has exactly k+1 items and passes ``is_valid_candidate``
    (all of its k-sized subsets are in ``F``). The result is a set of
    frozensets.
    """
    same_size = [itemset for itemset in F if len(itemset) == k]
    unions = (a | b for a, b in itertools.combinations(same_size, 2))
    return {
        frozenset(merged)
        for merged in unions
        if len(merged) == k + 1 and is_valid_candidate(F, merged)
    }

In [167]:
def check_subset(row, s):
    """Return True when every item of set ``s`` occurs among the values of ``row``."""
    row_items = set(row)
    return s.issubset(row_items)

In [168]:
def support(T, iset):
    """Count the rows of ``T`` that contain every item of ``iset``.

    Args:
        T: DataFrame holding one transaction per row; each cell value is
            treated as an item of that transaction.
        iset: set or frozenset of items to look for.

    Returns:
        int: the absolute number of matching rows (a count, not a fraction).
    """
    # Walk the rows positionally instead of via T.loc[label]: when an index
    # label repeats, .loc returns a DataFrame whose iteration yields column
    # labels, which silently corrupts the count.
    return sum(1 for row in T.to_numpy().tolist() if iset.issubset(row))

In [169]:
def apriori(T, I, minSup):
    """Mine the maximal frequent itemsets of ``T`` with the Apriori algorithm.

    An itemset is frequent when the fraction of rows containing all of its
    items is at least ``minSup``. Only frequent itemsets with no frequent
    superset one item larger are returned.

    Args:
        T: DataFrame holding one transaction per row; each cell value is
            treated as an item of that transaction.
        I: iterable of the items to consider.
        minSup: minimum support as a fraction of the number of rows.

    Returns:
        set[frozenset]: the maximal frequent itemsets. An empty dataset
        yields an empty set.
    """
    n_rows = len(T.index)
    if n_rows == 0:
        # No transactions means no support ratios; avoid dividing by zero.
        return set()

    # Materialise each transaction as a set exactly once. Rows are read
    # positionally because T.loc[label] returns a DataFrame (not a row) when
    # an index label repeats, which would corrupt the counts.
    transactions = [set(row) for row in T.to_numpy().tolist()]

    # One pass over the data gives the support count of every single item.
    item_counts = {}
    for basket in transactions:
        for item in basket:
            item_counts[item] = item_counts.get(item, 0) + 1

    F_cur = {
        frozenset({i}) for i in I
        if item_counts.get(i, 0) / n_rows >= minSup
    }

    maximal = set()
    size = 1
    while F_cur:
        candidates = candidate_gen(F_cur, size)
        counts = dict.fromkeys(candidates, 0)
        for basket in transactions:
            for c in candidates:
                if c.issubset(basket):
                    counts[c] += 1

        F_next = {c for c in candidates if counts[c] / n_rows >= minSup}

        # A frequent itemset is maximal when no frequent itemset of the next
        # size contains it; by downward closure, checking one level suffices.
        maximal.update(
            small for small in F_cur
            if not any(small.issubset(big) for big in F_next)
        )

        F_cur = F_next
        size += 1

    return maximal

In [182]:
# Download and parse the bakery "5000" basket file, then mine it with a
# minimum support of 2% of rows.
# NOTE(review): the threshold is a literal; MIN_SUP is not used here.
df_bakery = parse_csv(from_url(bakery_dataset5000_url))
apriori(df_bakery, bakery_items, 0.02)

{frozenset({0, 2, 46}),
 frozenset({10}),
 frozenset({25}),
 frozenset({30}),
 frozenset({4, 9}),
 frozenset({3, 18, 35}),
 frozenset({12, 31, 36, 48}),
 frozenset({39}),
 frozenset({8}),
 frozenset({27, 28}),
 frozenset({1, 19}),
 frozenset({13}),
 frozenset({7, 11, 37, 45}),
 frozenset({26}),
 frozenset({5, 22}),
 frozenset({17, 29, 47}),
 frozenset({38}),
 frozenset({33, 42}),
 frozenset({34}),
 frozenset({7, 15, 49}),
 frozenset({20}),
 frozenset({14, 44}),
 frozenset({6}),
 frozenset({23, 24, 40, 41, 43}),
 frozenset({21}),
 frozenset({16, 32, 45})}

In [184]:
# Download and parse the bakery "20000" basket file, then mine it with a
# minimum support of 4% of rows.
# NOTE(review): this rebinds df_bakery, replacing any frame previously bound
# to that name.
df_bakery = parse_csv(from_url(bakery_dataset20000_url))
apriori(df_bakery, bakery_items, 0.04)

{frozenset({40}),
 frozenset({0, 46}),
 frozenset({49}),
 frozenset({43}),
 frozenset({10}),
 frozenset({25}),
 frozenset({1}),
 frozenset({36}),
 frozenset({30}),
 frozenset({4, 9}),
 frozenset({3, 18, 35}),
 frozenset({23}),
 frozenset({39}),
 frozenset({14}),
 frozenset({24}),
 frozenset({8}),
 frozenset({27, 28}),
 frozenset({45}),
 frozenset({13}),
 frozenset({31}),
 frozenset({41}),
 frozenset({48}),
 frozenset({11}),
 frozenset({17}),
 frozenset({44}),
 frozenset({26}),
 frozenset({47}),
 frozenset({37}),
 frozenset({5, 22}),
 frozenset({15}),
 frozenset({12}),
 frozenset({38}),
 frozenset({33, 42}),
 frozenset({2}),
 frozenset({16, 32}),
 frozenset({34}),
 frozenset({20}),
 frozenset({7}),
 frozenset({29}),
 frozenset({6}),
 frozenset({19}),
 frozenset({21})}

In [171]:
# Download and parse the bingo basket file, then mine it with a minimum
# support of 30% of rows; the item universe is the set of author ids.
df_bingo = parse_csv(from_url(bingo_dataset_url))
apriori(df_bingo, bingo_items, 0.3)

{frozenset({91}), frozenset({743}), frozenset({1109}), frozenset({644})}