In [19]:
import pandas as pd
import numpy as np
import urllib.request as url
import itertools
import re

In [20]:
# Bakery transaction tables. Columns are labelled 0..49 and column 0 is used as
# the row index. NOTE(review): presumably each remaining column is a 0/1
# indicator for one bakery item -- confirm against the dataset description.
bakery5 = pd.read_csv("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/5000/5000-out2.csv", names=np.arange(0,50), index_col=0)
bakery20 = pd.read_csv("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/20000/20000-out2.csv", names=np.arange(0,50), index_col=0)
bakery75 = pd.read_csv("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/75000/75000-out2.csv", names=np.arange(0,50), index_col=0)

# Lookup tables used to turn numeric ids into printable names.
food_labels = pd.read_csv("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BAKERY/goods.csv")
authors_labels = pd.read_csv("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/authorlist.psv", delimiter="|", names=["Id","Authors"])

# Parse the sparse basket file: each line is "<basket id>, <id>, <id>, ...".
authors_dict = {}
# Largest id seen in any basket. Deliberately not called `max`, which would
# shadow the builtin max() for every later cell in the notebook.
max_book_id = 0
# The context manager closes the HTTP response even if parsing raises.
with url.urlopen("http://users.csc.calpoly.edu/~dekhtyar/466-Fall2021/data/BINGO/bingoBaskets.csv") as authors_file:
    for line in authors_file:
        data = line.decode("utf-8").split(",", 1)
        data[0] = int(data[0])
        data[1] = [int(x.strip()) for x in data[1].split(", ")]
        for book in data[1]:
            if book > max_book_id:
                max_book_id = book
        authors_dict[data[0]] = data[1]

# Expand the baskets into a dense 0/1 matrix: row = basket id, column = id - 1.
# NOTE(review): indexing rows by basket id assumes ids run 0..len-1 -- confirm.
authors_list = np.zeros((len(authors_dict), max_book_id))
for key in authors_dict.keys():
    for book in authors_dict[key]:
        authors_list[key][book - 1] = 1

# Relabel columns 1..max id so they line up with authors_labels["Id"].
authors = pd.DataFrame(authors_list)
authors.columns = np.arange(1, len(authors_list[0]) + 1)
# NOTE(review): column 1 is dropped here; the reason is not visible in this
# notebook -- confirm it is intentional.
authors = authors.drop(1, axis=1)

In [21]:
def update_skyline(skyline, combo, k):
    """Register a frequent k-itemset and retire the subsets it dominates.

    Parameters
    ----------
    skyline : dict
        Maps an itemset size to the set of itemsets of that size currently on
        the skyline. Mutated in place.
    combo : sequence
        ``combo[0]`` is the new itemset, ``combo[1]`` an iterable of its
        (k-1)-element subsets.
    k : int
        Size of the new itemset.
    """
    itemset, sub_itemsets = combo[0], combo[1]
    # Every listed subset is now covered by the larger itemset.
    for sub in sub_itemsets:
        skyline[k - 1].discard(sub)
    # Create the size-k bucket on first use, then record the itemset.
    skyline.setdefault(k, set()).add(itemset)

def get_sets(s, n):
    """Return all n-element combinations of ``s`` as a list of tuples.

    Order follows ``itertools.combinations`` (lexicographic by position in ``s``).
    """
    # combinations() already yields tuples, so the list can be built directly.
    return list(itertools.combinations(s, n))

def candidate_gen(items, k, rules=False):
    """Build size-``k`` candidates by joining pairs of entries from ``items``.

    Two entries are unioned; the union is kept when it has exactly ``k``
    elements and every one of its (k-1)-element subsets is itself present in
    ``items`` (the Apriori pruning step).

    Parameters
    ----------
    items : sequence
        Existing itemsets, normally sorted tuples of size ``k - 1``.
    k : int
        Size of the candidates to generate.
    rules : bool, default False
        When True, return bare candidate tuples. When False, return
        ``(candidate, subsets)`` pairs where ``subsets`` is the list of the
        candidate's (k-1)-element subsets.

    Returns
    -------
    list
        Candidates in order of first discovery, without duplicates.
    """
    # Hash-based lookup makes each pruning check O(1) instead of scanning the
    # list. Unhashable entries fall back to the sequence itself so membership
    # behaves exactly like the plain list test.
    try:
        known = set(items)
    except TypeError:
        known = items

    combos = []
    # Every candidate already examined, accepted or not: the pruning outcome
    # depends only on the candidate, so it never needs to be re-evaluated.
    checked = set()
    for i in range(len(items)):
        left = set(items[i])
        for j in range(i + 1, len(items)):
            item = tuple(sorted(left | set(items[j])))
            if len(item) != k or item in checked:
                continue
            checked.add(item)
            subsets = list(itertools.combinations(item, k - 1))
            if all(subset in known for subset in subsets):
                combos.append(item if rules else (item, subsets))
    return combos

def apriori(dataset, items, min_r_support):
    """Mine frequent itemsets level by level and keep their skyline.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One column per item; the support of an itemset is the sum of the
        element-wise product of its columns divided by the number of rows.
    items : iterable
        Column labels to consider. The caller's object is not modified.
    min_r_support : float
        Minimum relative support an itemset needs to be kept.

    Returns
    -------
    (dict, dict)
        ``skyline`` maps an itemset size to the set of frequent itemsets of that
        size that were not absorbed by a larger frequent itemset. ``supports``
        maps each frequent itemset to its relative support; single items are
        keyed by the bare label, larger itemsets by their sorted tuple.
    """
    n = dataset.shape[0]
    k = 1
    # Sort a copy: an in-place sort would reorder the list owned by the caller.
    items = sorted(items)

    supports = {}
    skyline = {k: set()}
    last = []
    for item in items:
        item_support = dataset[item].sum() / n
        if item_support >= min_r_support:
            skyline[1].add((item, ))
            supports[item] = item_support
            last.append((item, ))

    while k <= len(items):
        k += 1
        combos = candidate_gen(last, k)
        last = []
        for combo in combos:
            total = None
            for item in combo[0]:
                if total is None:
                    # Copy once so the in-place products below never touch the dataset.
                    total = dataset[item].copy()
                else:
                    total *= dataset[item]
            combo_support = total.sum() / n
            if combo_support >= min_r_support:
                update_skyline(skyline, combo, k)
                supports[combo[0]] = combo_support
                last.append(combo[0])
        # No frequent itemset of this size means none can exist at larger sizes.
        if len(last) == 0:
            break

    return skyline, supports

In [29]:
class GenRules():
    """Derive association-rule consequents from frequent itemsets.

    A rule is represented only by its consequent (a tuple of items); the
    antecedent is the rest of the frequent itemset it was derived from.
    """

    def __init__(self, table, freq_sets, min_conf):
        """Store a private copy of ``table`` plus the itemsets and threshold.

        table     -- DataFrame with one column per item.
        freq_sets -- iterable of frequent itemsets (tuples of column labels).
        min_conf  -- minimum confidence (inclusive) a rule must reach.
        """
        self.table = table.copy()
        self.freq_sets = freq_sets
        self.min_conf = min_conf

    def get_total(self, freq_set):
        """Return the element-wise product of the columns in ``freq_set``.

        Returns None when ``freq_set`` is empty.
        """
        total = None
        for item in freq_set:
            if total is None:
                # Copy once so the in-place products never modify self.table.
                total = self.table[item].copy()
            else:
                total *= self.table[item]
        return total

    def confidence(self, item_set, subset):
        """Return sum(product of ``item_set``) / sum(product of ``subset``).

        ``subset`` is the rule's antecedent.
        """
        item_set_total = self.get_total(item_set)
        subset_total = self.get_total(subset)
        return item_set_total.sum() / subset_total.sum()

    def ap_gen_rules(self, item_set, rules, size):
        """Recursively grow consequents of ``item_set`` to ``size`` and beyond.

        ``rules`` holds the accepted consequents of the previous size. Returns
        the list of all larger consequents that reach the confidence threshold.
        """
        new_rules = []
        if len(item_set) > size and len(rules) > 0:
            candidates = candidate_gen(rules, size, True)
            new_candidates = []
            for candidate in candidates:
                # Antecedent = the items of the itemset not in the consequent.
                subset = tuple([i for i in item_set if i not in candidate])
                if self.confidence(item_set, subset) >= self.min_conf:
                    new_rules.append(candidate)
                    new_candidates.append(candidate)
            new_rules += self.ap_gen_rules(item_set, new_candidates, size + 1)
        return new_rules

    def gen_rules(self):
        """Generate consequents for every itemset with more than one item.

        Returns the skyline-filtered mapping ``{itemset: [consequent, ...]}``.
        """
        all_rules = {}
        for freq_set in self.freq_sets:
            if len(freq_set) > 1:
                rules = []
                # Start with single-item consequents.
                for item in freq_set:
                    subset = tuple([i for i in freq_set if i != item])
                    if self.confidence(freq_set, subset) >= self.min_conf:
                        rules.append((item, ))
                rules += self.ap_gen_rules(freq_set, rules, 2)
                all_rules[freq_set] = rules
        return self.skyline(all_rules)

    def skyline(self, rules):
        """Keep, per itemset, the consequents not contained in another one.

        A consequent is only retained after being compared with a *different*
        consequent of the same itemset, so an itemset with a single consequent
        (or none) produces no entry in the result.
        """
        new_rules = {}
        for key in rules.keys():
            rule_list = rules[key]
            if len(rule_list) == 0:
                continue
            add_list = set()
            # True while the consequent has not been found inside another one.
            library = {rule: True for rule in rule_list}
            for rule in rule_list[::-1]:
                for rule_2 in rule_list:
                    if rule_2 == rule:
                        continue
                    if any(val not in rule for val in rule_2):
                        add_list.add(rule_2)
                    else:
                        # rule_2 is fully contained in rule: dominated.
                        library[rule_2] = False
            for rule in library.keys():
                if not library[rule]:
                    # discard, not remove: a dominated consequent that is contained
                    # in every other consequent was never added to add_list.
                    add_list.discard(rule)
            if len(add_list) > 0:
                new_rules[key] = list(add_list)
        return new_rules

In [30]:
def get_confidence(table, main_set, subset):
    """Return the confidence of the rule (main_set minus subset) -> subset.

    Parameters
    ----------
    table : pandas.DataFrame
        One column per item; never modified.
    main_set : iterable
        Items of the full itemset.
    subset : container
        Items forming the consequent; the antecedent is every item of
        ``main_set`` that is not in ``subset``.

    Returns
    -------
    Sum of the element-wise product of the ``main_set`` columns divided by the
    sum of the element-wise product of the antecedent columns.
    """
    def get_union(mset):
        # Element-wise product of the selected columns (None for an empty set).
        total = None
        for item in mset:
            if total is None:
                # One column copy is enough to keep the in-place products off
                # the caller's table; copying the whole frame is unnecessary.
                total = table[item].copy()
            else:
                total *= table[item]
        return total

    main_set_total = get_union(main_set)
    subset_total = get_union(tuple([i for i in main_set if i not in subset]))
    return main_set_total.sum() / subset_total.sum()

In [31]:
def make_output(data, ap, support, labels, columns, minConf, authors=False):
    """Print single-consequent rules for every itemset on the skyline.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one column per item, used for rule generation and confidence.
    ap : dict
        Skyline mapping ``size -> set of itemsets``.
    support : dict
        Relative support keyed by itemset tuple; printed as a percentage.
    labels : pandas.DataFrame
        Lookup table with an ``"Id"`` column and the columns named in ``columns``.
    columns : list
        Label columns joined with a space to form an item's printable name.
    minConf : float
        Threshold passed to rule generation; a rule is printed only when its
        confidence is strictly greater than this value.
    authors : bool, default False
        Selects the narrower output layout, which prints only the first
        left-hand name and replaces double whitespace with " & ".
    """

    def describe(item_id):
        # Printable names for one id: label columns joined by a space, with
        # every character that is neither a word character nor whitespace removed.
        rows = labels[labels["Id"] == item_id][columns].values.tolist()
        return [re.sub(r"[^\w\s]", "", " ".join(row)) for row in rows]

    for key in ap.keys():
        for i in ap[key]:
            t = GenRules(data, [i], minConf).gen_rules()
            for items in t.keys():
                # Items present in every skyline consequent are excluded from
                # the right-hand sides printed below. A single consequent is
                # treated as its own intersection; the previous list-based
                # accumulator raised AttributeError in that case.
                # NOTE(review): confirm this is the intended handling.
                consequents = [set(subset) for subset in t[items]]
                intersect = set.intersection(*consequents) if consequents else set()

                items_copy = set(items)
                items_copy.difference_update(intersect)

                for item in items_copy:
                    string = describe(item)

                    new_items = []
                    for item2 in items:
                        if item2 != item:
                            new_items.extend(describe(item2))

                    conf = get_confidence(data, items, [item])
                    if conf > minConf:
                        if not authors:
                            print("{:>80}  {:>10}  {:<30} Support={:>4.2f}   Confidence={:>4.2f}".format(", ".join(new_items), "------->", ", ".join(string), support[items]*100, conf * 100))
                        else:
                            print("{:>40}  {:>10}  {:<30} Support={:>4.2f}   Confidence={:>4.2f}".format(re.sub(r"\s\s", " & ", new_items[0]), "------->", re.sub(r"\s\s", " & ", string[0]), support[items]*100, conf * 100))

In [32]:
# Mine frequent itemsets: minimum relative support 0.1 for authors, 0.02 for bakery.
# NOTE(review): the bakery run mines bakery20 but takes its item list from
# bakery5.columns, and a later cell passes bakery5 to make_output together with
# these bakery20-derived supports -- confirm the dataset mix is intended.
ap_auth, support_auth = apriori(authors, authors.columns.tolist(), 0.1)
ap_bake, support_bake = apriori(bakery20, bakery5.columns.tolist(), 0.02)

In [None]:
# Print author rules whose confidence is strictly above 0.35 (narrow "authors" layout).
make_output(authors, ap_auth, support_auth, authors_labels, ["Authors"], .35, authors=True)

                       Sanderson Brandon    ------->   Bancroft Josiah                         Support=20.58   Confidence=49.02
                         Bancroft Josiah    ------->   Sanderson Brandon                       Support=20.58   Confidence=49.50
                           Lawrence Mark    ------->   Rowe Andrew                             Support=11.52   Confidence=37.33
                             Rowe Andrew    ------->   Lawrence Mark                           Support=11.52   Confidence=47.46
                             Gaiman Neil    ------->   Pratchett Terry                         Support=11.93   Confidence=44.62
                         Pratchett Terry    ------->   Gaiman Neil                             Support=11.93   Confidence=42.65
                           Lawrence Mark    ------->   Sanderson Brandon                       Support=16.05   Confidence=52.00
                       Sanderson Brandon    ------->   Lawrence Mark                           Support=1

In [41]:
# Print bakery rules whose confidence is strictly above 0.84.
# NOTE(review): ap_bake/support_bake were mined from bakery20, while rule
# generation and confidence here run on bakery5 -- confirm this is intended.
make_output(bakery5, ap_bake, support_bake, food_labels, ["Flavor","Food"], .84)

                                                   Apricot Croissant, Hot Coffee    ------->  Blueberry Tart                 Support=3.26   Confidence=94.25
                                                      Blueberry Tart, Hot Coffee    ------->  Apricot Croissant              Support=3.26   Confidence=93.71
                                                         Opera Cake, Cherry Tart    ------->  Apricot Danish                 Support=4.10   Confidence=93.58
                                                      Opera Cake, Apricot Danish    ------->  Cherry Tart                    Support=4.10   Confidence=94.44
                                                   Casino Cake, Chocolate Coffee    ------->  Chocolate Cake                 Support=3.39   Confidence=90.17
                                                     Chocolate Cake, Casino Cake    ------->  Chocolate Coffee               Support=3.39   Confidence=91.23
                                                  Coffee E

In [None]:
def create_item_sets(ap, support, labels, column="Authors"):
    """Print every skyline itemset with more than one item and its support.

    Parameters
    ----------
    ap : dict
        Skyline mapping ``size -> iterable of itemsets``.
    support : dict
        Relative support keyed by itemset tuple; printed rounded to 2 decimals.
    labels : pandas.DataFrame
        Lookup table with an ``"Id"`` column and the column named by ``column``.
    column : str, default "Authors"
        Label column holding the printable name.
    """
    for key in ap.keys():
        for item1 in ap[key]:
            if len(item1) <= 1:
                continue
            string = []
            for item2 in item1:
                # Read the label values directly rather than parsing the array's
                # printed form, which left stray quotes around names that
                # contain an apostrophe.
                matches = labels[labels["Id"] == item2][column].values.tolist()
                name = " ".join(str(match) for match in matches)
                string.append(name.replace(",", ""))
            print(", ".join(string), "[Support=" + str(round(support[item1],2)) + "]")

# List the multi-author skyline itemsets mined above together with their support.
create_item_sets(ap_auth, support_auth, authors_labels)

 Bancroft Josiah,  Sanderson Brandon [Support=0.21]
 Bancroft Josiah,  Chambers Becky [Support=0.11]
 Bancroft Josiah,  Eames Nicholas [Support=0.14]
 Bancroft Josiah,  Mieville China [Support=0.12]
 Addison Katherine / Monette Sarah,  Lawrence Mark [Support=0.11]
 Bancroft Josiah,  Jemisin N. K. [Support=0.14]
 Pratchett Terry,  Sanderson Brandon [Support=0.14]
 Lawrence Mark,  Rowe Andrew [Support=0.12]
 Gaiman Neil,  Pratchett Terry [Support=0.12]
 King Stephen,  Sanderson Brandon [Support=0.1]
 Lawrence Mark,  Sanderson Brandon [Support=0.16]
 Hobb Robin / Lindholm Megan,  Sanderson Brandon [Support=0.13]
 Novik Naomi,  Sanderson Brandon [Support=0.15]
 Sanderson Brandon,  Sullivan Michael J. [Support=0.13]
 Gaiman Neil,  Sanderson Brandon [Support=0.15]
 Bancroft Josiah,  Brennan Marie [Support=0.12]
 Addison Katherine / Monette Sarah,  Bancroft Josiah [Support=0.14]
 Rowe Andrew,  Sanderson Brandon [Support=0.14]
 Sanderson Brandon,  VanderMeer Jeff [Support=0.11]
 Lawrence Mark,