In [15]:
from collections import defaultdict
from itertools import combinations, chain
from csv import reader

import pandas as pd

## Read and load the data

In [16]:
def read_data(fname, delim):
    itemSets = []
    itemSet = set()

    with open(fname, 'r') as file:
        csv_reader = reader(file, delimiter=delim)
        for line in csv_reader:
            line = list(filter(None, line))
            record = set(line)
            for item in record:
                itemSet.add(frozenset([item]))
            itemSets.append(record)
    return itemSet, itemSets

C1_itemset, itemset_list = read_data('tmp.csv', '\t')

## Implement the A-Priori algorithm

In [13]:
# Apriori algorithm using PySpark

def generate_next_candidate_set(prev_frequent_set, length):
    next_candidate_set = [var1 | var2 for index, var1 in enumerate(prev_frequent_set) for var2 in prev_frequent_set[index + 1:] 
                          if list(var1)[:length - 2] == list(var2)[:length - 2]]
    return next_candidate_set
 


def generate_frequent_itemset_k(sc, Ck, shared_itemset, min_supp):
    def get_supp(x):
        x_supp = len([1 for itemset in shared_itemset.value if x.issubset(itemset)])
        if x_supp >= min_supp:
            return x, x_supp
        else:
            return ()
    freq_itemset_k = sc.parallelize(Ck).map(get_supp).filter(lambda x: x != ()).collect()
    return freq_itemset_k



def apriori(sc, itemset_rdd, min_sup, max_k):
    # share the whole itemset with all workers
    shared_itemset = sc.broadcast(itemset_rdd.map(lambda x: set(x)).collect())
    # store for all freq_k
    frequent_itemset = []

    # prepare candidate_1
    k = 1
    c_k = itemset_rdd.flatMap(lambda x: set(x)).distinct().collect()
    c_k = [{x} for x in c_k]

    # when candidate_k is not empty
    while len(c_k) > 0 and k <= max_k:
        # generate freq_k
        f_k = generate_frequent_itemset_k(sc, c_k, shared_itemset, min_sup)

        frequent_itemset.append(f_k)
        # generate candidate_k+1
        k += 1
        c_k = generate_next_candidate_set([set(item) for item in map(lambda x: x[0], f_k)], k)

    sc.stop()
    return frequent_itemset
    
    
    

In [2]:
from pyspark.sql import SparkSession

def create_new_spark_context():
    spark = SparkSession.builder.appName("A-Priori").config("spark.driver.memory", "32g").config("spark.driver.maxResultSize", "0")\
    .master("local[*]").getOrCreate()

    return spark.sparkContext

In [18]:
sc = create_new_spark_context()
records_rdd = sc.textFile("records.txt")

In [20]:
records_rdd = records_rdd.map(lambda x: x.strip().split())
records = records_rdd.collect()

In [22]:
frequentItemSets = apriori(sc, records_rdd, 100, 3)

                                                                                

In [30]:
frequentItemSets[1]

[({'ELE17451', 'SNA90258'}, 113),
 ({'GRO99222', 'SNA90258'}, 156),
 ({'DAI62779', 'SNA90258'}, 114),
 ({'DAI22896', 'GRO73461'}, 304),
 ({'GRO73461', 'SNA69641'}, 150),
 ({'ELE59935', 'GRO73461'}, 116),
 ({'DAI22177', 'GRO73461'}, 248),
 ({'ELE66810', 'GRO73461'}, 228),
 ({'GRO36567', 'GRO73461'}, 117),
 ({'GRO73461', 'SNA55952'}, 117),
 ({'DAI48891', 'GRO73461'}, 117),
 ({'ELE11111', 'GRO73461'}, 158),
 ({'FRO16142', 'GRO73461'}, 197),
 ({'FRO24098', 'GRO73461'}, 112),
 ({'GRO73461', 'SNA59903'}, 123),
 ({'DAI55911', 'GRO73461'}, 116),
 ({'FRO31317', 'GRO73461'}, 395),
 ({'GRO73461', 'SNA72163'}, 285),
 ({'DAI63921', 'GRO73461'}, 219),
 ({'GRO73461', 'SNA18336'}, 121),
 ({'DAI91290', 'GRO73461'}, 161),
 ({'ELE12792', 'GRO73461'}, 116),
 ({'GRO73461', 'GRO85051'}, 147),
 ({'DAI73122', 'GRO73461'}, 146),
 ({'FRO73056', 'GRO73461'}, 195),
 ({'ELE32164', 'GRO73461'}, 486),
 ({'DAI88807', 'GRO73461'}, 313),
 ({'FRO66272', 'GRO73461'}, 110),
 ({'DAI88079', 'GRO73461'}, 145),
 ({'GRO73461',

In [31]:
frequentItemSets[2]

[({'DAI22896', 'DAI62779', 'GRO73461'}, 101),
 ({'DAI62779', 'FRO31317', 'GRO73461'}, 100),
 ({'DAI88807', 'GRO73461', 'SNA72163'}, 110),
 ({'FRO40251', 'GRO73461', 'GRO85051'}, 147),
 ({'FRO73056', 'GRO44993', 'GRO73461'}, 106),
 ({'ELE32164', 'GRO59710', 'GRO73461'}, 137),
 ({'DAI62779', 'ELE32164', 'GRO73461'}, 131),
 ({'DAI43223', 'ELE32164', 'GRO73461'}, 111),
 ({'DAI88079', 'FRO40251', 'GRO73461'}, 144),
 ({'DAI75645', 'GRO73461', 'SNA80324'}, 230),
 ({'DAI62779', 'GRO73461', 'SNA80324'}, 198),
 ({'FRO40251', 'GRO73461', 'SNA80324'}, 232),
 ({'DAI75645', 'FRO47962', 'GRO73461'}, 111),
 ({'DAI75645', 'ELE17451', 'GRO73461'}, 121),
 ({'DAI62779', 'DAI75645', 'GRO73461'}, 261),
 ({'DAI75645', 'FRO40251', 'GRO73461'}, 293),
 ({'DAI75645', 'GRO21487', 'GRO73461'}, 114),
 ({'DAI75645', 'GRO46854', 'GRO73461'}, 101),
 ({'DAI62779', 'GRO73461', 'SNA96271'}, 114),
 ({'FRO40251', 'GRO56726', 'GRO73461'}, 103),
 ({'DAI62779', 'GRO71621', 'GRO73461'}, 153),
 ({'DAI62779', 'DAI85309', 'GRO734