# Homework2

### Mandatory Task:
You are to solve the first sub-problem: to implement the A-Priori algorithm for finding frequent itemsets with support at least s in a dataset of sales transactions. Remind that support of an itemset is the number of transactions containing the itemset. To test and evaluate your implementation, write a program that uses your A-Priori algorithm implementation to discover frequent itemsets with support at least s in a given dataset of sales transactions.

In [1]:
from pyspark import SparkContext, SparkConf
import findspark
from itertools import combinations

In [2]:
# initializing Spark
findspark.init()
conf = SparkConf().setAppName("FreqItemSets").setMaster("local[*]")
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/11/12 18:35:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
transactions = sc.textFile("datasets/transaction_dataset.txt").map(lambda line: line.strip().split(" "))
print(transactions.first())

[Stage 0:>                                                          (0 + 1) / 1]

['25', '52', '164', '240', '274', '328', '368', '448', '538', '561', '630', '687', '730', '775', '825', '834']


                                                                                

In [4]:
# Parameters:
s = 0.022 # support threshold - 0.018 there is 3-itemsets as well, but running time is too much for me
num_of_transactions = transactions.count()
frequency_threshold = s * num_of_transactions # how many times should the itemset apper to be frequent
print(frequency_threshold)

333.036


In [5]:
def generate_candidates(prev_freq_itemsets, freq_1_itemsets, k):
    """Generate candidate k-itemsets by pairing (k-1)-itemsets with 1-itemsets."""
    candidates = set()
    
    for itemset in prev_freq_itemsets:
        for item in freq_1_itemsets:
            # Create a new candidate by adding the 1-itemset to the (k-1)-itemset
            candidate = itemset | frozenset([item])
            
            # Only add if the resulting candidate has exactly k items
            if len(candidate) == k:
                candidates.add(candidate)
    
    return candidates


In [6]:
def prune_candidates(candidates, prev_freq_itemsets):
    """Prune candidate k-itemsets by removing those with infrequent (k-1)-itemset subsets."""
    pruned_candidates = set()
    
    for candidate in candidates:
        is_valid = True
        # Generate all possible (k-1)-itemsets by removing one item at a time
        for item in candidate:
            subset = candidate - frozenset([item])
            # Check if the subset is in the frequent (k-1)-itemsets
            if subset not in prev_freq_itemsets:
                is_valid = False
                break
        # If all (k-1)-subsets are frequent, add candidate to the pruned set
        if is_valid:
            pruned_candidates.add(candidate)
    
    return pruned_candidates


In [7]:
freq_itemsets = dict()

item_appear = transactions.flatMap(lambda items: [(item, 1) for item in items]).reduceByKey(lambda x, y: x+y)
freq_1_itemsets = item_appear.filter(lambda item: item[1] >= frequency_threshold).map(lambda item: item[0]).collect()
print(len(freq_1_itemsets))

[Stage 2:>                                                          (0 + 2) / 2]

130


                                                                                

In [8]:
freq_1_itemsets = set(freq_1_itemsets)

In [9]:
freq_k_itemsets = set(frozenset([item]) for item in freq_1_itemsets)
freq_itemsets[1] = freq_k_itemsets

In [10]:
k=2

while True:
    candidates_k = generate_candidates(freq_itemsets[k-1], freq_1_itemsets, k)
    if not candidates_k:
        break
    
    #Pruning:
    candidates_k = prune_candidates(candidates_k, freq_itemsets[k-1])
    if not candidates_k:
        break
    
    candidates_k_rdd = sc.parallelize(list(candidates_k)).map(lambda item: (item, 1))
    
    transaction_k_itemsets = transactions.flatMap(lambda transaction: [frozenset(combo) for combo in combinations(transaction, k)]).map(lambda x: (x, 1))
        
    # Join candidate itemsets with transaction itemsets to count support
    candidate_counts = candidates_k_rdd.join(transaction_k_itemsets).map(lambda x: (x[0], x[1][0] + x[1][1])).reduceByKey(lambda a, b: a + b)
    
    freq_k_itemsets = candidate_counts.filter(lambda x: x[1] >= frequency_threshold).map(lambda x: x[0]).collect()
        
    freq_k_itemsets = set(freq_k_itemsets)
    if not freq_k_itemsets:
        break

    freq_itemsets[k] = freq_k_itemsets
    print(f"Frequent {k}-itemsets: {freq_itemsets[k]}")
    k += 1



Frequent 2-itemsets: {frozenset({'39', '825'}), frozenset({'789', '829'}), frozenset({'346', '217'}), frozenset({'829', '368'}), frozenset({'368', '682'})}


                                                                                

In [11]:
all_frequent_itemsets = []
for size, itemsets in freq_itemsets.items():
    for itemset in itemsets:
        all_frequent_itemsets.append((itemset, size))

all_frequent_itemsets = sorted(all_frequent_itemsets, key=lambda x: (x[1], sorted(x[0])))


In [12]:
print("\nAll Frequent Itemsets:")
for itemset, size in all_frequent_itemsets:
    print(f"Itemset: {set(itemset)}, Size: {size}")



All Frequent Itemsets:
Itemset: {'112'}, Size: 1
Itemset: {'12'}, Size: 1
Itemset: {'120'}, Size: 1
Itemset: {'132'}, Size: 1
Itemset: {'140'}, Size: 1
Itemset: {'145'}, Size: 1
Itemset: {'151'}, Size: 1
Itemset: {'161'}, Size: 1
Itemset: {'175'}, Size: 1
Itemset: {'177'}, Size: 1
Itemset: {'183'}, Size: 1
Itemset: {'204'}, Size: 1
Itemset: {'205'}, Size: 1
Itemset: {'21'}, Size: 1
Itemset: {'217'}, Size: 1
Itemset: {'236'}, Size: 1
Itemset: {'239'}, Size: 1
Itemset: {'242'}, Size: 1
Itemset: {'27'}, Size: 1
Itemset: {'274'}, Size: 1
Itemset: {'276'}, Size: 1
Itemset: {'279'}, Size: 1
Itemset: {'283'}, Size: 1
Itemset: {'285'}, Size: 1
Itemset: {'296'}, Size: 1
Itemset: {'32'}, Size: 1
Itemset: {'346'}, Size: 1
Itemset: {'350'}, Size: 1
Itemset: {'354'}, Size: 1
Itemset: {'362'}, Size: 1
Itemset: {'368'}, Size: 1
Itemset: {'38'}, Size: 1
Itemset: {'381'}, Size: 1
Itemset: {'39'}, Size: 1
Itemset: {'390'}, Size: 1
Itemset: {'392'}, Size: 1
Itemset: {'401'}, Size: 1
Itemset: {'413'}, Si