# Homework2

### Mandatory Task:
You are to solve the first sub-problem: implement the A-Priori algorithm for finding frequent itemsets with support at least s in a dataset of sales transactions. Recall that the support of an itemset is the number of transactions containing the itemset. To test and evaluate your implementation, write a program that uses your A-Priori implementation to discover frequent itemsets with support at least s in a given dataset of sales transactions.

In [1]:
from pyspark import SparkContext, SparkConf
import findspark
from itertools import combinations

In [2]:
# Initializing Spark: local mode, using all available cores.
findspark.init()
conf = SparkConf().setAppName("FreqItemSets").setMaster("local[*]")
# getOrCreate reuses the context that is already running in this kernel, so
# re-running this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/11/14 20:16:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# One transaction per line, items separated by whitespace.
# split() with no argument collapses repeated whitespace and turns a blank line
# into an empty transaction, instead of producing a phantom '' item.
# The RDD is cached because it is rescanned on every A-Priori pass.
transactions = (
    sc.textFile("datasets/transaction_dataset2.txt")
    .map(lambda line: line.split())
    .cache()
)
print(transactions.first())

[Stage 0:>                                                          (0 + 1) / 1]

['1', '2', '3', '4']


                                                                                

In [4]:
# Parameters
# s: support threshold as a fraction of all transactions. With 0.018 there are
#    3-itemsets as well, but the running time becomes too long.
s = 0.2
# c: threshold that rule confidence is compared against further below.
c = 0.6

num_of_transactions = transactions.count()
# How many times an itemset must appear to be considered frequent.
frequency_threshold = s * num_of_transactions
print(frequency_threshold)

2.6


In [5]:
def generate_candidates(prev_freq_itemsets, freq_1_itemsets, k):
    """Build candidate k-itemsets from (k-1)-itemsets and 1-itemsets.

    Every itemset in ``prev_freq_itemsets`` is unioned with every itemset in
    ``freq_1_itemsets``; only unions containing exactly ``k`` items are kept.
    The inputs and the resulting candidate set are printed as a trace.

    Returns:
        set: the distinct unions of size ``k``.
    """
    print(f"Prev freq itemsets: {prev_freq_itemsets}")
    print(f"1 freq itemsets: {freq_1_itemsets}")
    print(f"generate candidates for k={k}")

    # All pairwise unions, in the same nested iteration order (outer: previous
    # itemsets, inner: single-item sets).
    unions = (
        base | single
        for base in prev_freq_itemsets
        for single in freq_1_itemsets
    )
    # A union that adds an item already present stays at k-1 items and is dropped.
    candidates = {merged for merged in unions if len(merged) == k}

    print(candidates)

    return candidates


In [6]:
def prune_candidates(candidates, prev_freq_itemsets):
    """Keep only candidates whose every one-item-removed subset is frequent.

    For each candidate, each subset obtained by dropping a single item is
    looked up in ``prev_freq_itemsets``; the candidate survives only if all of
    those subsets are present. The surviving set is printed as a trace.

    Returns:
        set: the candidates that passed the subset check.
    """
    def all_subsets_frequent(candidate):
        # all() short-circuits on the first missing subset.
        return all(
            candidate - frozenset([dropped]) in prev_freq_itemsets
            for dropped in candidate
        )

    pruned_candidates = {
        candidate for candidate in candidates if all_subsets_frequent(candidate)
    }

    print(f"Pruned candidates: {pruned_candidates}")

    return pruned_candidates


In [7]:
# freq_itemsets: itemset size -> set of frequent itemsets (frozensets)
# s_count:       frequent itemset (frozenset) -> support count
freq_itemsets = dict()
s_count = dict()

# Support is the number of transactions containing the item, so each
# transaction contributes at most once per distinct item: set(items) drops
# repeats of the same item inside a single transaction.
item_appear = transactions.flatMap(lambda items: [(item, 1) for item in set(items)]).reduceByKey(lambda x, y: x + y)
freq_1_itemsets_tuples = item_appear.filter(lambda item: item[1] >= frequency_threshold).map(lambda item: (frozenset([item[0]]), item[1])).collect()

freq_1_itemsets = set()
# Make sure level 1 always exists, even when no item reaches the threshold,
# so later lookups of freq_itemsets[1] do not raise KeyError.
freq_itemsets[1] = set()

for itemset, count in freq_1_itemsets_tuples:
    freq_itemsets[1].add(itemset)
    s_count[itemset] = count
    # update() adds the single raw item held by the frozenset.
    freq_1_itemsets.update(itemset)

print(len(freq_1_itemsets))
print(s_count)

5
{frozenset({'1'}): 10, frozenset({'4'}): 5, frozenset({'2'}): 8, frozenset({'3'}): 4, frozenset({'5'}): 5}


In [8]:
# Frequent 1-itemsets as single-item frozensets.
# They are taken straight from the collected (itemset, count) pairs, which are
# already filtered by the frequency threshold, rather than by re-wrapping
# freq_1_itemsets in place. This keeps the cell idempotent: re-running it no
# longer nests frozensets inside frozensets.
freq_1_itemsets = {itemset for itemset, _count in freq_1_itemsets_tuples}
print(f"freq_1_itemsets: {freq_1_itemsets}")


freq_1_itemsets: {frozenset({'5'}), frozenset({'3'}), frozenset({'1'}), frozenset({'2'}), frozenset({'4'})}


In [9]:
# A-Priori passes for k >= 2: generate candidates from the previous level,
# prune them, count their support over the transactions, keep the frequent ones.
k = 2

while True:
    # .get() avoids a KeyError when the previous level was never populated.
    prev_freq_itemsets = freq_itemsets.get(k - 1, set())

    candidates_k = generate_candidates(prev_freq_itemsets, freq_1_itemsets, k)
    if not candidates_k:
        break

    # Pruning:
    candidates_k = prune_candidates(candidates_k, prev_freq_itemsets)
    if not candidates_k:
        break

    # Ship the candidate set, and the items that occur in any candidate, to the workers.
    candidates_bc = sc.broadcast(candidates_k)
    candidate_items_bc = sc.broadcast(frozenset().union(*candidates_k))

    def candidate_hits(transaction, size=k, cands=candidates_bc, items=candidate_items_bc):
        """Return (candidate, 1) for every candidate contained in the transaction.

        Only items that occur in some candidate are combined, so the number of
        k-subsets examined per transaction stays small. The intersection also
        removes repeated items, so a transaction counts at most once per
        candidate. Default arguments bind the current pass's values.
        """
        relevant = items.value.intersection(transaction)
        return [
            (itemset, 1)
            for itemset in map(frozenset, combinations(relevant, size))
            if itemset in cands.value
        ]

    # Only candidate itemsets are emitted and shuffled; no join against the
    # full set of k-subsets of every transaction is needed.
    candidate_counts = transactions.flatMap(candidate_hits).reduceByKey(lambda a, b: a + b)
    freq_k_itemsets = candidate_counts.filter(lambda x: x[1] >= frequency_threshold).collect()

    if not freq_k_itemsets:
        break

    freq_itemsets[k] = set()
    for itemset, count in freq_k_itemsets:
        freq_itemsets[k].add(itemset)
        s_count[itemset] = count
    print(f"k={k}: {freq_itemsets[k]}")
    k += 1

Prev freq itemsets: {frozenset({'5'}), frozenset({'3'}), frozenset({'1'}), frozenset({'2'}), frozenset({'4'})}
1 freq itemsets: {frozenset({'5'}), frozenset({'3'}), frozenset({'1'}), frozenset({'2'}), frozenset({'4'})}
generate candidates for k=2
{frozenset({'5', '1'}), frozenset({'5', '3'}), frozenset({'3', '1'}), frozenset({'5', '4'}), frozenset({'5', '2'}), frozenset({'2', '4'}), frozenset({'2', '1'}), frozenset({'2', '3'}), frozenset({'1', '4'}), frozenset({'3', '4'})}
Pruned candidates: {frozenset({'5', '1'}), frozenset({'5', '3'}), frozenset({'3', '1'}), frozenset({'5', '4'}), frozenset({'5', '2'}), frozenset({'2', '4'}), frozenset({'2', '1'}), frozenset({'2', '3'}), frozenset({'1', '4'}), frozenset({'3', '4'})}
k=2: {frozenset({'5', '1'}), frozenset({'5', '4'}), frozenset({'2', '4'}), frozenset({'2', '1'}), frozenset({'2', '3'}), frozenset({'1', '4'}), frozenset({'3', '4'})}
Prev freq itemsets: {frozenset({'5', '1'}), frozenset({'5', '4'}), frozenset({'2', '4'}), frozenset({'2',

In [10]:
# Show the frequent itemsets found, keyed by itemset size.
print(freq_itemsets)

{1: {frozenset({'5'}), frozenset({'3'}), frozenset({'1'}), frozenset({'2'}), frozenset({'4'})}, 2: {frozenset({'5', '1'}), frozenset({'5', '4'}), frozenset({'2', '4'}), frozenset({'2', '1'}), frozenset({'2', '3'}), frozenset({'1', '4'}), frozenset({'3', '4'})}, 3: {frozenset({'2', '3', '4'})}}


In [11]:
# Flatten the per-size dictionary into (itemset, size, support count) records.
all_frequent_itemsets = [
    (frozenset(members), n_items, s_count[members])
    for n_items, group in freq_itemsets.items()
    for members in group
]

# Order by itemset size first, then by the sorted item labels.
all_frequent_itemsets.sort(key=lambda record: (record[1], sorted(record[0])))


In [12]:
# Report every frequent itemset with its size and support count.
print("\nAll Frequent Itemsets:")
for record in all_frequent_itemsets:
    members, n_items, support = record
    print(f"Itemset: {set(members)}, Size: {n_items}, Support Count: {support}")




All Frequent Itemsets:
Itemset: {'1'}, Size: 1, Support Count: 10
Itemset: {'2'}, Size: 1, Support Count: 8
Itemset: {'3'}, Size: 1, Support Count: 4
Itemset: {'4'}, Size: 1, Support Count: 5
Itemset: {'5'}, Size: 1, Support Count: 5
Itemset: {'2', '1'}, Size: 2, Support Count: 5
Itemset: {'1', '4'}, Size: 2, Support Count: 3
Itemset: {'5', '1'}, Size: 2, Support Count: 3
Itemset: {'2', '3'}, Size: 2, Support Count: 3
Itemset: {'2', '4'}, Size: 2, Support Count: 3
Itemset: {'3', '4'}, Size: 2, Support Count: 3
Itemset: {'5', '4'}, Size: 2, Support Count: 3
Itemset: {'2', '3', '4'}, Size: 3, Support Count: 3


### Association Rule Generation

In [13]:
# Build rules antecedent -> consequent from every frequent itemset with at
# least two items, keeping those whose confidence reaches the threshold c.
association_rules = []

for itemset, size, count in all_frequent_itemsets:
    if size < 2:
        continue

    # Walk the non-empty proper subsets in order of increasing length.
    for r in range(1, size):
        for left_side in combinations(itemset, r):
            antecedent = frozenset(left_side)
            consequent = itemset - antecedent
            if not consequent:
                continue

            # Support counts of the whole itemset and of the antecedent alone.
            support_itemset = int(count)
            support_antecedent = int(s_count.get(antecedent, 0))
            if support_antecedent == 0:
                continue  # Avoid division by zero

            confidence = support_itemset / support_antecedent
            if confidence >= c:
                rule = (set(antecedent), set(consequent), support_itemset, support_antecedent, confidence)
                association_rules.append(rule)



In [None]:
# Highest-confidence rules first (index 4 of each rule tuple is the confidence).
association_rules.sort(key=lambda rule: rule[4], reverse=True)

for rule in association_rules:
    antecedent, consequent, support_S, support_A, confidence = rule
    print(f"Rule: {antecedent} -> {consequent}, Support(L): {support_S}, Support(A): {support_A}, Confidence: {confidence:.2f}")


Rule: {'2', '3'} -> {'4'}, Support(L): 3, Support(A): 3, Confidence: 1.00
Rule: {'2', '4'} -> {'3'}, Support(L): 3, Support(A): 3, Confidence: 1.00
Rule: {'3', '4'} -> {'2'}, Support(L): 3, Support(A): 3, Confidence: 1.00
Rule: {'3'} -> {'2'}, Support(L): 3, Support(A): 4, Confidence: 0.75
Rule: {'3'} -> {'4'}, Support(L): 3, Support(A): 4, Confidence: 0.75
Rule: {'3'} -> {'2', '4'}, Support(L): 3, Support(A): 4, Confidence: 0.75
Rule: {'2'} -> {'1'}, Support(L): 5, Support(A): 8, Confidence: 0.62
Rule: {'4'} -> {'1'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'5'} -> {'1'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'4'} -> {'2'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'4'} -> {'3'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'5'} -> {'4'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'4'} -> {'5'}, Support(L): 3, Support(A): 5, Confidence: 0.60
Rule: {'4'} -> {'2', '3'}, Support(L): 3, Support(A): 5, Confidence: 0.60
