# **Apriori Algorithm: Execution Time vs. Number of Nodes**

In [None]:
# Install PySpark into the notebook runtime; --quiet suppresses pip's progress output.
!pip install --quiet pyspark

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset path used by the later
# cells lives under this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
from pyspark import SparkContext, SparkConf
import time
def measure_execution_time(input_data_path, min_support_ratio, num_nodes):
    """Run the Apriori job on a fresh local SparkContext and time it.

    Args:
        input_data_path: Path of the transaction file handed to the Apriori run.
        min_support_ratio: Minimum support expressed as a fraction of the
            number of transactions.
        num_nodes: Value substituted into the ``local[N]`` Spark master URL,
            i.e. the number of local worker threads Spark may use.

    Returns:
        Elapsed wall-clock seconds spent inside ``apriori_parallel_algorithm``.
        Creating and stopping the SparkContext is not included.
    """
    # Configure Spark with the specified number of nodes/cores
    conf = SparkConf().setAppName("MR-Apriori").setMaster(f"local[{num_nodes}]")
    sc = SparkContext(conf=conf)

    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        apriori_parallel_algorithm(sc, input_data_path, min_support_ratio)
        return time.perf_counter() - start_time
    finally:
        # Always stop the context: only one SparkContext may be active at a
        # time, so leaking it after an error would break every later run.
        sc.stop()

In [None]:
def generate_candidate_itemsets_from_frequent_sets(frequent_sets, k):
    """Build candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined when their first ``k - 2`` items agree once the
    items of each set are put in sorted order. Sorting matters because the
    iteration order of a Python set is arbitrary; comparing unsorted prefixes
    silently drops valid candidates and can emit the same candidate twice.

    Args:
        frequent_sets: List of frequent itemsets (sets of mutually comparable
            items) from the previous level.
        k: Size of the candidates to generate.

    Returns:
        List of distinct candidate itemsets (each the union of the two joined
        sets), ordered by the sorted form of the itemsets that produced them.
    """
    prefix_len = k - 2

    # Pair each itemset with its canonical sorted tuple and order the pairs by
    # that tuple so the join result is deterministic.
    ordered = sorted(
        ((tuple(sorted(itemset)), itemset) for itemset in frequent_sets),
        key=lambda pair: pair[0],
    )

    candidates = []
    seen = set()
    for i, (items1, set1) in enumerate(ordered):
        for items2, set2 in ordered[i + 1:]:
            # Join itemsets only if they share the first k-2 (sorted) items
            if items1[:prefix_len] != items2[:prefix_len]:
                continue
            candidate = set1 | set2
            fingerprint = frozenset(candidate)
            # Guard against emitting the same candidate more than once
            if fingerprint not in seen:
                seen.add(fingerprint)
                candidates.append(candidate)
    return candidates

In [None]:
def find_frequent_sets_in_transactions(sc, candidate_sets, transaction_data, min_support_threshold):
    """Count the support of each candidate in parallel and keep the frequent ones.

    Args:
        sc: Active SparkContext used to distribute the candidates.
        candidate_sets: List of candidate itemsets (sets) to evaluate.
        transaction_data: Re-iterable collection of transactions (sets) held
            on the driver.
        min_support_threshold: Minimum number of transactions that must
            contain an itemset for it to be kept.

    Returns:
        List of ``(itemset, support_count)`` tuples for every candidate whose
        support count is at least ``min_support_threshold``.
    """
    if not candidate_sets:
        return []

    # Ship the transactions to the workers once as a broadcast variable
    # instead of capturing the whole list in the task closure, which would
    # re-serialize it with every task.
    transactions_bc = sc.broadcast(transaction_data)

    # Calculate support for each candidate
    def calculate_itemset_support(itemset):
        support_count = sum(
            1 for transaction in transactions_bc.value if itemset.issubset(transaction)
        )
        return itemset, support_count

    try:
        # Use parallel processing to find frequent sets
        return (
            sc.parallelize(candidate_sets)
            .map(calculate_itemset_support)
            .filter(lambda pair: pair[1] >= min_support_threshold)
            .collect()
        )
    finally:
        # The broadcast is only needed for this one job; release it so that
        # repeated calls (one per Apriori level) do not accumulate copies.
        transactions_bc.destroy()


In [None]:
def apriori_parallel_algorithm(sc, input_data_path, min_support_ratio):
    """Mine frequent itemsets from a transaction file with level-wise Apriori.

    Each non-blank line of the input is parsed as whitespace-separated integer
    item ids forming one transaction. Candidates and frequent sets of every
    level, and the total running time, are printed as the search progresses.

    Args:
        sc: Active SparkContext.
        input_data_path: Path of the transaction file.
        min_support_ratio: Minimum support as a fraction of the number of
            transactions.

    Returns:
        List of ``(itemset, support_count)`` tuples for every frequent itemset
        found, across all itemset sizes.
    """
    # Start timing
    start_time = time.time()

    # Load and parse the dataset. Blank lines are skipped so they do not turn
    # into empty transactions that inflate the support threshold. The parsed
    # RDD is cached because it is consumed by two separate actions below.
    transaction_rdd = (
        sc.textFile(input_data_path)
        .filter(lambda line: line.strip())
        .map(lambda line: set(map(int, line.split())))
        .cache()
    )
    try:
        # Bring all transactions to the driver; their count gives the
        # absolute support threshold without a separate count() job.
        transactions = transaction_rdd.collect()
        min_support_threshold = len(transactions) * min_support_ratio

        # Initial candidate 1-itemsets
        candidate_sets = (
            transaction_rdd.flatMap(lambda x: x).distinct().map(lambda x: {x}).collect()
        )
    finally:
        transaction_rdd.unpersist()

    all_frequent_itemsets = []
    k = 1

    # Level-wise search: stop as soon as a level yields no candidates
    while candidate_sets:
        print(f"Candidate Sets C{k}: {candidate_sets}")
        frequent_sets_k = find_frequent_sets_in_transactions(
            sc, candidate_sets, transactions, min_support_threshold
        )
        print(f"Frequent Sets F{k}: {frequent_sets_k}")
        all_frequent_itemsets.extend(frequent_sets_k)
        k += 1
        candidate_sets = generate_candidate_itemsets_from_frequent_sets(
            [itemset for itemset, _ in frequent_sets_k], k
        )

    print("Total Time Taken:", time.time() - start_time)
    return all_frequent_itemsets

In [None]:
# Benchmark run: time the Apriori job on the mushroom dataset with 3 local workers.
mushroom = "/content/gdrive/MyDrive/Big data/mushroom.dat"


num_nodes = 3  # substituted into the Spark master URL as local[3]
# A ratio of 1 sets the threshold to the full transaction count, so only
# itemsets contained in every transaction qualify as frequent.
min_support_ratio = 1

elapsed_time = measure_execution_time(mushroom, min_support_ratio, num_nodes)

# Display the result
print(f"Execution time with {num_nodes} nodes: {elapsed_time:.2f} seconds")

Candidate Sets C1: [{34}, {36}, {38}, {40}, {52}, {54}, {76}, {86}, {90}, {98}, {2}, {14}, {26}, {108}, {114}, {4}, {10}, {16}, {24}, {28}, {94}, {42}, {110}, {44}, {64}, {6}, {56}, {116}, {100}, {60}, {68}, {78}, {46}, {66}, {70}, {18}, {30}, {80}, {58}, {72}, {102}, {112}, {118}, {48}, {20}, {96}, {22}, {32}, {82}, {12}, {8}, {50}, {88}, {104}, {74}, {84}, {92}, {106}, {62}, {1}, {3}, {9}, {13}, {23}, {25}, {59}, {63}, {67}, {85}, {93}, {107}, {113}, {39}, {55}, {99}, {15}, {27}, {41}, {115}, {37}, {53}, {109}, {43}, {11}, {5}, {111}, {57}, {65}, {117}, {45}, {77}, {69}, {17}, {29}, {61}, {79}, {95}, {101}, {71}, {19}, {47}, {91}, {31}, {119}, {103}, {21}, {7}, {81}, {49}, {35}, {73}, {83}, {87}, {51}, {33}, {97}, {105}, {75}, {89}]
Frequent Sets F1: [({85}, 8124)]
Total Time Taken: 2.9113292694091797
Execution time with 3 nodes: 2.92 seconds


In [None]:
# Benchmark run: same dataset and support ratio as before, with 2 local workers.
mushroom = "/content/gdrive/MyDrive/Big data/mushroom.dat"


num_nodes = 2  # Replace with the desired number of nodes
min_support_ratio = 1  # Example minimum support ratio (itemset must be in every transaction)


elapsed_time = measure_execution_time(mushroom, min_support_ratio, num_nodes)

# Display the result
print(f"Execution time with {num_nodes} nodes: {elapsed_time:.2f} seconds")

Candidate Sets C1: [{34}, {36}, {38}, {40}, {52}, {54}, {76}, {86}, {90}, {98}, {2}, {14}, {26}, {108}, {114}, {4}, {10}, {16}, {24}, {28}, {94}, {42}, {110}, {44}, {64}, {6}, {56}, {116}, {100}, {60}, {68}, {78}, {46}, {66}, {70}, {18}, {30}, {80}, {58}, {72}, {102}, {112}, {118}, {48}, {20}, {96}, {22}, {32}, {82}, {12}, {8}, {50}, {88}, {104}, {74}, {84}, {92}, {106}, {62}, {1}, {3}, {9}, {13}, {23}, {25}, {59}, {63}, {67}, {85}, {93}, {107}, {113}, {39}, {55}, {99}, {15}, {27}, {41}, {115}, {37}, {53}, {109}, {43}, {11}, {5}, {111}, {57}, {65}, {117}, {45}, {77}, {69}, {17}, {29}, {61}, {79}, {95}, {101}, {71}, {19}, {47}, {91}, {31}, {119}, {103}, {21}, {7}, {81}, {49}, {35}, {73}, {83}, {87}, {51}, {33}, {97}, {105}, {75}, {89}]
Frequent Sets F1: [({85}, 8124)]
Total Time Taken: 5.409029245376587
Execution time with 2 nodes: 5.43 seconds


In [None]:
# Benchmark run: single local worker with a lower support ratio.
mushroom = "/content/gdrive/MyDrive/Big data/mushroom.dat"


num_nodes = 1  # Number of nodes (substituted into the Spark master URL as local[1])
# An itemset must occur in at least 80% of the transactions to be frequent.
min_support_ratio = 0.80


elapsed_time = measure_execution_time(mushroom, min_support_ratio, num_nodes)

# Display the result
print(f"Execution time with {num_nodes} nodes: {elapsed_time:.2f} seconds")

Candidate Sets C1: [{1}, {3}, {9}, {13}, {23}, {25}, {34}, {36}, {38}, {40}, {52}, {54}, {59}, {63}, {67}, {76}, {85}, {86}, {90}, {93}, {98}, {107}, {113}, {2}, {14}, {26}, {39}, {55}, {99}, {108}, {114}, {4}, {15}, {27}, {41}, {115}, {10}, {16}, {24}, {28}, {37}, {53}, {94}, {109}, {42}, {43}, {110}, {44}, {11}, {64}, {5}, {111}, {6}, {56}, {116}, {57}, {65}, {117}, {100}, {60}, {45}, {68}, {77}, {69}, {78}, {46}, {17}, {29}, {61}, {66}, {70}, {79}, {95}, {101}, {71}, {18}, {30}, {80}, {19}, {47}, {58}, {72}, {91}, {102}, {112}, {118}, {31}, {48}, {20}, {96}, {119}, {103}, {21}, {7}, {81}, {22}, {32}, {82}, {12}, {8}, {49}, {35}, {50}, {73}, {83}, {87}, {51}, {88}, {104}, {33}, {74}, {84}, {92}, {97}, {105}, {106}, {62}, {75}, {89}]
Frequent Sets F1: [({34}, 7914), ({36}, 6812), ({85}, 8124), ({86}, 7924), ({90}, 7488)]
Candidate Sets C2: [{34, 36}, {34, 85}, {34, 86}, {34, 90}, {36, 85}, {36, 86}, {90, 36}, {85, 86}, {90, 85}, {90, 86}]
Frequent Sets F2: [({34, 36}, 6602), ({34, 85}