# Discovery of Frequent Itemsets and Association Rules

## Imports

In [1]:
import findspark
findspark.init()

from pyspark import *
from pyspark.sql import *

import os
from IPython.display import display, HTML
import sys
import time
from itertools import chain, combinations

from utils import read_data
from apriori import get_singletons, construct_itemsets_apriori, filter_itemsets_apriori
from association_rules import get_subsets, get_association_rules, pretty_print_association_rules

In [2]:
# Reload edited project modules automatically before each cell is executed,
# so changes to the imported local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Spark launch configuration. These variables are read when the Spark JVM is
# launched, so this cell has to run before the SparkSession is created below.
# https://graphframes.github.io/graphframes/docs/_site/quick-start.html
# https://stackoverflow.com/questions/65011599/how-to-start-graphframes-on-spark-on-pyspark-on-juypter-on-docker
# Ask spark-submit to resolve the GraphFrames package at startup.
# NOTE(review): no visible cell in this notebook uses GraphFrames — confirm it is still needed.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell'
# Make the Spark Python workers use the same interpreter as this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable

In [4]:
# Create (or reuse) the session for this notebook: application name 'hw2',
# running Spark locally with two worker threads.
spark = (
    SparkSession.builder
    .appName('hw2')
    .master("local[2]")
    .getOrCreate()
)

21/11/18 22:28:59 WARN Utils: Your hostname, mark-machine resolves to a loopback address: 127.0.1.1; using 192.168.0.102 instead (on interface wlp8s0)
21/11/18 22:28:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/mark/.ivy2/cache
The jars for the packages stored in: /home/mark/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3863408f-c428-4368-8a52-c7fdd92e09fa;1.0
	confs: [default]
	found graphframes#graphframes;0.8.1-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 108ms :: artifacts dl 3ms
	:: modules in use:
	graphframes#graphframes;0.8.1-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-------------------------------------

## Load data

In [5]:
# Location of the transaction dataset, relative to the notebook's working directory.
data_path = "data/T10I4D100K.dat"

In [6]:
# Load the dataset; read_data returns three RDDs: the baskets, the items,
# and (item, count) pairs (see the examples printed in the next cell).
data_rdd, items_rdd, items_counts_rdd = read_data(data_path=data_path, spark=spark)
# Driver-side copy of all baskets as a Python list; it is passed to
# filter_itemsets_apriori further down. This holds the whole dataset in driver memory.
data_rdd_c = data_rdd.collect()

                                                                                

In [7]:
# One sample basket, to show what a record looks like.
print("An example basket:", data_rdd.take(1)[0], sep="\n")

# Dataset size; n_baskets is reused below to derive the support threshold.
n_baskets = data_rdd.count()
print(f"n_baskets = {n_baskets}")

n_items = items_rdd.count()
print(f"n_items = {n_items}")

# A handful of (item, count) pairs.
print("Example item counts:", items_counts_rdd.take(10), sep="\n")

An example basket:
{448, 834, 164, 775, 328, 687, 240, 368, 274, 561, 52, 630, 730, 825, 538, 25}
n_baskets = 100000


[Stage 3:>                                                          (0 + 2) / 2]

n_items = 870
Example item counts:
[(448, 1370), (834, 1373), (164, 744), (328, 663), (240, 1399), (368, 7828), (274, 2628), (52, 1983), (630, 1523), (538, 3982)]


                                                                                

## Apriori algorithm for finding frequent itemsets with a certain support threshold

In [8]:
# Support threshold: 1% of the number of baskets (1000 for the 100000 baskets loaded above).
s = int(n_baskets * 0.01)
print(f"Support threshold = {s}")
# Set to True to load intermediate results (candidate itemsets and frequent
# itemsets) from a checkpoint instead of recomputing them.
from_ckpt = False

# First Apriori pass: the frequent single-item itemsets, given the item counts and threshold s.
itemsets_frequent_rdd_1 = get_singletons(items_counts_rdd=items_counts_rdd, s=s)
# Note: the value printed after "lenght=" is the NUMBER of singletons found, not an itemset length.
print(f"number of singletons, first frequent itemset lenght={itemsets_frequent_rdd_1.count()} "
      f"with support={s}")
print("Example singletons with their support")
print(itemsets_frequent_rdd_1.take(2))

Support threshold = 1000
number of singletons, first frequent itemset lenght=375 with support=1000
Example singletons with their support
[({448}, 1370), ({834}, 1373)]


In [9]:
# Propose candidate itemsets of length 2 from the frequent singletons.
k = 2
candidates_2 = construct_itemsets_apriori(
    k=k,
    itemsets_frequent_rdd=itemsets_frequent_rdd_1,
    spark=spark,
    from_ckpt=from_ckpt,
)
print(f"proposed candidate itemsets: number={candidates_2.count()} of lenght={k}")
print("example candidates:")
print(candidates_2.take(2))

                                                                                

proposing n=70125 candidates (n_pruned=0)


[Stage 21:>                                                         (0 + 2) / 2]

proposed candidate itemsets: number=70125 of lenght=2
example candidates:
[(0, {413, 494}), (1, {978, 874})]


                                                                                

In [10]:
# this cell can take around 10 minutes
# Filter the length-2 candidates against the collected baskets with threshold s
# (the filtering itself is implemented in apriori.py).
k = 2
new_itemsets_frequent_rdd_2 = filter_itemsets_apriori(
    candidates_rdd=candidates_2,
    k=k,
    s=s,
    data_rdd_c=data_rdd_c,
    spark=spark,
    from_ckpt=from_ckpt,
)

# Running collection of frequent itemsets: singletons plus the new pairs.
itemsets_frequent_rdd_2 = itemsets_frequent_rdd_1.union(new_itemsets_frequent_rdd_2)
print(
    f"new frequent itemsets: number={itemsets_frequent_rdd_2.count()}, "
    f"after adding n={new_itemsets_frequent_rdd_2.count()} filtered candidates of lenght k={k}"
)
print("Example new frequent items with their support")
print(new_itemsets_frequent_rdd_2.take(2))

Staring filtering...


                                                                                

k=2, t=513.06393886 seconds, n=9
new frequent itemsets: number=384, after adding n=9 filtered candidates of lenght k=2
Example new frequent items with their support
[({368, 829}, 1194), ({722, 390}, 1042)]


In [11]:
# Propose candidate itemsets of length 3 from everything found frequent so far
# (singletons and pairs).
k = 3
candidates_3 = construct_itemsets_apriori(
    k=k,
    itemsets_frequent_rdd=itemsets_frequent_rdd_2,
    spark=spark,
    from_ckpt=from_ckpt,
)
print(f"proposed candidate itemsets: number={candidates_3.count()} of lenght={k}")
print("example candidates:")
print(candidates_3.take(2))

21/11/18 22:37:56 WARN TaskSetManager: Stage 31 contains a task of very large size (1007 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

proposing n=1 candidates (n_pruned=3351)
proposed candidate itemsets: number=1 of lenght=3
example candidates:
[(0, {704, 825, 39})]


In [12]:
# Filter the length-3 candidates against the collected baskets with threshold s.
k = 3
new_itemsets_frequent_rdd_3 = filter_itemsets_apriori(
    candidates_rdd=candidates_3,
    k=k,
    s=s,
    data_rdd_c=data_rdd_c,
    spark=spark,
    from_ckpt=from_ckpt,
)

# Running collection of frequent itemsets: lengths 1 and 2 plus the new triples.
itemsets_frequent_rdd_3 = itemsets_frequent_rdd_2.union(new_itemsets_frequent_rdd_3)
print(
    f"new frequent itemsets: number={itemsets_frequent_rdd_3.count()}, "
    f"after adding n={new_itemsets_frequent_rdd_3.count()} filtered candidates of lenght k={k}"
)
print("Example new frequent items with their support")
print(new_itemsets_frequent_rdd_3.take(2))

Staring filtering...
k=3, t=0.50488234 seconds, n=1
new frequent itemsets: number=385, after adding n=1 filtered candidates of lenght k=3
Example new frequent items with their support
[({704, 825, 39}, 1035)]


In [13]:
# Propose candidate itemsets of length 4. When no candidates are proposed there
# can be no frequent itemsets of this length, so the search ends here and
# itemsets_frequent_rdd_3 is the final result used in the next section.
k = 4
candidates_4 = construct_itemsets_apriori(
    k=k,
    itemsets_frequent_rdd=itemsets_frequent_rdd_3,
    spark=spark,
    # Use the notebook-level checkpoint flag, like every other Apriori step,
    # instead of a hard-coded False that would ignore a changed setting.
    from_ckpt=from_ckpt,
)
print(f"proposed candidate itemsets: number={candidates_4.count()} of lenght={k}")
print("example candidates:")
print(candidates_4.take(2))

21/11/18 22:38:00 WARN TaskSetManager: Stage 54 contains a task of very large size (1007 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

proposing n=0 candidates (n_pruned=372)
proposed candidate itemsets: number=0 of lenght=4
example candidates:
[]


## Mining association rules from frequent itemsets with a certain support and confidence

In [14]:
# Total number of frequent itemsets (all lengths) that rule mining will start from.
print(f"number of frequent itemsets = {itemsets_frequent_rdd_3.count()}")

number of frequent itemsets = 385


In [15]:
# Preview up to 10 itemsets drawn from a ~10% random sample of the frequent itemsets.
# The fixed seed makes the preview identical on every "Restart & Run All";
# without it each run displays a different sample.
itemsets_frequent_rdd_3.sample(withReplacement=False, fraction=0.1, seed=42).take(10)

[({964}, 1518),
 ({708}, 1090),
 ({214}, 1893),
 ({208}, 1483),
 ({266}, 1022),
 ({458}, 1124),
 ({888}, 3686),
 ({334}, 2146),
 ({638}, 2288),
 ({790}, 1094)]

In [16]:
# confidence threshold is 50%
c_thresh = 0.5
# Bring the frequent itemsets (with their supports) to the driver as a plain list.
itemsets_frequent = itemsets_frequent_rdd_3.collect()

# Derive the association rules from the frequent itemsets using c_thresh, then
# print them grouped by frequent itemset (both helpers live in association_rules.py).
association_rules = get_association_rules(itemsets_frequent=itemsets_frequent, c_thresh=c_thresh)
pretty_print_association_rules(association_rules)

frequent itemset: {368, 829}

frequent itemset: {722, 390}

frequent itemset: {789, 829}

frequent itemset: {704, 825}
	association rule: {704} -> {825} with confidence=0.6143

frequent itemset: {704, 39}
	association rule: {704} -> {39} with confidence=0.6171

frequent itemset: {227, 390}
	association rule: {227} -> {390} with confidence=0.5770

frequent itemset: {368, 682}

frequent itemset: {217, 346}

frequent itemset: {825, 39}

frequent itemset: {704, 825, 39}
	association rule: {704} -> {825, 39} with confidence=0.5769
	association rule: {704, 825} -> {39} with confidence=0.9392
	association rule: {704, 39} -> {825} with confidence=0.9350
	association rule: {825, 39} -> {704} with confidence=0.8719

