In [1]:
# https://notebook.community/donaghhorgan/COMP9033/labs/10%20-%20Association%20rule%20mining
import itertools
import pandas as pd
import pyspark

from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import split

In [2]:
# Let's start by importing the packages we'll need. As usual, 
# we'll import pandas for exploratory analysis, but this week we're also going to use pyspark, 
# a Python package that wraps Apache Spark and makes its functionality available in Python. 
# Spark also supports frequent itemset generation using the FPGrowth algorithm, 
# so we'll import this functionality too.

# First, let's initialise a SparkContext object, which will represent our connection to the Spark cluster.
# To do this, we must first specify the URL of the master node to connect to.
# As we're only running this notebook for demonstration purposes,
# we can just run the cluster locally.
#
# Only one SparkContext can be active at a time, so calling the constructor directly raises an
# error if this cell is run a second time. SparkContext.getOrCreate returns the active context
# when there is one; the configuration below is only used when a new context has to be created.

sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

21/07/23 05:53:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Note: By specifying master='local[*]', we are instructing Spark to run with as many worker threads 
#     as there are logical cores available on the host machine. Alternatively, 
#     we could directly specify the number of threads, e.g. master='local[4]' to run four threads. 
#     However, we need to make sure to specify at least two threads, so that there is one available for 
#     resource management and at least one available for data processing.

# Spark reads files into data frames through a SparkSession. It replaces SQLContext, which has
# been deprecated since Spark 3.0, and its `read` attribute is used in exactly the same way.
# getOrCreate() attaches to the SparkContext that is already running (or returns the existing
# session), so this cell can be re-run safely.

sql = pyspark.sql.SparkSession.builder.getOrCreate()

In [4]:
# Transactions file to analyse; a relative path, so it is resolved from the working directory.
path = "groceries.csv"

In [5]:
# Load the file as plain text. The result is a Spark DataFrame (similar to a pandas DataFrame)
# with one row per line of the file, held in a single string column named 'value'.

df = sql.read.text(paths=path)

In [6]:
# A Spark data frame's show method plays the role of head in pandas: it prints the first few rows.
# Here we print five rows and switch off truncation so that long lines are shown in full.

df.show(n=5, truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------------------------------------------------------+
|value                                                              |
+-------------------------------------------------------------------+
|citrus fruit,semi-finished bread,margarine,ready soups             |
|tropical fruit,yogurt,coffee                                       |
|whole milk                                                         |
|pip fruit,yogurt,cream cheese,meat spreads                         |
|other vegetables,whole milk,condensed milk,long life bakery product|
+-------------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [7]:
# take returns the requested number of leading rows as a list of Row objects; here, just the first.
df.take(num=1)

[Row(value='citrus fruit,semi-finished bread,margarine,ready soups')]

In [8]:
# Split each line of the 'value' column on commas to turn it into a list of items, and label
# the result 'items' (the column name FPGrowth looks for by default).
# The frame is rebuilt from the file rather than from the current `df`, so this cell can be
# re-run: once `df` has been replaced it no longer has a 'value' column to split.

df = sql.read.text(path).select(split('value', ',').alias('items'))

df.show(truncate=False)

+----------------------------------------------------------------------------------------------+
|items                                                                                         |
+----------------------------------------------------------------------------------------------+
|[citrus fruit, semi-finished bread, margarine, ready soups]                                   |
|[tropical fruit, yogurt, coffee]                                                              |
|[whole milk]                                                                                  |
|[pip fruit, yogurt, cream cheese, meat spreads]                                               |
|[other vegetables, whole milk, condensed milk, long life bakery product]                      |
|[whole milk, butter, yogurt, rice, abrasive cleaner]                                          |
|[rolls/buns]                                                                                  |
|[other vegetables, UHT-milk, 

In [9]:

# Association rule mining
#
# Next, let's mine the transaction data for interesting dependencies between itemsets.
# Several algorithms can find frequently occurring itemsets (e.g. Apriori, Eclat); Spark
# supports the FPGrowth algorithm directly. It is controlled by two thresholds:
#
#     minSupport: the minimum support an itemset needs, used to filter out itemsets that
#                 don't occur frequently enough.
#     minConfidence: the minimum confidence a rule needs, used to filter out association
#                    rules that aren't strong enough.
#
# We set the minimum support to 1% and the minimum confidence to 10%, then train a model with
# the fit method of the FPGrowth object (in a similar way to using scikit-learn).

algorithm = FPGrowth(
    minSupport=0.01,
    minConfidence=0.1,
)

model = algorithm.fit(df)

[Stage 3:>                                                          (0 + 1) / 1]                                                                                

In [10]:
# The frequent itemsets found by the model are available through its freqItemsets attribute,
# which is just another data frame that we can call show on. No sorting is applied here, so the
# ten rows printed are not necessarily the most frequent ones.

model.freqItemsets.show(n=10, truncate=False)



[Stage 7:>                                                          (0 + 1) / 1]

+------------------------------------+----+
|items                               |freq|
+------------------------------------+----+
|[canned vegetables]                 |106 |
|[pork]                              |567 |
|[pork, rolls/buns]                  |111 |
|[pork, other vegetables]            |213 |
|[pork, other vegetables, whole milk]|100 |
|[pork, soda]                        |117 |
|[pork, root vegetables]             |134 |
|[pork, whole milk]                  |218 |
|[frozen fish]                       |115 |
|[roll products]                     |101 |
+------------------------------------+----+
only showing top 10 rows



                                                                                

In [11]:
# To print the ten most frequent itemsets, sort the data frame by 'freq' in descending order
# before calling show:

(
    model.freqItemsets
    .sort('freq', ascending=False)
    .show(n=10, truncate=False)
)



+------------------+----+
|items             |freq|
+------------------+----+
|[whole milk]      |2513|
|[other vegetables]|1903|
|[rolls/buns]      |1809|
|[soda]            |1715|
|[yogurt]          |1372|
|[bottled water]   |1087|
|[root vegetables] |1072|
|[tropical fruit]  |1032|
|[shopping bags]   |969 |
|[sausage]         |924 |
+------------------+----+
only showing top 10 rows



In [12]:
# We can determine the total number of frequent itemsets found by counting the rows in the data
# frame via its count method:

model.freqItemsets.count()



333

In [13]:
# As can be seen, the FPGrowth algorithm has identified 333 frequent itemsets in the transaction history.
# Association rules are available through the model's associationRules attribute, which is a further
# data frame that we can call show on. As above, we sort it first, this time by the computed
# confidence in descending order, so that the highest-confidence rules are printed first.

(
    model.associationRules
    .sort('confidence', ascending=False)
    .show(n=10, truncate=False)
)



+---------------------------------+------------------+------------------+------------------+--------------------+
|antecedent                       |consequent        |confidence        |lift              |support             |
+---------------------------------+------------------+------------------+------------------+--------------------+
|[citrus fruit, root vegetables]  |[other vegetables]|0.5862068965517241|3.0296084222733612|0.010371123538383325|
|[tropical fruit, root vegetables]|[other vegetables]|0.5845410628019324|3.020999134344196 |0.012302999491611592|
|[curd, yogurt]                   |[whole milk]      |0.5823529411764706|2.27912502048173  |0.010066090493136757|
|[butter, other vegetables]       |[whole milk]      |0.5736040609137056|2.244884973770909 |0.011489578037620742|
|[tropical fruit, root vegetables]|[whole milk]      |0.5700483091787439|2.2309690094599866|0.011997966446365024|
|[root vegetables, yogurt]        |[whole milk]      |0.562992125984252 |2.2033535849801