In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import the required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth

In [3]:
# Create a Spark session
spark = SparkSession.builder \
    .master("local") \
    .appName("Frequent Itemset") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spark.read.options(header = 'True', delimiter = ';').csv("/content/sample_data/market-basket.csv")
df.show()

+------+--------------------+--------+----------------+-----+----------+--------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|       Country|
+------+--------------------+--------+----------------+-----+----------+--------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|SET 7 BABUSHKA NE...|       2|01.12.2010 08:26| 7,65|     17850|United Kingdom|
|536365|GLASS STAR FROSTE...|       6|01.12.2010 08:26| 4,25|     17850|United Kingdom|
|536366|HAND WARMER UNION...|       6|01.12.2010 08:28| 1,85|     17850|United Kingdom|
|536366|HAND WARMER RED P...|   

In [4]:
# Only take two columns for the FP-Growth that have significance effect to the prediction: 
#BillNo because we want to know what items bought in each transaction and Itemname because we want to know the details of the items bought in each transaction
de = df.select("BillNo", "Itemname")
de.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
|536365|CREAM CUPID HEART...|
|536365|KNITTED UNION FLA...|
|536365|RED WOOLLY HOTTIE...|
|536365|SET 7 BABUSHKA NE...|
|536365|GLASS STAR FROSTE...|
|536366|HAND WARMER UNION...|
|536366|HAND WARMER RED P...|
|536367|ASSORTED COLOUR B...|
|536367|POPPY'S PLAYHOUSE...|
|536367|POPPY'S PLAYHOUSE...|
|536367|FELTCRAFT PRINCES...|
|536367|IVORY KNITTED MUG...|
|536367|BOX OF 6 ASSORTED...|
|536367|BOX OF VINTAGE JI...|
|536367|BOX OF VINTAGE AL...|
|536367|HOME BUILDING BLO...|
|536367|LOVE BUILDING BLO...|
|536367|RECIPE BOX WITH M...|
+------+--------------------+
only showing top 20 rows



In [5]:
# Group by the BillNo, so every items bought in a transaction will be gathered togetherin a row of data
dh = de.groupBy("BillNo").agg(collect_list('Itemname').alias('Itemname'))
dh.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|[WHITE HANGING HE...|
|536366|[HAND WARMER UNIO...|
|536367|[ASSORTED COLOUR ...|
|536368|[JAM MAKING SET W...|
|536369|[BATH BUILDING BL...|
|536370|[ALARM CLOCK BAKE...|
|536371|[PAPER CHAIN KIT ...|
|536372|[HAND WARMER RED ...|
|536373|[WHITE HANGING HE...|
|536374|[VICTORIAN SEWING...|
|536375|[WHITE HANGING HE...|
|536376|[HOT WATER BOTTLE...|
|536377|[HAND WARMER RED ...|
|536378|[JUMBO BAG PINK P...|
|536380|[JAM MAKING SET P...|
|536381|[RETROSPOT TEA SE...|
|536382|[INFLATABLE POLIT...|
|536384|[WOOD BLACK BOARD...|
|536385|[SET 3 WICKER OVA...|
|536386|[WHITE WIRE EGG H...|
+------+--------------------+
only showing top 20 rows



In [9]:
# List of minSupport and minConfidence values
min_support= [0.1, 0.5, 0.6, 0.8]
min_confidence = [0.1, 0.6, 0.8]

In [10]:
# The string should be converted to array
dh_new = dh.withColumn("Itemname", array(dh["Itemname"]))
dh_new.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|[[WHITE HANGING H...|
|536366|[[HAND WARMER UNI...|
|536367|[[ASSORTED COLOUR...|
|536368|[[JAM MAKING SET ...|
|536369|[[BATH BUILDING B...|
|536370|[[ALARM CLOCK BAK...|
|536371|[[PAPER CHAIN KIT...|
|536372|[[HAND WARMER RED...|
|536373|[[WHITE HANGING H...|
|536374|[[VICTORIAN SEWIN...|
|536375|[[WHITE HANGING H...|
|536376|[[HOT WATER BOTTL...|
|536377|[[HAND WARMER RED...|
|536378|[[JUMBO BAG PINK ...|
|536380|[[JAM MAKING SET ...|
|536381|[[RETROSPOT TEA S...|
|536382|[[INFLATABLE POLI...|
|536384|[[WOOD BLACK BOAR...|
|536385|[[SET 3 WICKER OV...|
|536386|[[WHITE WIRE EGG ...|
+------+--------------------+
only showing top 20 rows



In [13]:
# Loop for different minSupport and minConfidence value
for x in min_support:
  for y in min_confidence:
    print(f"minSupport: {x}, minConfidence: {y}")
    fpGrowth = FPGrowth(itemsCol="Itemname", minSupport=x, minConfidence=y)
    model = fpGrowth.fit(dh_new)

    # Display frequent itemsets.
    model.freqItemsets.show()

    # Display generated association rules.
    #model.associationRules.show()

    # transform examines the input items against all the association rules and summarize the
    # consequents as prediction
    #model.transform(dh_new).show()

    # Frequent Itemset
    #model.freqItemsets.show()
    # Display generated association rules.
    model.associationRules.show(20,False)

minSupport: 0.1, minConfidence: 0.1
+-----+----+
|items|freq|
+-----+----+
| [[]]| 113|
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

minSupport: 0.1, minConfidence: 0.6
+-----+----+
|items|freq|
+-----+----+
| [[]]| 113|
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

minSupport: 0.1, minConfidence: 0.8
+-----+----+
|items|freq|
+-----+----+
| [[]]| 113|
+-----+----+

+----------+----------+----------+----+-------+
|antecedent|consequent|confidence|lift|support|
+----------+----------+----------+----+-------+
+----------+----------+----------+----+-------+

minSupport: 0.5, minConfidence: 0.1
+-----+----+
|items|freq|
+-----+----+
+-----+----+

+----------+----------+---

**Conclusion**
In conclusion, the FP-Growth program hasn't worked well yet.