In [16]:
!pip install pyspark
!pip install mlflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
# Import the required libraries
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

In [18]:
# Obtain the Spark session for this notebook, named "FPGrowthExample"
# (getOrCreate returns an existing session if one is already active)
spark = (
    SparkSession.builder
    .appName("FPGrowthExample")
    .getOrCreate()
)

In [25]:
# Toy transaction data: each record pairs a basket id ("A".."F")
# with the list of item labels found in that basket
data = list(zip(
    ["A", "B", "C", "D", "E", "F"],
    [
        ["1", "2", "3"],
        ["1", "2", "4"],
        ["1", "2", "4"],
        ["1", "3", "4"],
        ["2", "3", "4"],
        ["1", "2", "3", "4"],
    ],
))

# Two-column DataFrame: "id" and "items"
df = spark.createDataFrame(data, schema=["id", "items"])

In [26]:
# Configure the FP-Growth estimator: baskets are read from the "items" column,
# with a minimum support of 0.5 and a minimum confidence of 0.6
fpGrowth = FPGrowth(
    minSupport=0.5,
    minConfidence=0.6,
    itemsCol="items",
)

In [27]:
# Train on the sample transactions to obtain the fitted model
model = fpGrowth.fit(dataset=df)

In [28]:
# Frequent itemsets found by the fitted model, printed as a table
# (n=20 and truncate=True are the defaults of DataFrame.show, spelled out here)
freqItemsets = model.freqItemsets
freqItemsets.show(n=20, truncate=True)

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   5|
|   [1, 2]|   4|
|[1, 2, 4]|   3|
|   [1, 4]|   4|
|      [4]|   5|
|      [2]|   5|
|   [2, 4]|   4|
|      [3]|   4|
|   [3, 1]|   3|
|   [3, 2]|   3|
|   [3, 4]|   3|
+---------+----+



In [9]:
# Association rules derived by the fitted model, printed as a table
# (n=20 and truncate=True are the defaults of DataFrame.show, spelled out here)
assocRules = model.associationRules
assocRules.show(n=20, truncate=True)

+----------+----------+----------+------------------+------------------+
|antecedent|consequent|confidence|              lift|           support|
+----------+----------+----------+------------------+------------------+
|    [1, 4]|       [2]|      0.75|0.8999999999999999|               0.5|
|    [2, 4]|       [1]|      0.75|0.8999999999999999|               0.5|
|       [2]|       [1]|       0.8|              0.96|0.6666666666666666|
|       [2]|       [4]|       0.8|              0.96|0.6666666666666666|
|       [2]|       [3]|       0.6|               0.9|               0.5|
|       [3]|       [1]|      0.75|0.8999999999999999|               0.5|
|       [3]|       [2]|      0.75|0.8999999999999999|               0.5|
|       [3]|       [4]|      0.75|0.8999999999999999|               0.5|
|    [1, 2]|       [4]|      0.75|0.8999999999999999|               0.5|
|       [4]|       [1]|       0.8|              0.96|0.6666666666666666|
|       [4]|       [2]|       0.8|              0.9

In [10]:
# Apply the fitted model back onto the input baskets; the result carries an
# extra "prediction" column alongside the original "id" and "items" columns
predictions = model.transform(dataset=df)
predictions.show(n=20, truncate=True)

+---+------------+----------+
| id|       items|prediction|
+---+------------+----------+
|  A|   [1, 2, 3]|       [4]|
|  B|   [1, 2, 4]|       [3]|
|  C|   [1, 2, 4]|       [3]|
|  D|   [1, 3, 4]|       [2]|
|  E|   [2, 3, 4]|       [1]|
|  F|[1, 2, 3, 4]|        []|
+---+------------+----------+

