In [77]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler
from pyspark.sql.functions import udf, col, lit, array, array_contains, size
from pyspark.sql.types import StringType

In [63]:
# Create the Spark session used by every cell below, or reuse the one that
# is already running under the same builder configuration.
sparkSession = (
    SparkSession.builder
    .appName("FPGrowth")
    .getOrCreate()
)

In [64]:
# Sample file: it has no elements that could naturally constitute itemsets.
# The first line is read as the header and column types are inferred.
df = sparkSession.read.csv(
    "/content/drive/MyDrive/Colab Notebooks/data/grocery_store_data.csv",
    inferSchema=True,
    header=True,
)
df.printSchema()
df.show(5)

root
 |-- ItemID: integer (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- PricePerUnit: double (nullable = true)
 |-- TotalSales: double (nullable = true)

+------+--------+--------+--------+------------+------------------+
|ItemID|ItemName|Category|Quantity|PricePerUnit|        TotalSales|
+------+--------+--------+--------+------------+------------------+
|  1001|  Cheese|  Fruits|      15|        1.64|24.599999999999998|
|  1002|    Milk|  Bakery|      98|        7.02| 687.9599999999999|
|  1003|  Butter|    Meat|      44|       15.81|            695.64|
|  1004|   Bread|    Meat|      49|        14.3|             700.7|
|  1005|  Cheese|  Bakery|      75|       15.88|            1191.0|
+------+--------+--------+--------+------------+------------------+
only showing top 5 rows



In [65]:
# Bucket each numeric column into quantile-based bins (10 buckets requested);
# the bucket index of column X is written to a new column named Q_X.
numericColumns = ['Quantity', 'PricePerUnit', 'TotalSales']
discretizer = QuantileDiscretizer(
    numBuckets=10,
    inputCols=numericColumns,
    outputCols=['Q_' + name for name in numericColumns],
)
discretizerModel = discretizer.fit(df)
dfDiscrete = discretizerModel.transform(df)
dfDiscrete.show(5)


+------+--------+--------+--------+------------+------------------+----------+--------------+------------+
|ItemID|ItemName|Category|Quantity|PricePerUnit|        TotalSales|Q_Quantity|Q_PricePerUnit|Q_TotalSales|
+------+--------+--------+--------+------------+------------------+----------+--------------+------------+
|  1001|  Cheese|  Fruits|      15|        1.64|24.599999999999998|       1.0|           0.0|         0.0|
|  1002|    Milk|  Bakery|      98|        7.02| 687.9599999999999|       9.0|           3.0|         7.0|
|  1003|  Butter|    Meat|      44|       15.81|            695.64|       4.0|           7.0|         7.0|
|  1004|   Bread|    Meat|      49|        14.3|             700.7|       5.0|           7.0|         7.0|
|  1005|  Cheese|  Bakery|      75|       15.88|            1191.0|       7.0|           7.0|         9.0|
+------+--------+--------+--------+------------+------------------+----------+--------------+------------+
only showing top 5 rows



In [66]:
# Keep the two string columns plus the bucketed (Q_*) versions of the numeric
# columns; the raw numeric columns and ItemID are dropped here.
dfLabeled = dfDiscrete.select(
    'ItemName',
    'Category',
    'Q_Quantity',
    'Q_PricePerUnit',
    'Q_TotalSales',
)
dfLabeled.show(5)


+--------+--------+----------+--------------+------------+
|ItemName|Category|Q_Quantity|Q_PricePerUnit|Q_TotalSales|
+--------+--------+----------+--------------+------------+
|  Cheese|  Fruits|       1.0|           0.0|         0.0|
|    Milk|  Bakery|       9.0|           3.0|         7.0|
|  Butter|    Meat|       4.0|           7.0|         7.0|
|   Bread|    Meat|       5.0|           7.0|         7.0|
|  Cheese|  Bakery|       7.0|           7.0|         9.0|
+--------+--------+----------+--------------+------------+
only showing top 5 rows



In [67]:
def getLabeledValue(value, featureName):
  """Build the item label '<feature>_<value>' for one cell of the table.

  Args:
    value: Cell value. Floats are truncated to int before formatting
      (e.g. 1.0 -> '1'); anything else is formatted with str().
    featureName: Name of the column the value came from. A leading 'Q_'
      is dropped from the resulting label.

  Returns:
    The label string, e.g. 'Quantity_1' for (1.0, 'Q_Quantity') or
    'ItemName_Cheese' for ('Cheese', 'ItemName').
  """
  if isinstance(value, float):
    text = str(int(value))
  else:
    text = str(value)
  label = featureName + '_' + text
  if featureName.startswith('Q_'):
    # The label starts with the feature name, so cutting two characters
    # removes exactly the 'Q_' prefix.
    return label[2:]
  return label
#End getLabeledValue
# Wrap the labeling function as a Spark UDF that returns strings.
getLabeledValue_udf = udf(getLabeledValue, StringType())

# Relabel every column in ONE projection instead of calling withColumn in a
# loop: each withColumn adds another projection to the query plan, while a
# single select produces the same columns (same names, same order) at once.
# NOTE: this cell overwrites dfLabeled, so running it twice in a row would
# prefix the values twice; re-run the cell that builds dfLabeled first.
dfLabeled = dfLabeled.select(
    [getLabeledValue_udf(col(column), lit(column)).alias(column)
     for column in dfLabeled.columns]
)
dfLabeled.show(5)

+---------------+---------------+----------+--------------+------------+
|       ItemName|       Category|Q_Quantity|Q_PricePerUnit|Q_TotalSales|
+---------------+---------------+----------+--------------+------------+
|ItemName_Cheese|Category_Fruits|Quantity_1|PricePerUnit_0|TotalSales_0|
|  ItemName_Milk|Category_Bakery|Quantity_9|PricePerUnit_3|TotalSales_7|
|ItemName_Butter|  Category_Meat|Quantity_4|PricePerUnit_7|TotalSales_7|
| ItemName_Bread|  Category_Meat|Quantity_5|PricePerUnit_7|TotalSales_7|
|ItemName_Cheese|Category_Bakery|Quantity_7|PricePerUnit_7|TotalSales_9|
+---------------+---------------+----------+--------------+------------+
only showing top 5 rows



In [68]:
# Pack all labeled columns of a row into a single array column named 'Items';
# each row becomes one "transaction" for FPGrowth.
itemsColumn = array(*dfLabeled.columns).alias('Items')
dfItems = dfLabeled.select(itemsColumn)
dfItems.show(5, truncate=False)

+----------------------------------------------------------------------------+
|Items                                                                       |
+----------------------------------------------------------------------------+
|[ItemName_Cheese, Category_Fruits, Quantity_1, PricePerUnit_0, TotalSales_0]|
|[ItemName_Milk, Category_Bakery, Quantity_9, PricePerUnit_3, TotalSales_7]  |
|[ItemName_Butter, Category_Meat, Quantity_4, PricePerUnit_7, TotalSales_7]  |
|[ItemName_Bread, Category_Meat, Quantity_5, PricePerUnit_7, TotalSales_7]   |
|[ItemName_Cheese, Category_Bakery, Quantity_7, PricePerUnit_7, TotalSales_9]|
+----------------------------------------------------------------------------+
only showing top 5 rows



In [69]:
# Configure FP-Growth over the 'Items' array column with a 0.01 minimum
# support for frequent itemsets and a 0.01 minimum confidence for rules,
# then mine the transactions.
fpGrowthDefinition = FPGrowth(
    itemsCol='Items',
    minSupport=0.01,
    minConfidence=0.01,
)
fpGrowthModel = fpGrowthDefinition.fit(dfItems)

In [70]:
# show() only prints and returns None, so bind the rules DataFrame to `rules`
# first and display it as a separate step.
rules = fpGrowthModel.associationRules
rules.show(5, truncate=False)

+---------------------------------+-----------------+------------------+------------------+-------+
|antecedent                       |consequent       |confidence        |lift              |support|
+---------------------------------+-----------------+------------------+------------------+-------+
|[PricePerUnit_5, Category_Fruits]|[TotalSales_8]   |0.3448275862068966|3.4482758620689657|0.01   |
|[TotalSales_8, PricePerUnit_5]   |[Quantity_8]     |0.3448275862068966|3.5549235691432637|0.01   |
|[TotalSales_8, PricePerUnit_5]   |[Quantity_9]     |0.4482758620689655|4.229017566688354 |0.013  |
|[TotalSales_8, PricePerUnit_5]   |[Category_Fruits]|0.3448275862068966|1.2914890869172155|0.01   |
|[TotalSales_1, Category_Bakery]  |[PricePerUnit_0] |0.4230769230769231|4.3171114599686025|0.011  |
+---------------------------------+-----------------+------------------+------------------+-------+
only showing top 5 rows



In [79]:
# Rules that predict the lowest price bucket from a single item:
#   - lift > 1
#   - the consequent contains 'PricePerUnit_0'
#   - the antecedent has exactly one item
# sorted from highest to lowest lift.
rules = (
    fpGrowthModel.associationRules
    .where(col('lift') > 1)
    .where(array_contains(col('consequent'), 'PricePerUnit_0'))
    .where(size(col('antecedent')) == 1)
    .orderBy(col('lift').desc())
)

rules.show(truncate=False)

+-----------------+----------------+-------------------+------------------+-------+
|antecedent       |consequent      |confidence         |lift              |support|
+-----------------+----------------+-------------------+------------------+-------+
|[TotalSales_0]   |[PricePerUnit_0]|0.5204081632653061 |5.3102873802582256|0.051  |
|[TotalSales_1]   |[PricePerUnit_0]|0.3                |3.061224489795918 |0.03   |
|[TotalSales_2]   |[PricePerUnit_0]|0.16               |1.6326530612244898|0.016  |
|[ItemName_Beef]  |[PricePerUnit_0]|0.14018691588785046|1.4304787335494944|0.015  |
|[Quantity_5]     |[PricePerUnit_0]|0.14               |1.4285714285714286|0.014  |
|[Quantity_1]     |[PricePerUnit_0]|0.1391304347826087 |1.419698314108252 |0.016  |
|[Quantity_0]     |[PricePerUnit_0]|0.13580246913580246|1.385739480977576 |0.011  |
|[ItemName_Bread] |[PricePerUnit_0]|0.1308411214953271 |1.3351134846461947|0.014  |
|[Category_Bakery]|[PricePerUnit_0]|0.11693548387096774|1.1932192231731402|0