# Generating Frequent Itemsets and Association Rules using FP-Growth

## Initialization

In [1]:
# import findspark and initialise it before the pyspark imports in the
# following cells
import findspark
findspark.init()

In [2]:
# import SparkSession
from pyspark.sql import SparkSession

In [3]:
# import FPGrowth
from pyspark.ml.fpm import FPGrowth

In [4]:
# import functions
from pyspark.sql import functions as F

In [5]:
# Build the SparkSession named "FPGrowthExample", or reuse one if it already
# exists; every later cell goes through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("FPGrowthExample")
    .getOrCreate()
)

In [6]:
# print the session object to confirm it was created
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000002B89F41EB8>


In [7]:
# Load the order-items CSV: the first line is the header row and the column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("olist_order_items_dataset.csv")
)

In [8]:
# look at the first row to find out which attributes the dataset has
df.head()

Row(order_id='00010242fe8c5a6d1ba2dd792cb16214', order_item_id=1, product_id='4244733e06e7ecb4970a6e2683c13e61', seller_id='48436dade18ac8b2bce089ec2a041202', shipping_limit_date=datetime.datetime(2017, 9, 19, 9, 45, 35), price=58.9, freight_value=13.29)

In [9]:
# inspect the inferred schema: name, type and nullability of every column
df.schema

StructType(List(StructField(order_id,StringType,true),StructField(order_item_id,IntegerType,true),StructField(product_id,StringType,true),StructField(seller_id,StringType,true),StructField(shipping_limit_date,TimestampType,true),StructField(price,DoubleType,true),StructField(freight_value,DoubleType,true)))

## Pre-Processing

In [10]:
# Only the order id and the product id are kept for itemset mining; every
# other column is dropped.
unused_columns = [
    'order_item_id',
    'seller_id',
    'shipping_limit_date',
    'price',
    'freight_value',
]
df = df.drop(*unused_columns)

In [11]:
# preview the remaining columns (order_id, product_id)
df.show()

+--------------------+--------------------+
|            order_id|          product_id|
+--------------------+--------------------+
|00010242fe8c5a6d1...|4244733e06e7ecb49...|
|00018f77f2f0320c5...|e5f2d52b802189ee6...|
|000229ec398224ef6...|c777355d18b72b67a...|
|00024acbcdf0a6daa...|7634da152a4610f15...|
|00042b26cf59d7ce6...|ac6c3623068f30de0...|
|00048cc3ae777c65d...|ef92defde845ab845...|
|00054e8431b9d7675...|8d4f2bb7e93e6710a...|
|000576fe39319847c...|557d850972a7d6f79...|
|0005a1a1728c9d785...|310ae3c140ff94b03...|
|0005f50442cb953dc...|4535b0e1091c278df...|
|00061f2a7bc09da83...|d63c1011f49d98b97...|
|00063b381e2406b52...|f177554ea93259a5b...|
|0006ec9db01a64e59...|99a4788cb24856965...|
|0008288aa423d2a3f...|368c6c730842d7801...|
|0008288aa423d2a3f...|368c6c730842d7801...|
|0009792311464db53...|8cab8abac59158715...|
|0009c9a17f916a706...|3f27ac8e699df3d30...|
|000aed2e25dbad2f9...|4fa33915031a8cde0...|
|000c3e6612759851c...|b50c950aba0dcead2...|
|000e562887b1f2006...|5ed9eaf534

### Generating Mapping Dataset

Because the values of product_id are too long, we're going to shorten them by joining the order items with the original products dataset, where each product gets a short numeric id.

In [12]:
# Load the products CSV the same way as the order items: header row present,
# column types inferred.
df_product = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("olist_products_dataset.csv")
)

In [13]:
# look at the first product row to see which attributes are available
df_product.head()

Row(product_id='1e9e8ef04dbcff4541ed26657ea517e5', product_category_name='perfumaria', product_name_lenght=40, product_description_lenght=287, product_photos_qty=1, product_weight_g=225, product_length_cm=16, product_height_cm=10, product_width_cm=14)

In [14]:
# The products dataset has no 'product_name' column, so only the key column
# (product_id) is kept for the mapping.
product_key = F.col('product_id')
df_product = df_product.select(product_key)

In [15]:
# add a new column 'new_product_id' that acts as a short replacement for the
# long product_id
# NOTE(review): monotonically_increasing_id() is documented to return unique,
# increasing ids that are not guaranteed to be consecutive - the count/filter
# cells below check the range actually produced here
df_product = df_product.withColumn('new_product_id', F.monotonically_increasing_id())

In [16]:
# preview the product_id -> new_product_id mapping
df_product.show()

+--------------------+--------------+
|          product_id|new_product_id|
+--------------------+--------------+
|1e9e8ef04dbcff454...|             0|
|3aa071139cb16b67c...|             1|
|96bd76ec8810374ed...|             2|
|cef67bcfe19066a93...|             3|
|9dc1a7de274444849...|             4|
|41d3672d4792049fa...|             5|
|732bd381ad09e530f...|             6|
|2548af3e6e77a690c...|             7|
|37cc742be07708b53...|             8|
|8c92109888e8cdf9d...|             9|
|14aa47b7fe5c25522...|            10|
|03b63c5fc16691530...|            11|
|cf55509ea8edaaac1...|            12|
|7bb6f29c2be577161...|            13|
|eb31436580a610f20...|            14|
|3bb7f144022e67327...|            15|
|6a2fb4dd53d2cdb88...|            16|
|a1b71017a84f92fd8...|            17|
|a0736b92e52f6cead...|            18|
|f53103a77d9cf245e...|            19|
+--------------------+--------------+
only showing top 20 rows



In [17]:
# number of products in the mapping
df_product.count()

32951

In [18]:
# Sanity check on the generated ids: list every product whose new id is above
# 32949, i.e. the top of the range expected for the row count shown above.
above_threshold = F.col('new_product_id') > 32949
df_product.where(above_threshold).show()

+--------------------+--------------+
|          product_id|new_product_id|
+--------------------+--------------+
|106392145fca36341...|         32950|
+--------------------+--------------+



### Joining Datasets

In [19]:
# Replace the long product hash with its short id: inner-join the order items
# with the mapping on 'product_id', then keep only 'order_id' and
# 'new_product_id'.
same_product = df.product_id == df_product.product_id
df = (
    df.join(df_product, on=same_product, how='inner')
    .select('order_id', 'new_product_id')
)

In [20]:
# preview the joined result (order_id, new_product_id)
df.show()

+--------------------+--------------+
|            order_id|new_product_id|
+--------------------+--------------+
|00010242fe8c5a6d1...|         25865|
|00018f77f2f0320c5...|         27230|
|000229ec398224ef6...|         22624|
|00024acbcdf0a6daa...|         15403|
|00042b26cf59d7ce6...|          8862|
|00048cc3ae777c65d...|          3939|
|00054e8431b9d7675...|         22292|
|000576fe39319847c...|          6974|
|0005a1a1728c9d785...|          2713|
|0005f50442cb953dc...|         28255|
|00061f2a7bc09da83...|         14394|
|00063b381e2406b52...|         23293|
|0006ec9db01a64e59...|          9661|
|0008288aa423d2a3f...|          4598|
|0008288aa423d2a3f...|          4598|
|0009792311464db53...|          1131|
|0009c9a17f916a706...|          9418|
|000aed2e25dbad2f9...|         30285|
|000c3e6612759851c...|          1402|
|000e562887b1f2006...|          2780|
+--------------------+--------------+
only showing top 20 rows



In [21]:
# number of (order_id, new_product_id) rows before de-duplication
df.count()

112650

In [22]:
# Remove repeated (order_id, new_product_id) rows so that a product appears at
# most once per order, then report how many rows remain.
df = df.distinct()
df.count()

102425

In [23]:
# Collapse the rows of each order into a single row whose value is the list
# of product ids bought in that order.
items_per_order = F.collect_list('new_product_id')
df = df.groupBy('order_id').agg(items_per_order)

In [24]:
# check out the new DataFrame: one row per order_id with its list of product ids
df.show()

+--------------------+----------------------------+
|            order_id|collect_list(new_product_id)|
+--------------------+----------------------------+
|014405982914c2cde...|              [21358, 16159]|
|019886de8f385a39b...|                     [11533]|
|01a6ad782455876aa...|                       [813]|
|01d907b3e209269e1...|                     [17981]|
|028dc52e12ddda803...|                     [22284]|
|036dd381dfb3ec75e...|                     [19360]|
|03ebfa9712b7dbc70...|                     [26097]|
|0420da8d50a378401...|                      [9625]|
|05afef1c185862cab...|                     [27869]|
|05bef443b85068505...|                     [30938]|
|05d46826bd0fb605d...|                      [5777]|
|06fec3f6e9e72edb6...|                     [22557]|
|077700dcf4e3bb412...|                      [8695]|
|078c2d9bb7e5905e8...|              [17526, 10024]|
|08791ec24b12af5af...|                     [26469]|
|08b450b388221bfe5...|                     [14602]|
|0957ed87011

In [25]:
# Give the aggregated column the short name 'items', which the FPGrowth cells
# below use as itemsCol.
auto_generated_name = 'collect_list(new_product_id)'
df = df.withColumnRenamed(auto_generated_name, 'items')

In [26]:
# preview the final transactions table (order_id, items)
df.show()

+--------------------+--------------+
|            order_id|         items|
+--------------------+--------------+
|014405982914c2cde...|[21358, 16159]|
|019886de8f385a39b...|       [11533]|
|01a6ad782455876aa...|         [813]|
|01d907b3e209269e1...|       [17981]|
|028dc52e12ddda803...|       [22284]|
|036dd381dfb3ec75e...|       [19360]|
|03ebfa9712b7dbc70...|       [26097]|
|0420da8d50a378401...|        [9625]|
|05afef1c185862cab...|       [27869]|
|05bef443b85068505...|       [30938]|
|05d46826bd0fb605d...|        [5777]|
|06fec3f6e9e72edb6...|       [22557]|
|077700dcf4e3bb412...|        [8695]|
|078c2d9bb7e5905e8...|[17526, 10024]|
|08791ec24b12af5af...|       [26469]|
|08b450b388221bfe5...|       [14602]|
|0957ed870116e596b...|       [19775]|
|09868b0eb512bfbbe...|       [32354]|
|099d60cb800db65a7...|       [28857]|
|09e90e3936db197d4...|        [7627]|
+--------------------+--------------+
only showing top 20 rows



## Training Models

### Model 1

In [27]:
# Model 1: minSupport 0.05 for itemsets, minConfidence 0.1 for rules.
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.05,
    minConfidence=0.1,
)
model1 = fpGrowth.fit(df)

In [28]:
# display the frequent itemsets found by model1 together with their frequencies
model1.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



### Model 2

In [29]:
# Model 2: thresholds lowered tenfold - minSupport 0.005, minConfidence 0.01.
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.005,
    minConfidence=0.01,
)
model2 = fpGrowth.fit(df)

In [30]:
# display the frequent itemsets found by model2 together with their frequencies
model2.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



### Model 3

In [31]:
# Model 3: thresholds lowered again - minSupport 0.0005, minConfidence 0.001.
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.0005,
    minConfidence=0.001,
)
model3 = fpGrowth.fit(df)

In [32]:
# display the frequent itemsets found by model3 together with their frequencies
model3.freqItemsets.show()

+-------+----+
|  items|freq|
+-------+----+
| [9661]| 467|
|[13430]| 431|
|[14051]| 352|
| [8290]| 323|
|[30293]| 311|
|  [793]| 306|
| [4598]| 291|
|[32098]| 287|
| [1749]| 269|
| [5822]| 259|
|[29129]| 255|
| [3047]| 225|
|[11492]| 194|
|[29568]| 194|
|[16151]| 187|
|[29348]| 172|
|[32139]| 160|
|[20571]| 158|
|[32025]| 156|
|[21395]| 156|
+-------+----+
only showing top 20 rows



### Model 4

In [33]:
# Model 4: lowest thresholds tried - minSupport 0.00005, minConfidence 0.0001.
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.00005,
    minConfidence=0.0001,
)
model4 = fpGrowth.fit(df)

In [34]:
# display the frequent itemsets found by model4 together with their frequencies
model4.freqItemsets.show()

+-------+----+
|  items|freq|
+-------+----+
|[28923]|  42|
| [7863]|   5|
|[11481]|  26|
|[26310]|  20|
|[20444]|   8|
| [2566]|   8|
| [6110]|   5|
|[12663]|   7|
|[23957]|   5|
| [3249]|  16|
| [5731]|  14|
|[25270]|  12|
|[29351]|   7|
|[27632]|   6|
|[29901]|   6|
|[32724]|  11|
|[24931]|  10|
| [1098]|   9|
|[29365]|   6|
| [9019]|   5|
+-------+----+
only showing top 20 rows



## Generating Rules

Since model1 and model2 didn't produce any itemsets, we're going to use model3's and model4's configurations as samples to generate rules.

In [35]:
# display the association rules generated by model3
# (antecedent, consequent, confidence)
model3.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



Apparently model3 generates no rules, so let's try model4 instead.

In [36]:
# display the association rules generated by model4
# (antecedent, consequent, confidence)
model4.associationRules.show()

+----------+----------+--------------------+
|antecedent|consequent|          confidence|
+----------+----------+--------------------+
|   [20490]|   [29010]|  0.1076923076923077|
|   [29653]|   [11053]|0.060240963855421686|
|    [6038]|    [9661]| 0.21428571428571427|
|   [13176]|   [19813]|0.036231884057971016|
|   [19861]|   [18705]| 0.15789473684210525|
|   [18705]|   [19861]|                0.24|
|   [26139]|   [15012]|  0.2857142857142857|
|    [7358]|   [26457]| 0.46153846153846156|
|   [21395]|    [6869]| 0.21794871794871795|
|   [26457]|    [7358]| 0.14285714285714285|
|   [22583]|   [29771]|  0.3157894736842105|
|   [13240]|   [26389]|  0.6666666666666666|
|    [9661]|   [32025]| 0.06209850107066381|
|    [9661]|    [6038]| 0.01284796573875803|
|   [26252]|    [6869]| 0.14814814814814814|
|   [30293]|   [14051]| 0.03536977491961415|
|   [30293]|    [4598]| 0.01607717041800643|
|   [30293]|   [32098]|0.028938906752411574|
|   [30895]|   [21478]|                 0.4|
|   [29771

## Testing Custom Inputs against Rules

### Input 1

In [37]:
# One hand-made order containing a single product.
input1_rows = [(0, [32098])]
input1 = spark.createDataFrame(input1_rows, schema=["order_id", "items"])

In [38]:
# apply model4 to the custom input; the 'prediction' column holds the items
# it suggests for each row
model4.transform(input1).show()

+--------+-------+--------------------+
|order_id|  items|          prediction|
+--------+-------+--------------------+
|       0|[32098]|[14051, 30293, 4598]|
+--------+-------+--------------------+



### Input 2

In [39]:
# Two hand-made orders: a single product, and a pair of products.
input2_rows = [
    (0, [32098]),
    (1, [30293, 4598]),
]
input2 = spark.createDataFrame(input2_rows, schema=["order_id", "items"])

In [40]:
# apply model4 to the custom input; the 'prediction' column holds the items
# it suggests for each row
model4.transform(input2).show()

+--------+-------------+--------------------+
|order_id|        items|          prediction|
+--------+-------------+--------------------+
|       0|      [32098]|[14051, 30293, 4598]|
|       1|[30293, 4598]|      [14051, 32098]|
+--------+-------------+--------------------+



### Input 3

In [41]:
# Four hand-made orders: one, two and three products, plus a product (14)
# used to see what the model returns for it.
input3_rows = [
    (0, [32098]),
    (1, [30293, 4598]),
    (2, [14051, 32098, 4598]),
    (3, [14]),
]
input3 = spark.createDataFrame(input3_rows, schema=["order_id", "items"])

In [42]:
# apply model4 to the custom input; the 'prediction' column holds the items
# it suggests for each row (an empty list when nothing is suggested)
model4.transform(input3).show()

+--------+--------------------+--------------------+
|order_id|               items|          prediction|
+--------+--------------------+--------------------+
|       0|             [32098]|[14051, 30293, 4598]|
|       1|       [30293, 4598]|      [14051, 32098]|
|       2|[14051, 32098, 4598]|       [30293, 7345]|
|       3|                [14]|                  []|
+--------+--------------------+--------------------+



## References

1. https://kaggle.com/olistbr/brazilian-ecommerce
2. https://spark.apache.org/docs/2.3.0/ml-frequent-pattern-mining.html
3. https://stackoverflow.com/questions/29600673/how-to-delete-columns-in-pyspark-dataframe
4. https://stackoverflow.com/questions/40945174/groupbykey-and-create-lists-of-values-pyspark-sql-dataframe
5. https://stackoverflow.com/questions/37249291/fp-growth-items-in-a-transaction-must-be-unique
6. https://stackoverflow.com/questions/32086578/how-to-add-row-id-in-pyspark-dataframes
7. https://stackoverflow.com/questions/33092723/performing-lookup-translation-in-a-spark-rdd-or-data-frame-using-another-rdd-df