## Initialization

In [1]:
# initialise findspark before the pyspark imports in the cells below
# (presumably this is what makes the local Spark installation importable --
# confirm for your environment)
import findspark
findspark.init()

In [2]:
# import SparkSession
from pyspark.sql import SparkSession

In [3]:
# import FPGrowth
from pyspark.ml.fpm import FPGrowth

In [4]:
# import functions
from pyspark.sql import functions as F

In [5]:
# build the SparkSession used by every cell below (getOrCreate returns the
# session for this application name)
spark = (
    SparkSession.builder
    .appName("FPGrowthExample")
    .getOrCreate()
)

In [6]:
# sanity check: print the session object to confirm it was created
print(spark)

<pyspark.sql.session.SparkSession object at 0x000000CEB4287278>


In [7]:
# load the order-items CSV: the first line is treated as the header and the
# column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("olist_order_items_dataset.csv")
)

In [8]:
# peek at the first row to see which columns the dataset contains
df.head()

Row(order_id='00010242fe8c5a6d1ba2dd792cb16214', order_item_id=1, product_id='4244733e06e7ecb4970a6e2683c13e61', seller_id='48436dade18ac8b2bce089ec2a041202', shipping_limit_date=datetime.datetime(2017, 9, 19, 9, 45, 35), price=58.9, freight_value=13.29)

In [9]:
# inspect the column names together with the types inferred for them
df.schema

StructType(List(StructField(order_id,StringType,true),StructField(order_item_id,IntegerType,true),StructField(product_id,StringType,true),StructField(seller_id,StringType,true),StructField(shipping_limit_date,TimestampType,true),StructField(price,DoubleType,true),StructField(freight_value,DoubleType,true)))

## Pre-Processing

In [10]:
# keep only order_id and product_id, the two columns the grouping step
# below works with; every other column is dropped
df = df.drop(
    'order_item_id',
    'seller_id',
    'shipping_limit_date',
    'price',
    'freight_value',
)

In [11]:
# confirm that only order_id and product_id remain after the drop
df.head()

Row(order_id='00010242fe8c5a6d1ba2dd792cb16214', product_id='4244733e06e7ecb4970a6e2683c13e61')

In [12]:
# check the schema of the trimmed DataFrame
df.schema

StructType(List(StructField(order_id,StringType,true),StructField(product_id,StringType,true)))

In [13]:
# number of rows before duplicates are removed
df.count()

112650

In [14]:
# remove repeated (order_id, product_id) rows so that a product appears at
# most once per order, then report how many rows are left
df = df.distinct()
df.count()

102425

In [15]:
# collapse the rows into one row per order_id whose value is the list of
# that order's product_ids
df = (
    df
    .groupBy('order_id')
    .agg(F.collect_list('product_id'))
)

In [16]:
# preview the grouped DataFrame: one row per order_id with its product list
df.show()

+--------------------+------------------------+
|            order_id|collect_list(product_id)|
+--------------------+------------------------+
|014405982914c2cde...|    [e95ee6822b66ac60...|
|019886de8f385a39b...|    [e9a69340883a438c...|
|01a6ad782455876aa...|    [036734b5a58d5d4f...|
|01d907b3e209269e1...|    [b1434a8f79cb3528...|
|028dc52e12ddda803...|    [d86a6c48f83b045c...|
|036dd381dfb3ec75e...|    [aa8d88eb4b9cb388...|
|03ebfa9712b7dbc70...|    [aa6746e94490239d...|
|0420da8d50a378401...|    [5ca739ddd646d1ba...|
|05afef1c185862cab...|    [ac7e981115ad47f0...|
|05bef443b85068505...|    [44e086c4a977f37a...|
|05d46826bd0fb605d...|    [e84d1a4a08db13e8...|
|06fec3f6e9e72edb6...|    [8562e2c780a345b6...|
|077700dcf4e3bb412...|    [6cd0d08f09a8b324...|
|078c2d9bb7e5905e8...|    [ce5a5a85a0704113...|
|08791ec24b12af5af...|    [11250b0d4b709fee...|
|08b450b388221bfe5...|    [9a803c36a15de3fd...|
|0957ed870116e596b...|    [5b8423dc7f23089c...|
|09868b0eb512bfbbe...|    [f35927953ed82

In [17]:
# rename the auto-generated 'collect_list(product_id)' column to 'items',
# the column name the FPGrowth estimators below are given via itemsCol
df = df.withColumnRenamed('collect_list(product_id)', 'items')

In [18]:
# confirm the list column is now called 'items'
df.show()

+--------------------+--------------------+
|            order_id|               items|
+--------------------+--------------------+
|014405982914c2cde...|[e95ee6822b66ac60...|
|019886de8f385a39b...|[e9a69340883a438c...|
|01a6ad782455876aa...|[036734b5a58d5d4f...|
|01d907b3e209269e1...|[b1434a8f79cb3528...|
|028dc52e12ddda803...|[d86a6c48f83b045c...|
|036dd381dfb3ec75e...|[aa8d88eb4b9cb388...|
|03ebfa9712b7dbc70...|[aa6746e94490239d...|
|0420da8d50a378401...|[5ca739ddd646d1ba...|
|05afef1c185862cab...|[ac7e981115ad47f0...|
|05bef443b85068505...|[44e086c4a977f37a...|
|05d46826bd0fb605d...|[e84d1a4a08db13e8...|
|06fec3f6e9e72edb6...|[8562e2c780a345b6...|
|077700dcf4e3bb412...|[6cd0d08f09a8b324...|
|078c2d9bb7e5905e8...|[ce5a5a85a0704113...|
|08791ec24b12af5af...|[11250b0d4b709fee...|
|08b450b388221bfe5...|[9a803c36a15de3fd...|
|0957ed870116e596b...|[5b8423dc7f23089c...|
|09868b0eb512bfbbe...|[f35927953ed82e19...|
|099d60cb800db65a7...|[4621f88827b98333...|
|09e90e3936db197d4...|[e4c7ed7a8

## Training Models

### Model 1

In [19]:
# Model 1: the highest support/confidence thresholds of the four models
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.05,
    minConfidence=0.1,
)
model1 = fpGrowth.fit(df)

In [20]:
# display the frequent itemsets found by model1
model1.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



### Model 2

In [21]:
# Model 2: both thresholds lowered by a factor of ten compared with Model 1
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.005,
    minConfidence=0.01,
)
model2 = fpGrowth.fit(df)

In [22]:
# display the frequent itemsets found by model2
model2.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



### Model 3

In [23]:
# Model 3: both thresholds lowered by a further factor of ten
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.0005,
    minConfidence=0.001,
)
model3 = fpGrowth.fit(df)

In [24]:
# display the frequent itemsets found by model3 (show() prints the first 20 rows)
model3.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[99a4788cb2485696...| 467|
|[aca2eb7d00ea1a7b...| 431|
|[422879e10f466829...| 352|
|[d1c427060a0f73f6...| 323|
|[389d119b48cf3043...| 311|
|[53b36df67ebb7c41...| 306|
|[368c6c730842d780...| 291|
|[53759a2ecddad2bb...| 287|
|[154e7e31ebfa0922...| 269|
|[2b4609f8948be188...| 259|
|[3dd2a17168ec895c...| 255|
|[7c1bd920dbdf2247...| 225|
|[e0d64dcfaa3b6db5...| 194|
|[5a848e4ab52fd544...| 194|
|[bb50f2e236e5eea0...| 187|
|[a62e25e09e05e6fa...| 172|
|[42a2c92a0979a949...| 160|
|[a92930c327948861...| 158|
|[e53e557d5a159f5a...| 156|
|[35afc973633aaeb6...| 156|
+--------------------+----+
only showing top 20 rows



### Model 4

In [25]:
# Model 4: the lowest support/confidence thresholds of the four models
fpGrowth = FPGrowth(
    itemsCol='items',
    minSupport=0.00005,
    minConfidence=0.0001,
)
model4 = fpGrowth.fit(df)

In [26]:
# count the frequent itemsets found by model4 (the rows are not displayed here)
model4.freqItemsets.count()

4211

## Generating Rules

We use model4, the model trained with the lowest minSupport and minConfidence of the four, as the example for generating association rules.

In [27]:
# display the association rules generated by model4; each row holds an
# antecedent, a consequent and the rule's confidence
model4.associationRules.show()

+--------------------+--------------------+--------------------+
|          antecedent|          consequent|          confidence|
+--------------------+--------------------+--------------------+
|[3f14d740544f37ec...|[36f60d45225e60c7...| 0.14814814814814814|
|[0aabfb375647d973...|[6c3effec7c8ddba4...|0.036231884057971016|
|[5d790355cbeded0c...|[5fc3e6a4b52b0c41...|               0.375|
|[4d0ec1e9b95fb62f...|[9ad75bd7267e5c72...| 0.46153846153846156|
|[ee57070aa3b24a06...|[0d85c435fd60b277...|  0.2608695652173913|
|[18486698933fbb64...|[dbb67791e405873b...|  0.4117647058823529|
|[35afc973633aaeb6...|[99a4788cb2485696...|  0.1858974358974359|
|[99a4788cb2485696...|[35afc973633aaeb6...| 0.06209850107066381|
|[99a4788cb2485696...|[f2e53dd1670f3c37...| 0.01284796573875803|
|[e53e557d5a159f5a...|[36f60d45225e60c7...| 0.21794871794871795|
|[53759a2ecddad2bb...|[422879e10f466829...|0.024390243902439025|
|[53759a2ecddad2bb...|[389d119b48cf3043...|  0.0313588850174216|
|[53759a2ecddad2bb...|[36