## Initialization

In [1]:
# Import findspark and initialise it before any of the pyspark imports below.
# NOTE(review): presumably this makes the local Spark installation importable
# from this kernel — confirm for the target environment.
import findspark
findspark.init()

In [2]:
# import SparkSession
from pyspark.sql import SparkSession

In [3]:
# import FPGrowth
from pyspark.ml.fpm import FPGrowth

In [4]:
# import functions
from pyspark.sql import functions as F

In [5]:
# Build the SparkSession used by the rest of the notebook under the
# application name "FPGrowthExample" (getOrCreate hands back a session
# if one is already available instead of building a second one).
spark = (
    SparkSession.builder
    .appName("FPGrowthExample")
    .getOrCreate()
)

In [6]:
# Print the session object as a quick check that `spark` was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x000000A57B022E48>


In [7]:
# Load the order-items CSV into a DataFrame: the first line of the file
# supplies the column names and the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("olist_order_items_dataset.csv")
)

In [8]:
# Peek at the first row to see which columns the dataset provides.
df.head()

Row(order_id='00010242fe8c5a6d1ba2dd792cb16214', order_item_id=1, product_id='4244733e06e7ecb4970a6e2683c13e61', seller_id='48436dade18ac8b2bce089ec2a041202', shipping_limit_date=datetime.datetime(2017, 9, 19, 9, 45, 35), price=58.9, freight_value=13.29)

In [9]:
# Inspect the inferred schema: the name, type and nullability of each column.
df.schema

StructType(List(StructField(order_id,StringType,true),StructField(order_item_id,IntegerType,true),StructField(product_id,StringType,true),StructField(seller_id,StringType,true),StructField(shipping_limit_date,TimestampType,true),StructField(price,DoubleType,true),StructField(freight_value,DoubleType,true)))

## Pre-Processing

In [10]:
# Discard every column except order_id and product_id; only the
# order -> product pairing is used by the pre-processing steps below.
df = df.drop(
    'order_item_id',
    'seller_id',
    'shipping_limit_date',
    'price',
    'freight_value',
)

In [11]:
# Peek at the first row again to confirm only order_id and product_id remain.
df.head()

Row(order_id='00010242fe8c5a6d1ba2dd792cb16214', product_id='4244733e06e7ecb4970a6e2683c13e61')

In [12]:
# Inspect the schema after dropping the unused columns.
df.schema

StructType(List(StructField(order_id,StringType,true),StructField(product_id,StringType,true)))

In [13]:
# Remove repeated (order_id, product_id) rows so that a product appears
# at most once within any single order.
df = df.distinct()

In [14]:
# Collapse the rows of each order into a single row keyed by order_id,
# gathering that order's product_ids into one list column. The column keeps
# its auto-generated name 'collect_list(product_id)' at this point.
df = (
    df.groupBy('order_id')
    .agg(F.collect_list('product_id'))
)

In [15]:
# Preview the aggregated DataFrame: one row per order_id with its list of product_ids.
df.show()

+--------------------+------------------------+
|            order_id|collect_list(product_id)|
+--------------------+------------------------+
|014405982914c2cde...|    [e95ee6822b66ac60...|
|019886de8f385a39b...|    [e9a69340883a438c...|
|01a6ad782455876aa...|    [036734b5a58d5d4f...|
|01d907b3e209269e1...|    [b1434a8f79cb3528...|
|028dc52e12ddda803...|    [d86a6c48f83b045c...|
|036dd381dfb3ec75e...|    [aa8d88eb4b9cb388...|
|03ebfa9712b7dbc70...|    [aa6746e94490239d...|
|0420da8d50a378401...|    [5ca739ddd646d1ba...|
|05afef1c185862cab...|    [ac7e981115ad47f0...|
|05bef443b85068505...|    [44e086c4a977f37a...|
|05d46826bd0fb605d...|    [e84d1a4a08db13e8...|
|06fec3f6e9e72edb6...|    [8562e2c780a345b6...|
|077700dcf4e3bb412...|    [6cd0d08f09a8b324...|
|078c2d9bb7e5905e8...|    [ce5a5a85a0704113...|
|08791ec24b12af5af...|    [11250b0d4b709fee...|
|08b450b388221bfe5...|    [9a803c36a15de3fd...|
|0957ed870116e596b...|    [5b8423dc7f23089c...|
|09868b0eb512bfbbe...|    [f35927953ed82

In [16]:
# Give the aggregated list column the shorter name 'items' in place of the
# auto-generated 'collect_list(product_id)'.
df = df.withColumnRenamed(
    existing='collect_list(product_id)',
    new='items',
)

In [17]:
# Preview the DataFrame again to confirm the list column is now named 'items'.
df.show()

+--------------------+--------------------+
|            order_id|               items|
+--------------------+--------------------+
|014405982914c2cde...|[e95ee6822b66ac60...|
|019886de8f385a39b...|[e9a69340883a438c...|
|01a6ad782455876aa...|[036734b5a58d5d4f...|
|01d907b3e209269e1...|[b1434a8f79cb3528...|
|028dc52e12ddda803...|[d86a6c48f83b045c...|
|036dd381dfb3ec75e...|[aa8d88eb4b9cb388...|
|03ebfa9712b7dbc70...|[aa6746e94490239d...|
|0420da8d50a378401...|[5ca739ddd646d1ba...|
|05afef1c185862cab...|[ac7e981115ad47f0...|
|05bef443b85068505...|[44e086c4a977f37a...|
|05d46826bd0fb605d...|[e84d1a4a08db13e8...|
|06fec3f6e9e72edb6...|[8562e2c780a345b6...|
|077700dcf4e3bb412...|[6cd0d08f09a8b324...|
|078c2d9bb7e5905e8...|[ce5a5a85a0704113...|
|08791ec24b12af5af...|[11250b0d4b709fee...|
|08b450b388221bfe5...|[9a803c36a15de3fd...|
|0957ed870116e596b...|[5b8423dc7f23089c...|
|09868b0eb512bfbbe...|[f35927953ed82e19...|
|099d60cb800db65a7...|[4621f88827b98333...|
|09e90e3936db197d4...|[e4c7ed7a8