In [1]:
import os

import numpy as np
import pandas as pd

In [11]:
from pyspark.sql import SparkSession

In [13]:
# Create the Spark session used by every cell below, or reuse one that is
# already running under the same application name.
spark = (
    SparkSession.builder
    .appName('Ops')
    .getOrCreate()
)

In [16]:
# Directory holding the Instacart CSV extracts. Set the INSTACART_DATA_DIR
# environment variable to point somewhere else instead of editing the notebook;
# the default keeps the location this notebook was originally run against.
DATA_DIR = os.environ.get("INSTACART_DATA_DIR", "/Users/smoore/Downloads/instacart")

# Load the aisle lookup table: the first row supplies the column names and
# Spark infers the column types from the data.
aisles = spark.read.csv(f"{DATA_DIR}/aisles.csv", header=True, inferSchema=True)
aisles.show(n=5, truncate=False)

+--------+--------------------------+
|aisle_id|aisle                     |
+--------+--------------------------+
|1       |prepared soups salads     |
|2       |specialty cheeses         |
|3       |energy granola bars       |
|4       |instant foods             |
|5       |marinades meat preparation|
+--------+--------------------------+
only showing top 5 rows



In [17]:
# Directory holding the Instacart CSV extracts. Set the INSTACART_DATA_DIR
# environment variable to point somewhere else instead of editing the notebook;
# the default keeps the location this notebook was originally run against.
DATA_DIR = os.environ.get("INSTACART_DATA_DIR", "/Users/smoore/Downloads/instacart")


def read_instacart_csv(table_name):
    """Read ``<DATA_DIR>/<table_name>.csv`` into a Spark DataFrame.

    The first row is used as the header and column types are inferred by Spark.

    Args:
        table_name: File name without the ``.csv`` extension.

    Returns:
        The DataFrame produced by ``spark.read.csv``.
    """
    return spark.read.csv(f"{DATA_DIR}/{table_name}.csv", header=True, inferSchema=True)


departments = read_instacart_csv("departments")
order_products_prior = read_instacart_csv("order_products__prior")
order_products_train = read_instacart_csv("order_products__train")
orders = read_instacart_csv("orders")
products = read_instacart_csv("products")

# Quick sanity check of the orders table.
orders.show(n=5)

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                  15.0|
|  473747|      1|   prior|           3|        3|               12|                  21.0|
| 2254736|      1|   prior|           4|        4|                7|                  29.0|
|  431534|      1|   prior|           5|        4|               15|                  28.0|
+--------+-------+--------+------------+---------+-----------------+----------------------+
only showing top 5 rows



In [19]:
# Register every loaded DataFrame as a temporary view so it can be queried by
# name through spark.sql(...). `departments` is included so that all loaded
# tables are available to SQL, not just a subset.
for view_name, frame in {
    "aisles": aisles,
    "departments": departments,
    "order_products_prior": order_products_prior,
    "order_products_train": order_products_train,
    "orders": orders,
    "products": products,
}.items():
    frame.createOrReplaceTempView(view_name)

In [26]:
from pyspark.sql.functions import collect_set, col, count

# One row per (product name, order) pair for the "train" order lines.
# The join condition lives in an explicit ON clause rather than in WHERE, so
# the query reads as the equi-join it is and cannot silently degrade into a
# cross join if the filter is ever edited.
rawData = spark.sql(
    """
    SELECT p.product_name, o.order_id
    FROM products p
    INNER JOIN order_products_train o
        ON o.product_id = p.product_id
    """
)

# Collapse the pairs into one basket per order. collect_set de-duplicates the
# product names within an order, which FPGrowth requires of its item arrays.
baskets = rawData.groupBy('order_id').agg(collect_set('product_name').alias('items'))
baskets.createOrReplaceTempView('baskets')

rawData.show(n=5, truncate=False)
# (number of baskets, number of columns)
print((baskets.count(), len(baskets.columns)))
baskets.show(n=5, truncate=False)

+---------------------------------------------+--------+
|product_name                                 |order_id|
+---------------------------------------------+--------+
|Bulgarian Yogurt                             |1       |
|Organic 4% Milk Fat Whole Milk Cottage Cheese|1       |
|Organic Celery Hearts                        |1       |
|Cucumber Kirby                               |1       |
|Lightly Smoked Sardines in Olive Oil         |1       |
+---------------------------------------------+--------+
only showing top 5 rows

(131209, 2)
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
from pyspark.ml.fpm import FPGrowth

# Mine frequent itemsets from the `items` array of each basket.
#   minSupport=0.001  -> keep itemsets present in at least 0.1% of baskets
#   minConfidence=0.0 -> do not filter association rules by confidence
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.001,
    minConfidence=0.0,
)
model = fpGrowth.fit(baskets)

# Preview the mined itemsets (default 20 rows) without truncating item names.
model.freqItemsets.show(n=20, truncate=False)

+-----------------------------------------------------------------+-----+
|items                                                            |freq |
+-----------------------------------------------------------------+-----+
|[Organic Tomato Basil Pasta Sauce]                               |772  |
|[Organic Tomato Basil Pasta Sauce, Bag of Organic Bananas]       |175  |
|[Organic Tomato Basil Pasta Sauce, Organic Baby Spinach]         |144  |
|[Organic Tomato Basil Pasta Sauce, Banana]                       |179  |
|[Organic Spinach Bunch]                                          |475  |
|[Whole Milk Ricotta Cheese]                                      |347  |
|[Medium Salsa Roja]                                              |275  |
|[Ground Buffalo]                                                 |231  |
|[Tonic Water]                                                    |194  |
|[Original Coconut Milk Beverage]                                 |173  |
|[Low-Fat Strawberry Banana on the Bot

In [23]:
# Most frequent itemsets first; print up to 500 rows with full item names.
model.freqItemsets.orderBy("freq", ascending=False).show(n=500, truncate=False)

+--------------------------------------------------------------------+-----+
|items                                                               |freq |
+--------------------------------------------------------------------+-----+
|[Banana]                                                            |18726|
|[Bag of Organic Bananas]                                            |15480|
|[Organic Strawberries]                                              |10894|
|[Organic Baby Spinach]                                              |9784 |
|[Large Lemon]                                                       |8135 |
|[Organic Avocado]                                                   |7409 |
|[Organic Hass Avocado]                                              |7293 |
|[Strawberries]                                                      |6494 |
|[Limes]                                                             |6033 |
|[Organic Raspberries]                                               |5546 |

1) Support:
This measure gives an idea of how frequent an itemset is across all the transactions. Intuitively, for any itemset A, support is the % of transactions (baskets) that contain A as a subset.
2) Confidence:
This measure gives the likelihood that the consequent is in the cart given that the cart already contains the antecedent. Intuitively, let's say there is an itemset {a,b,c} with support ‘s’; if we are analyzing the rule ({a} implies {b,c}), confidence is the % of the transactions containing {a} that also contain {b,c}, i.e. support({a,b,c}) / support({a}).
3) Lift:
Lift controls for the support (frequency) of the consequent while calculating the conditional probability of occurrence of {Y} given {X}: lift({X} implies {Y}) = confidence({X} implies {Y}) / support({Y}). Lift is the most important parameter that supermarkets use to place products. Think of it as the lift that {X} provides to our confidence of having {Y} in the cart. To rephrase, lift is the rise in the probability of having {Y} in the cart with the knowledge of {X} being present over the probability of having {Y} in the cart without any knowledge about the presence of {X}. A lift above 1 means {X} makes {Y} more likely, a lift of 1 means they are independent, and a lift below 1 means {X} makes {Y} less likely.
Reference to understand these terms:
https://towardsdatascience.com/association-rules-2-aa9a77241654

In [24]:
# Display the first 20 mined association rules (antecedent, consequent,
# confidence, lift, support) without truncating the item names.
model.associationRules.show(n=20, truncate=False)

+----------------------------------------------+----------------------------+-------------------+------------------+---------------------+
|antecedent                                    |consequent                  |confidence         |lift              |support              |
+----------------------------------------------+----------------------------+-------------------+------------------+---------------------+
|[Broccoli Crown, Organic Strawberries]        |[Banana]                    |0.3690773067331671 |2.5860442347085395|0.0011279714044006128|
|[Sugar Snap Peas]                             |[Bag of Organic Bananas]    |0.2207001522070015 |1.8706619038067482|0.001105107119176276 |
|[Organic Red Onion, Organic Strawberries]     |[Bag of Organic Bananas]    |0.34673366834170855|2.9389262202485296|0.0015776356804792354|
|[Organic Red Onion, Organic Strawberries]     |[Organic Baby Spinach]      |0.22780569514237856|3.0550038280801664|0.0010365142635032657|
|[85% Lean Ground Beef]    

References:
https://medium.com/analytics-vidhya/market-basket-analysis-on-3-million-orders-from-instacart-using-spark-24cc6469a92e
https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html