# Market Basket Analysis with Frequent Pattern Algorithms

In [64]:
from utilities.std_imports import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import collect_set, col, count
from pyspark.ml import fpm

In [6]:
spark = SparkContext("local", "sqlContext")
sql = SQLContext(spark)

In [8]:
path = 'D:/data/csv/market_basket/'
aisles = sql.read.csv(path + "aisles.csv", header=True, inferSchema=True)
departments = sql.read.csv(path + "departments.csv", header=True, inferSchema=True)
order_products_prior = sql.read.csv(path + "order_products__prior.csv", header=True, inferSchema=True)
order_products_train = sql.read.csv(path + "order_products__train.csv", header=True, inferSchema=True)
orders = sql.read.csv(path + "orders.csv", header=True, inferSchema=True)
products = sql.read.csv(path + "products.csv", header=True, inferSchema=True)

#### Create Temporary Tables to work using sql like commands

In [9]:
aisles.createOrReplaceTempView("aisles")
departments.createOrReplaceTempView("departments")
order_products_prior.createOrReplaceTempView("order_products_prior")
order_products_train.createOrReplaceTempView("order_products_train")
orders.createOrReplaceTempView("orders")
products.createOrReplaceTempView("products")

## Data analysis : queries

#### Top 5 orders in the orders dataframe

In [10]:
orders.show(n=5)

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                  15.0|
|  473747|      1|   prior|           3|        3|               12|                  21.0|
| 2254736|      1|   prior|           4|        4|                7|                  29.0|
|  431534|      1|   prior|           5|        4|               15|                  28.0|
+--------+-------+--------+------------+---------+-----------------+----------------------+
only showing top 5 rows



In [13]:
products.show(n=5)
order_products_train.show(n=5)
order_products_prior.show(n=5)
departments.show(n=5)
aisles.show(n=5)

+----------+--------------------+--------+-------------+
|product_id|        product_name|aisle_id|department_id|
+----------+--------------------+--------+-------------+
|         1|Chocolate Sandwic...|      61|           19|
|         2|    All-Seasons Salt|     104|           13|
|         3|Robust Golden Uns...|      94|            7|
|         4|Smart Ones Classi...|      38|            1|
|         5|Green Chile Anyti...|       5|           13|
+----------+--------------------+--------+-------------+
only showing top 5 rows

+--------+----------+-----------------+---------+
|order_id|product_id|add_to_cart_order|reordered|
+--------+----------+-----------------+---------+
|       1|     49302|                1|        1|
|       1|     11109|                2|        1|
|       1|     10246|                3|        0|
|       1|     49683|                4|        0|
|       1|     43633|                5|        1|
+--------+----------+-----------------+---------+
only showing

In [34]:
query = sql.sql("select count(order_id) as total_orders, (case when order_dow = '0' then 'Sunday' when order_dow = '1' then 'Monday' when order_dow = '2' then 'Tuesday' when order_dow = '3' then 'Wednesday' when order_dow = '4' then 'Thursday' when order_dow = '5' then 'Friday'  when order_dow = '6' then 'Saturday' end) as day_of_week from orders group by order_dow order by total_orders desc") 
query.show()

+------------+-----------+
|total_orders|day_of_week|
+------------+-----------+
|      600905|     Sunday|
|      587478|     Monday|
|      467260|    Tuesday|
|      453368|     Friday|
|      448761|   Saturday|
|      436972|  Wednesday|
|      426339|   Thursday|
+------------+-----------+



In [33]:
query = sql.sql("select count(order_id) as total_orders, order_hour_of_day as hour from orders group by order_hour_of_day order by order_hour_of_day")
query.show()

+------------+----+
|total_orders|hour|
+------------+----+
|       22758|   0|
|       12398|   1|
|        7539|   2|
|        5474|   3|
|        5527|   4|
|        9569|   5|
|       30529|   6|
|       91868|   7|
|      178201|   8|
|      257812|   9|
|      288418|  10|
|      284728|  11|
|      272841|  12|
|      277999|  13|
|      283042|  14|
|      283639|  15|
|      272553|  16|
|      228795|  17|
|      182912|  18|
|      140569|  19|
+------------+----+
only showing top 20 rows



In [57]:
query = sql.sql("select count(opp.order_id) as orders, p.product_name as popular_product from order_products_prior opp, products p where p.product_id = opp.product_id group by popular_product order by orders desc limit 10")
query.show()

+------+--------------------+
|orders|     popular_product|
+------+--------------------+
|472565|              Banana|
|379450|Bag of Organic Ba...|
|264683|Organic Strawberries|
|241921|Organic Baby Spinach|
|213584|Organic Hass Avocado|
|176815|     Organic Avocado|
|152657|         Large Lemon|
|142951|        Strawberries|
|140627|               Limes|
|137905|  Organic Whole Milk|
+------+--------------------+



## Market basket analysis with FPG

In [61]:
rawData = sql.sql("select p.product_name, o.order_id from products p inner join order_products_train o where o.product_id = p.product_id")
baskets = rawData.groupBy('order_id').agg(collect_set('product_name').alias('items'))
baskets.createOrReplaceTempView('baskets')
rawData.show(5)
baskets.show(5)
print((baskets.count(), len(baskets.columns)))

+--------------------+--------+
|        product_name|order_id|
+--------------------+--------+
|    Bulgarian Yogurt|       1|
|Organic 4% Milk F...|       1|
|Organic Celery He...|       1|
|      Cucumber Kirby|       1|
|Lightly Smoked Sa...|       1|
+--------------------+--------+
only showing top 5 rows

+--------+--------------------+
|order_id|               items|
+--------+--------------------+
|    1342|[Raw Shrimp, Seed...|
|    1591|[Cracked Wheat, S...|
|    4519|[Beet Apple Carro...|
|    4935|             [Vodka]|
|    6357|[Globe Eggplant, ...|
+--------+--------------------+
only showing top 5 rows

(131209, 2)


In [65]:
baskets_ds = sql.sql("select items from baskets").toDF("items")

fpg = fpm.FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0)
fpg_fit = fpg.fit(baskets_ds)

In [66]:
mostPopularItemInABasket = fpg_fit.freqItemsets
mostPopularItemInABasket.createOrReplaceTempView("mostPopularItemInABasket")

In [67]:
query = sql.sql("select items, freq from mostPopularItemInABasket where size(items) > 2 order by freq desc limit 20")
query.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[Organic Hass Avo...| 710|
|[Organic Raspberr...| 649|
|[Organic Baby Spi...| 587|
|[Organic Raspberr...| 531|
|[Organic Hass Avo...| 497|
|[Organic Avocado,...| 484|
|[Organic Avocado,...| 477|
|[Limes, Large Lem...| 452|
|[Organic Cucumber...| 424|
|[Limes, Organic A...| 389|
|[Organic Raspberr...| 381|
|[Organic Avocado,...| 379|
|[Organic Baby Spi...| 376|
|[Organic Blueberr...| 374|
|[Large Lemon, Org...| 371|
|[Organic Cucumber...| 366|
|[Organic Lemon, O...| 353|
|[Limes, Organic A...| 352|
|[Organic Whole Mi...| 339|
|[Organic Avocado,...| 334|
+--------------------+----+



#### Display generated association rules

In [69]:
rules = fpg_fit.associationRules
rules.createOrReplaceTempView("rules")
query = sql.sql("select * from rules where lift > 1 order by lift desc")
query.show()

+--------------------+--------------------+-------------------+------------------+
|          antecedent|          consequent|         confidence|              lift|
+--------------------+--------------------+-------------------+------------------+
|[Strawberry Rhuba...| [Blueberry Yoghurt]| 0.3096646942800789| 80.29801358062228|
| [Blueberry Yoghurt]|[Strawberry Rhuba...| 0.3102766798418972| 80.29801358062227|
|[Icelandic Style ...|[Nonfat Icelandic...| 0.2170212765957447| 78.66062066533443|
|[Nonfat Icelandic...|[Icelandic Style ...|0.42265193370165743| 78.66062066533442|
|[Icelandic Style ...|[Non Fat Acai & M...| 0.2397163120567376| 74.88794663964877|
|[Non Fat Acai & M...|[Icelandic Style ...| 0.4023809523809524| 74.88794663964876|
|[Blackberry Cucum...|[Kiwi Sandia Spar...|0.25675675675675674| 72.44902644580064|
|[Kiwi Sandia Spar...|[Blackberry Cucum...| 0.2860215053763441| 72.44902644580063|
|[Icelandic Style ...|[Non Fat Raspberr...| 0.3120567375886525| 71.08446611505121|
|[No

In [73]:
query = sql.sql("select antecedent as if, consequent as then, confidence from rules order by confidence desc limit 20")
query.show()

+--------------------+--------------------+-------------------+
|                  if|                then|         confidence|
+--------------------+--------------------+-------------------+
|[Organic Raspberr...|[Bag of Organic B...| 0.5984251968503937|
|[Organic Cucumber...|[Bag of Organic B...|           0.546875|
|[Organic Kiwi, Or...|[Bag of Organic B...| 0.5459770114942529|
|[Organic Navel Or...|[Bag of Organic B...| 0.5412186379928315|
|[Yellow Onions, S...|            [Banana]| 0.5357142857142857|
|[Organic Whole St...|[Bag of Organic B...| 0.5314685314685315|
|[Organic Navel Or...|[Bag of Organic B...| 0.5283018867924528|
|[Organic Raspberr...|[Bag of Organic B...|  0.521099116781158|
|[Organic D'Anjou ...|[Bag of Organic B...| 0.5170454545454546|
|[Organic Unsweete...|[Bag of Organic B...| 0.5141065830721003|
|[Organic Broccoli...|[Bag of Organic B...| 0.5048231511254019|
|[Organic Lemon, O...|[Bag of Organic B...| 0.4989106753812636|
|[Organic Hass Avo...|[Bag of Organic B.

In [5]:
spark.stop()

## Credits & Links

https://medium.com/analytics-vidhya/market-basket-analysis-on-3-million-orders-from-instacart-using-spark-24cc6469a92e