In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori

---

# General data preparation

In [2]:
# order_products = pd.concat([pd.read_csv('../data/order_products__prior.csv'),
#                             pd.read_csv('../data/order_products__train.csv')])
order_products = pd.read_csv('../data/order_products__train.csv')     
products = pd.read_csv('../data/products.csv')
orders = pd.read_csv('../data/orders.csv')
departments = pd.read_csv('../data/departments.csv')
aisles = pd.read_csv('../data/aisles.csv')

In [3]:
# Join products with aisles to get product aisle categories
product_aisles = pd.merge(products, aisles, on="aisle_id", how="left")

# Join product with department to get product department categories
product_aisles_department = pd.merge(product_aisles, departments, on="department_id", how="left")

# Join order_products with products to get product categories
order_products_categories = pd.merge(order_products, product_aisles_department, on="product_id", how="left")

In [4]:
order_products_categories.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods


In [5]:
# Top 10 product categories ordered:
product_categories_count = (
    order_products_categories.groupby(["department"])
    .size()
    .reset_index(name="counts")
    .sort_values("counts", ascending=False)
)

In [6]:
product_categories_count.head(10)

Unnamed: 0,department,counts
19,produce,409087
7,dairy eggs,217051
20,snacks,118862
3,beverages,114046
10,frozen,100426
16,pantry,81242
2,bakery,48394
6,canned goods,46799
8,deli,44291
9,dry goods pasta,38713


In [7]:
# Form a list of top 10 categories
top_10_depts = list(product_categories_count.head(10)["department"])
print(top_10_depts)

# Filter order_products to only contain top 10 depts
order_products_filtered = order_products_categories[
    order_products_categories["department"].isin(top_10_depts)
]

order_products_filtered["qty"] = 1

['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry', 'bakery', 'canned goods', 'deli', 'dry goods pasta']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_products_filtered["qty"] = 1


In [8]:
order_products_filtered["department"].unique()

array(['dairy eggs', 'produce', 'canned goods', 'beverages', 'deli',
       'snacks', 'pantry', 'frozen', 'bakery', 'dry goods pasta'],
      dtype=object)

In [9]:
order_products_filtered.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs,1
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,1
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,1
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce,1
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods,1


In [10]:
order_products_filtered[["product_name"]].nunique()

product_name    26306
dtype: int64

In [11]:
list(order_products_filtered[order_products_filtered["order_id"] == 1]["product_name"])

['Bulgarian Yogurt',
 'Organic 4% Milk Fat Whole Milk Cottage Cheese',
 'Organic Celery Hearts',
 'Cucumber Kirby',
 'Lightly Smoked Sardines in Olive Oil',
 'Bag of Organic Bananas',
 'Organic Hass Avocado',
 'Organic Whole String Cheese']

In [12]:
order_products_list = []
for order in order_products_filtered["order_id"].unique():
    print(order, end=' ')
    prods = list(order_products_filtered[order_products_filtered["order_id"] == order]["product_name"])
    order_products_list.append(prods)

1 36 38 96 98 112 170 218 226 349 393 456 473 631 762 774 844 878 904 915 988 1001 1032 1042 1077 1086 1119 1120 1139 1143 1145 1275 1280 1318 1325 1335 1342 1350 1468 1571 1572 1579 1591 1597 1620 1674 1682 1703 1721 1764 1851 1865 1890 1955 1983 1994 2011 2021 2029 2068 2087 2115 2191 2216 2318 2389 2415 2442 2445 2468 2530 2603 2711 2737 2822 2853 2869 2888 2889 2936 2937 2948 2985 3010 3037 3056 3068 3091 3103 3142 3160 3176 3179 3200 3209 3212 3243 3300 3309 3321 3327 3336 3349 3368 3378 3397 3405 3422 3473 3484 3509 3514 3527 3533 3563 3649 3733 3740 3752 3785 3817 3830 3861 3871 3898 3901 3938 3946 3957 3971 3979 4006 4011 4036 4067 4090 4092 4111 4146 4148 4164 4185 4189 4194 4201 4203 4217 4234 4250 4267 4269 4284 4305 4309 4340 4347 4361 4383 4415 4431 4437 4468 4494 4495 4505 4519 4556 4562 4577 4590 4633 4637 4638 4658 4677 4680 4695 4711 4726 4752 4772 4790 4801 4827 4844 4885 4914 4918 4968 4979 4984 4998 5029 5044 5046 5081 5116 5117 5299 5354 5356 5362 5365 5368 5404 54

In [13]:
order_products_list

[['Bulgarian Yogurt',
  'Organic 4% Milk Fat Whole Milk Cottage Cheese',
  'Organic Celery Hearts',
  'Cucumber Kirby',
  'Lightly Smoked Sardines in Olive Oil',
  'Bag of Organic Bananas',
  'Organic Hass Avocado',
  'Organic Whole String Cheese'],
 ['Grated Pecorino Romano Cheese',
  'Spring Water',
  'Organic Half & Half',
  'Super Greens Salad',
  'Cage Free Extra Large Grade AA Eggs',
  'Prosciutto, Americano',
  'Organic Garnet Sweet Potato (Yam)',
  'Asparagus'],
 ['Shelled Pistachios',
  'Organic Biologique Limes',
  'Organic Raw Unfiltered Apple Cider Vinegar',
  'Organic Baby Arugula',
  'Organic Hot House Tomato',
  'Green Peas',
  'Bunched Cilantro',
  'Flat Parsley, Bunch',
  'Fresh Dill'],
 ['Roasted Turkey',
  'Organic Cucumber',
  'Organic Grape Tomatoes',
  'Organic Pomegranate Kernels',
  'Organic Raspberries',
  'Organic Whole Strawberries',
  'Organic Blueberries'],
 ['Natural Spring Water',
  'Organic Orange Juice With Calcium & Vitamin D',
  'Whole Milk Greek Blen

order_products_list is a list of lists where each list consist of 1 transaction and its orders.  
For example:  

`[  
 ['Bulgarian Yogurt',
  'Organic 4% Milk Fat Whole Milk Cottage Cheese',
  'Organic Celery Hearts',
  'Cucumber Kirby',
  'Lightly Smoked Sardines in Olive Oil',
  'Bag of Organic Bananas',
  'Organic Hass Avocado',
  'Organic Whole String Cheese'],
 ['Grated Pecorino Romano Cheese',
  'Spring Water',
  'Organic Half & Half',
  'Super Greens Salad',
  'Cage Free Extra Large Grade AA Eggs',
  'Prosciutto, Americano',
  'Organic Garnet Sweet Potato (Yam)',
  'Asparagus']
]  
`  
This means that there are 2 transactions where the 1st transaction is the first list in order_products_list 

---

# Apriori Algorithm

#### Convert table to suitable format for apriori

In [14]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules,apriori
import time

support_threshold = 0.5

te = TransactionEncoder()
# te_ary = te.fit(order_products_list).transform(order_products_list)
# df = pd.DataFrame(te_ary, columns=te.columns_)

oht_ary = te.fit(order_products_list).transform(order_products_list, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
sparse_df


Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,...,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128819,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
128820,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
128821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
128822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
sparse_df.shape

(128824, 26306)

#### Run apriori algo

In [16]:
frequent_itemsets = apriori(sparse_df, min_support=0.001, use_colnames=True)

In [None]:
frequent_itemsets.sort_values("support", ascending=False)

Error: Kernel is dead

The frequent itemsets describe the frequency of purchase of a product based on the total number all the transactions.  
This means that approximately 14% of transactions have bananas, 12% have organic bananas... 

---

#### Get lift table based on apriori frequent itemsets df

In [None]:
lift = association_rules(frequent_itemsets, metric="lift", min_threshold=0.01)

In [None]:
lift.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bag of Organic Bananas),(Apple Honeycrisp Organic),0.120164,0.017481,0.005333,0.04438,2.538716,0.003232,1.028148
1,(Apple Honeycrisp Organic),(Bag of Organic Bananas),0.017481,0.120164,0.005333,0.305062,2.538716,0.003232,1.266064
2,(Asparagus),(Bag of Organic Bananas),0.030025,0.120164,0.005651,0.188211,1.566285,0.002043,1.083824
3,(Bag of Organic Bananas),(Asparagus),0.120164,0.030025,0.005651,0.047028,1.566285,0.002043,1.017842
4,(Asparagus),(Banana),0.030025,0.145361,0.006156,0.205016,1.410388,0.001791,1.075038


In [None]:
lift.shape

(244, 9)

In [None]:
lift.sort_values("support", ascending= False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
48,(Bag of Organic Bananas),(Organic Strawberries),0.120164,0.084565,0.023862,0.198579,2.348239,0.013700,1.142265
49,(Organic Strawberries),(Bag of Organic Bananas),0.084565,0.120164,0.023862,0.282174,2.348239,0.013700,1.225695
35,(Organic Hass Avocado),(Bag of Organic Bananas),0.056612,0.120164,0.018785,0.331825,2.761436,0.011983,1.316775
34,(Bag of Organic Bananas),(Organic Hass Avocado),0.120164,0.056612,0.018785,0.156331,2.761436,0.011983,1.118196
19,(Bag of Organic Bananas),(Organic Baby Spinach),0.120164,0.075949,0.017357,0.144444,1.901872,0.008231,1.080060
...,...,...,...,...,...,...,...,...,...
243,(Organic Strawberries),"(Bag of Organic Bananas, Organic Raspberries)",0.084565,0.013817,0.005038,0.059574,4.311557,0.003869,1.048655
94,(Banana),(Organic Peeled Whole Baby Carrots),0.145361,0.019096,0.005015,0.034497,1.806547,0.002239,1.015952
95,(Organic Peeled Whole Baby Carrots),(Banana),0.019096,0.145361,0.005015,0.262602,1.806547,0.002239,1.158992
215,(Organic Zucchini),(Organic Hass Avocado),0.035622,0.056612,0.005007,0.140553,2.482746,0.002990,1.097669


The antecedent and the consequent is used to inform us that the antecedent affects the buying of the consequent.
For an example Antecedent {A} -> Consequent {B}, this means that there is a strong relationship between customers that purchased {A} and also purchased {B} in the same transaction.

Note that both antecedents and consequents can have multiple items. For example, {A, D} -> {B, C} is a valid rule.

Support is the relative frequency that the rules show up. In many instances, we can look for high support in order to make sure it is a useful relationship. However, there may be instances where a low support is useful if you are trying to find “hidden” relationships.

> Support(B) = (Transactions containing (B))/(Total Transactions)


Confidence is a measure of the reliability of the rule. A confidence of .5 in the above example would mean that in 50% of the cases where {A} and {D} were purchased, the purchase also included {B} and {C}. For product recommendation, a 50% confidence may be perfectly acceptable but in a medical situation, this level may not be high enough.

> Confidence(A→B) = (Transactions containing both (A and B))/(Transactions containing A)


Lift is the ratio of the observed support to that expected if the two rules were independent. The basic rule of thumb is that a lift value close to 1 means the rules were completely independent. Lift values > 1 are generally more “interesting” and could be indicative of a useful rule pattern.

> Lift(A→B) = (Confidence (A→B))/(Support (B))