# Imports

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# from apyori import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules,apriori
import time

---

# General data preparation

In [3]:
# order_products = pd.concat([pd.read_csv('../data/order_products__prior.csv'),
#                             pd.read_csv('../data/order_products__train.csv')])
order_products = pd.read_csv('../data/order_products__train.csv')     
products = pd.read_csv('../data/products.csv')
orders = pd.read_csv('../data/orders.csv')
departments = pd.read_csv('../data/departments.csv')
aisles = pd.read_csv('../data/aisles.csv')

In [4]:
# Join products with aisles to get product aisle categories
product_aisles = pd.merge(products, aisles, on="aisle_id", how="left")

# Join product with department to get product department categories
product_aisles_department = pd.merge(product_aisles, departments, on="department_id", how="left")

# Join order_products with products to get product categories
order_products_categories = pd.merge(order_products, product_aisles_department, on="product_id", how="left")

In [5]:
order_products_categories.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods


In [6]:
# Top 10 product categories ordered:
product_categories_count = (
    order_products_categories.groupby(["department"])
    .size()
    .reset_index(name="counts")
    .sort_values("counts", ascending=False)
)

In [7]:
product_categories_count.head(10)

Unnamed: 0,department,counts
19,produce,409087
7,dairy eggs,217051
20,snacks,118862
3,beverages,114046
10,frozen,100426
16,pantry,81242
2,bakery,48394
6,canned goods,46799
8,deli,44291
9,dry goods pasta,38713


In [8]:
# Form a list of top 10 categories
top_10_depts = list(product_categories_count.head(10)["department"])
print(top_10_depts)

# Filter order_products to only contain top 10 depts
order_products_filtered = order_products_categories[
    order_products_categories["department"].isin(top_10_depts)
]

order_products_filtered["qty"] = 1

['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry', 'bakery', 'canned goods', 'deli', 'dry goods pasta']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  order_products_filtered["qty"] = 1


In [4]:
# order_products_filtered.to_csv("../data/order_products_filtered.csv", index=None)
order_products_filtered = pd.read_csv('../data/order_products_filtered.csv')

In [5]:
order_products_filtered.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs,1
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,1
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,1
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce,1
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods,1


In [6]:
order_products_filtered["department"].unique()

array(['dairy eggs', 'produce', 'canned goods', 'beverages', 'deli',
       'snacks', 'pantry', 'frozen', 'bakery', 'dry goods pasta'],
      dtype=object)

In [7]:
order_products_filtered.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,qty
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs,1
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs,1
2,1,10246,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce,1
3,1,49683,4,0,Cucumber Kirby,83,4,fresh vegetables,produce,1
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods,1


In [8]:
order_products_filtered[["product_name"]].nunique()

product_name    26306
dtype: int64

In [9]:
list(order_products_filtered[order_products_filtered["order_id"] == 1]["product_name"])

['Bulgarian Yogurt',
 'Organic 4% Milk Fat Whole Milk Cottage Cheese',
 'Organic Celery Hearts',
 'Cucumber Kirby',
 'Lightly Smoked Sardines in Olive Oil',
 'Bag of Organic Bananas',
 'Organic Hass Avocado',
 'Organic Whole String Cheese']

In [10]:
order_products_list = []
for order in order_products_filtered["order_id"].unique():
    # print(order, end=' ')
    prods = list(order_products_filtered[order_products_filtered["order_id"] == order]["product_name"])
    order_products_list.append(prods)

In [11]:
order_products_list[0]

['Bulgarian Yogurt',
 'Organic 4% Milk Fat Whole Milk Cottage Cheese',
 'Organic Celery Hearts',
 'Cucumber Kirby',
 'Lightly Smoked Sardines in Olive Oil',
 'Bag of Organic Bananas',
 'Organic Hass Avocado',
 'Organic Whole String Cheese']

order_products_list is a list of lists where each list consist of 1 transaction and its orders.  
For example:  

`[  
 ['Bulgarian Yogurt',
  'Organic 4% Milk Fat Whole Milk Cottage Cheese',
  'Organic Celery Hearts',
  'Cucumber Kirby',
  'Lightly Smoked Sardines in Olive Oil',
  'Bag of Organic Bananas',
  'Organic Hass Avocado',
  'Organic Whole String Cheese'],
 ['Grated Pecorino Romano Cheese',
  'Spring Water',
  'Organic Half & Half',
  'Super Greens Salad',
  'Cage Free Extra Large Grade AA Eggs',
  'Prosciutto, Americano',
  'Organic Garnet Sweet Potato (Yam)',
  'Asparagus']
]  
`  
This means that there are 2 transactions where the 1st transaction is the first list in order_products_list 

---

# Apriori Algorithm

#### Convert table to suitable format for apriori

In [13]:
support_threshold = 0.5

te = TransactionEncoder()
# te_ary = te.fit(order_products_list).transform(order_products_list)
# df = pd.DataFrame(te_ary, columns=te.columns_)

oht_ary = te.fit(order_products_list).transform(order_products_list, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
sparse_df.head()


Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,...,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
sparse_df.shape

(128824, 26306)

In [25]:
sparse_df.head()

Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,...,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Run apriori algo

In [89]:
# Here we determine the minimum number of transactions within the dataset that we 
# need to see an occurrence for it to be deemed "frequent"

min_occurrence = 50
min_support = min_occurrence/len(sparse_df)
print(f"This means that out of {len(sparse_df)} transactions, we will deem a 'frequent itemset'")
print(f"to have minimally appeared in all transactions at least {min_occurrence} times.")

print(f"Hence, min_support = {min_occurrence}/{len(sparse_df)}")
print(f"\t \t   = {min_support}")


This means that out of 128824 transactions, we will deem a 'frequent itemset'
to have minimally appeared in all transactions at least 20 times.
Hence, min_support = 20/128824
	 	   = 0.0001552505744271254


In [98]:
from mlxtend.frequent_patterns import fpgrowth, fpmax
from mlxtend.frequent_patterns import association_rules

frequent_itemsets = fpgrowth(sparse_df, min_support=min_support, use_colnames=True)

frequent_itemsets.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
95,0.145361,(Banana)
0,0.120164,(Bag of Organic Bananas)
77,0.084565,(Organic Strawberries)
141,0.075949,(Organic Baby Spinach)
114,0.063148,(Large Lemon)
...,...,...
54181,0.000155,"(Whole Chia Seeds, Banana)"
43657,0.000155,"(Organic Tomato Cluster, Large Lemon, Bag of O..."
34581,0.000155,"(Organic Genoa Salami, Organic Raspberries, Or..."
66764,0.000155,"(Sweet Potato Tortilla Chips, Organic Strawber..."


The frequent itemsets describe the frequency of purchase of a product based on the total number all the transactions.  
This means that approximately 14% of transactions have bananas, 12% have organic bananas... 

#### Get rules based on frequent itemsets df

In [97]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.1)
# rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.001, support_only=True)

rules.sort_values("support", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
52816,(Organic Strawberries),(Bag of Organic Bananas),0.084565,0.120164,0.023862,0.282174,2.348239,0.013700,1.225695
52817,(Bag of Organic Bananas),(Organic Strawberries),0.120164,0.084565,0.023862,0.198579,2.348239,0.013700,1.142265
0,(Organic Hass Avocado),(Bag of Organic Bananas),0.056612,0.120164,0.018785,0.331825,2.761436,0.011983,1.316775
1,(Bag of Organic Bananas),(Organic Hass Avocado),0.120164,0.056612,0.018785,0.156331,2.761436,0.011983,1.118196
79348,(Organic Baby Spinach),(Bag of Organic Bananas),0.075949,0.120164,0.017357,0.228536,1.901872,0.008231,1.140476
...,...,...,...,...,...,...,...,...,...
113224,"(Organic Italian Parsley Bunch, Organic Rosemary)",(Organic Garlic),0.000427,0.032277,0.000155,0.363636,11.266256,0.000141,1.520708
113223,"(Organic Garlic, Organic Italian Parsley Bunch)",(Organic Rosemary),0.003819,0.003035,0.000155,0.040650,13.393217,0.000144,1.039209
113222,"(Organic Garlic, Organic Rosemary)",(Organic Italian Parsley Bunch),0.000543,0.018630,0.000155,0.285714,15.336190,0.000145,1.373918
113215,(Limes),"(Large Lemon, Organic Rosemary)",0.046831,0.000660,0.000155,0.003315,5.024288,0.000124,1.002664


The antecedent and the consequent is used to inform us that the antecedent affects the buying of the consequent.
For an example Antecedent {A} -> Consequent {B}, this means that there is a strong relationship between customers that purchased {A} and also purchased {B} in the same transaction.

Note that both antecedents and consequents can have multiple items. For example, {A, D} -> {B, C} is a valid rule.

Support is the relative frequency that the rules show up. In many instances, we can look for high support in order to make sure it is a useful relationship. However, there may be instances where a low support is useful if you are trying to find “hidden” relationships.

> Support(B) = (Transactions containing (B))/(Total Transactions)


Confidence is a measure of the reliability of the rule. A confidence of .5 in the above example would mean that in 50% of the cases where {A} and {D} were purchased, the purchase also included {B} and {C}. For product recommendation, a 50% confidence may be perfectly acceptable but in a medical situation, this level may not be high enough.

> Confidence(A→B) = (Transactions containing both (A and B))/(Transactions containing A)


Lift is the ratio of the observed support to that expected if the two rules were independent. The basic rule of thumb is that a lift value close to 1 means the rules were completely independent. Lift values > 1 are generally more “interesting” and could be indicative of a useful rule pattern.

> Lift(A→B) = (Confidence (A→B))/(Support (B))

Post filtering of all rules

In [102]:
rules['antecedents_len'] = rules['antecedents'].apply(lambda x: len(x))
rules['consequents_len'] = rules['consequents'].apply(lambda x: len(x))
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_len,consequents_len
0,(Organic Hass Avocado),(Bag of Organic Bananas),0.056612,0.120164,0.018785,0.331825,2.761436,0.011983,1.316775,1,1
1,(Bag of Organic Bananas),(Organic Hass Avocado),0.120164,0.056612,0.018785,0.156331,2.761436,0.011983,1.118196,1,1
2,(Organic Hass Avocado),(Organic Strawberries),0.056612,0.084565,0.011947,0.211024,2.495409,0.007159,1.160283,1,1
3,(Organic Strawberries),(Organic Hass Avocado),0.084565,0.056612,0.011947,0.141270,2.495409,0.007159,1.098586,1,1
4,(Organic Baby Spinach),(Organic Hass Avocado),0.075949,0.056612,0.009719,0.127964,2.260364,0.005419,1.081822,1,1
...,...,...,...,...,...,...,...,...,...,...,...
285419,(Organic Sweet Potato Fries),(Organic Strawberries),0.000660,0.084565,0.000210,0.317647,3.756248,0.000154,1.341586,1,1
285420,(Organic Baby Spinach),(Organic Sweet Potato Fries),0.075949,0.000660,0.000155,0.002044,3.098071,0.000105,1.001387,1,1
285421,(Organic Sweet Potato Fries),(Organic Baby Spinach),0.000660,0.075949,0.000155,0.235294,3.098071,0.000105,1.208375,1,1
285422,(YoKids Squeeze Organic Blueberry Blue Yogurt),(YoKids Squeeze! Organic Strawberry Flavor Yog...,0.000411,0.001459,0.000194,0.471698,323.223605,0.000193,1.890095,1,1


In [110]:
rules[rules['antecedents_len'] == 4]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_len,consequents_len
162,"(Large Lemon, Organic Baby Spinach, Bag of Org...",(Organic Strawberries),0.000419,0.084565,0.000155,0.370370,4.379713,0.000120,1.453926,4,1
163,"(Large Lemon, Organic Strawberries, Organic Ba...",(Organic Hass Avocado),0.000551,0.056612,0.000155,0.281690,4.975792,0.000124,1.313344,4,1
164,"(Organic Strawberries, Organic Baby Spinach, B...",(Large Lemon),0.001265,0.063148,0.000155,0.122699,1.943039,0.000075,1.067880,4,1
165,"(Large Lemon, Organic Baby Spinach, Organic St...",(Bag of Organic Bananas),0.000349,0.120164,0.000155,0.444444,3.698651,0.000113,1.583705,4,1
166,"(Organic Strawberries, Large Lemon, Bag of Org...",(Organic Baby Spinach),0.000559,0.075949,0.000155,0.277778,3.657445,0.000113,1.279456,4,1
...,...,...,...,...,...,...,...,...,...,...,...
245018,"(Organic Kiwi, Organic Strawberries, Organic H...",(Organic Raspberries),0.000605,0.043051,0.000233,0.384615,8.933951,0.000207,1.555042,4,1
245019,"(Organic Kiwi, Organic Hass Avocado, Bag of Or...",(Organic Strawberries),0.000427,0.084565,0.000233,0.545455,6.450123,0.000197,2.013957,4,1
245020,"(Organic Kiwi, Organic Strawberries, Organic R...",(Organic Hass Avocado),0.000582,0.056612,0.000233,0.400000,7.065625,0.000200,1.572313,4,1
245021,"(Organic Strawberries, Organic Hass Avocado, B...",(Organic Kiwi),0.001770,0.014594,0.000233,0.131579,9.016237,0.000207,1.134710,4,1


---