### Business Background

Customers place online grocery orders through the Instacart grocery delivery service. When a customer buys an item, which related or complementary items should be presented to them to promote cross-selling?
To answer this, we mine association rules between grocery items from historical orders.

In [1]:
# Notebook-wide imports and plotting defaults.
#
# NOTE(review): the sys.path entry below is an absolute path on one developer's
# machine; on any other machine it will not exist. Prefer running the notebook
# with a kernel whose environment already has these packages installed.
import sys
sys.path.append('/Users/owenwei/opt/anaconda3/lib/python3.9/site-packages/')
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)

# NOTE(review): numpy, sklearn.preprocessing and train_test_split are not used by
# any cell visible in this part of the notebook — confirm before removing.
from sklearn.model_selection import train_test_split
import seaborn as sns
# Two consecutive seaborn style calls; the second one runs last.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# NOTE(review): `apriori` is imported twice. The mlxtend import two lines below
# rebinds the name, so every later `apriori(...)` call uses
# mlxtend.frequent_patterns.apriori and the apyori import is effectively unused.
from apyori import apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Apriori algorithm

In [2]:
# Load the order line items (one row per product placed in an order).
orders_csv_path = 'order_products__train.csv'
order = pd.read_csv(orders_csv_path)
order

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [3]:
# Load the product catalog (product_id, product_name, aisle and department ids).
products_csv_path = 'products.csv'
product = pd.read_csv(products_csv_path)
product

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [4]:
# Build a product_id -> product_name lookup from the catalog, then use it to
# attach a readable product name to every order line.
product_name_by_id = product.set_index('product_id')['product_name']
order['product_name'] = order['product_id'].map(product_name_by_id)
order

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name
0,1,49302,1,1,Bulgarian Yogurt
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,1,10246,3,0,Organic Celery Hearts
3,1,49683,4,0,Cucumber Kirby
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil
...,...,...,...,...,...
1384612,3421063,14233,3,1,Natural Artesian Water
1384613,3421063,35548,4,1,Twice Baked Potatoes
1384614,3421070,35951,1,1,Organic Unsweetened Almond Milk
1384615,3421070,16953,2,1,Creamy Peanut Butter


In [5]:
# Collapse the order lines into one basket per order: each order_id maps to the
# list of product names it contains.
products_per_order = order.groupby('order_id')['product_name'].apply(list)
grouped_order = products_per_order.reset_index(name='products')
grouped_order

Unnamed: 0,order_id,products
0,1,"[Bulgarian Yogurt, Organic 4% Milk Fat Whole M..."
1,36,"[Grated Pecorino Romano Cheese, Spring Water, ..."
2,38,"[Shelled Pistachios, Organic Biologique Limes,..."
3,96,"[Roasted Turkey, Organic Cucumber, Organic Gra..."
4,98,"[Natural Spring Water, Organic Orange Juice Wi..."
...,...,...
131204,3421049,"[Gluten Free Rice Bread, Organic Whole Grain W..."
131205,3421056,"[Total Plain Greek Strained Yogurt, Homestyle ..."
131206,3421058,[Wine Infused Salame Cheese and Crackers Small...
131207,3421063,"[Organic Half & Half, No Salt Added Gluten-Fre..."


In [6]:
# The transactions to encode: one list of product names per order.
dataset = grouped_order.loc[:, 'products']

In [7]:
# One-hot encode the baskets: one row per order, one boolean column per
# distinct product name.
#
# The encoding is requested in sparse form. With ~131k orders and one column per
# distinct product, a dense boolean matrix costs gigabytes of memory even though
# almost every cell is False (an order holds only a handful of products).
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)

# Wrap the SciPy sparse matrix in a sparse-backed DataFrame. mlxtend's apriori
# accepts sparse DataFrames, so the mining step can consume `df` as before.
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
df

Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#2 Mechanical Pencils,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,...,with Sweet Cinnamon Bunches Cereal,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131204,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131205,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131206,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131207,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Mine the itemsets that occur in at least 1% of all orders. use_colnames=True
# reports itemsets by product name rather than by column index.
min_support_fraction = 0.01
frequent_itemsets = apriori(
    df,
    min_support=min_support_fraction,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.017514,(100% Whole Wheat Bread)
1,0.011737,(2% Reduced Fat Milk)
2,0.017163,(Apple Honeycrisp Organic)
3,0.029480,(Asparagus)
4,0.117980,(Bag of Organic Bananas)
...,...,...
115,0.010281,"(Organic Avocado, Large Lemon)"
116,0.010685,"(Organic Baby Spinach, Organic Avocado)"
117,0.012492,"(Organic Baby Spinach, Organic Strawberries)"
118,0.011729,"(Organic Hass Avocado, Organic Strawberries)"


In [9]:
# Derive association rules from the frequent itemsets, scored by lift with a
# minimum threshold of 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Keep only the rules whose lift is strictly greater than 1.
lift_above_one = rules['lift'] > 1
filtered_rules = rules.loc[lift_above_one]
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bag of Organic Bananas),(Organic Baby Spinach),0.11798,0.074568,0.017042,0.144444,1.937082,0.008244,1.081674,0.548468
1,(Organic Baby Spinach),(Bag of Organic Bananas),0.074568,0.11798,0.017042,0.228536,1.937082,0.008244,1.143308,0.522739
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.11798,0.055583,0.018444,0.156331,2.81256,0.011886,1.119416,0.730654
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.055583,0.11798,0.018444,0.331825,2.81256,0.011886,1.320044,0.682381
4,(Organic Raspberries),(Bag of Organic Bananas),0.042268,0.11798,0.013566,0.320952,2.7204,0.008579,1.298907,0.660318
5,(Bag of Organic Bananas),(Organic Raspberries),0.11798,0.042268,0.013566,0.114987,2.7204,0.008579,1.082167,0.716998
6,(Bag of Organic Bananas),(Organic Strawberries),0.11798,0.083028,0.023428,0.198579,2.391714,0.013633,1.144183,0.659724
7,(Organic Strawberries),(Bag of Organic Bananas),0.083028,0.11798,0.023428,0.282174,2.391714,0.013633,1.228738,0.634577
8,(Banana),(Large Lemon),0.142719,0.062,0.016447,0.115241,1.858714,0.007598,1.060175,0.538905
9,(Large Lemon),(Banana),0.062,0.142719,0.016447,0.265274,1.858714,0.007598,1.166803,0.492531


In [10]:
# Rank the rules by support, most frequently co-purchased pairs first.
sorted_df = filtered_rules.sort_values(
    by='support',
    ascending=False,
)
sorted_df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6,(Bag of Organic Bananas),(Organic Strawberries),0.11798,0.083028,0.023428,0.198579,2.391714,0.013633,1.144183,0.659724
7,(Organic Strawberries),(Bag of Organic Bananas),0.083028,0.11798,0.023428,0.282174,2.391714,0.013633,1.228738,0.634577
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.11798,0.055583,0.018444,0.156331,2.81256,0.011886,1.119416,0.730654
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.055583,0.11798,0.018444,0.331825,2.81256,0.011886,1.320044,0.682381
0,(Bag of Organic Bananas),(Organic Baby Spinach),0.11798,0.074568,0.017042,0.144444,1.937082,0.008244,1.081674,0.548468
1,(Organic Baby Spinach),(Bag of Organic Bananas),0.074568,0.11798,0.017042,0.228536,1.937082,0.008244,1.143308,0.522739
12,(Banana),(Organic Avocado),0.142719,0.056467,0.016889,0.118338,2.095698,0.00883,1.070175,0.609872
13,(Organic Avocado),(Banana),0.056467,0.142719,0.016889,0.299096,2.095698,0.00883,1.223107,0.554122
17,(Organic Strawberries),(Banana),0.083028,0.142719,0.016569,0.199559,1.398269,0.004719,1.071012,0.31062
16,(Banana),(Organic Strawberries),0.142719,0.083028,0.016569,0.116095,1.398269,0.004719,1.037411,0.332248


In [11]:
# Rank the rules by lift, strongest associations first.
# Note: this rebinds `sorted_df`, replacing the support-ordered frame.
sorted_df = filtered_rules.sort_values(
    by='lift',
    ascending=False,
)
sorted_df

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
21,(Large Lemon),(Limes),0.062,0.04598,0.012156,0.196066,4.264159,0.009305,1.18669,0.816085
20,(Limes),(Large Lemon),0.04598,0.062,0.012156,0.264379,4.264159,0.009305,1.275113,0.802381
31,(Organic Strawberries),(Organic Raspberries),0.083028,0.042268,0.012728,0.153295,3.62671,0.009218,1.131128,0.789847
30,(Organic Raspberries),(Organic Strawberries),0.042268,0.083028,0.012728,0.301118,3.62671,0.009218,1.312056,0.756233
23,(Large Lemon),(Organic Avocado),0.062,0.056467,0.010281,0.165827,2.936692,0.00678,1.131099,0.703071
22,(Organic Avocado),(Large Lemon),0.056467,0.062,0.010281,0.182076,2.936692,0.00678,1.146805,0.698948
3,(Organic Hass Avocado),(Bag of Organic Bananas),0.055583,0.11798,0.018444,0.331825,2.81256,0.011886,1.320044,0.682381
2,(Bag of Organic Bananas),(Organic Hass Avocado),0.11798,0.055583,0.018444,0.156331,2.81256,0.011886,1.119416,0.730654
5,(Bag of Organic Bananas),(Organic Raspberries),0.11798,0.042268,0.013566,0.114987,2.7204,0.008579,1.082167,0.716998
4,(Organic Raspberries),(Bag of Organic Bananas),0.042268,0.11798,0.013566,0.320952,2.7204,0.008579,1.298907,0.660318
