## Recommendation system using association rule mining (ARM) on the Instacart dataset

Dataset: https://mdporter.github.io/DS6030/other/instacart.html

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

### Loading dataset and preprocessing

In [7]:
# Read the products table from disk and preview its first rows.
products_csv = "../../data/products.csv"
product = pd.read_csv(products_csv)
product.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [8]:
# Read the order line items (one row per product added to an order) and preview them.
orders_csv = "../../data/order_products__train.csv"
order = pd.read_csv(orders_csv)
order.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [9]:
# Checking for null: info() reports the non-null count and dtype of every column.
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49688 entries, 0 to 49687
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     49688 non-null  int64 
 1   product_name   49688 non-null  object
 2   aisle_id       49688 non-null  int64 
 3   department_id  49688 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [10]:
# Check the order table for nulls: info() lists non-null counts and dtypes.
order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype
---  ------             --------------    -----
 0   order_id           1384617 non-null  int64
 1   product_id         1384617 non-null  int64
 2   add_to_cart_order  1384617 non-null  int64
 3   reordered          1384617 non-null  int64
dtypes: int64(4)
memory usage: 42.3 MB


In [21]:
# Attach product details to every order line. A left join keeps each order row
# even if its product_id has no match in the products table.
merged_dataset = order.merge(product, how="left", on="product_id")

In [22]:
# Preview the merged table: order columns followed by the product columns.
merged_dataset.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,1,49302,1,1,Bulgarian Yogurt,120,16
1,1,11109,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16
2,1,10246,3,0,Organic Celery Hearts,83,4
3,1,49683,4,0,Cucumber Kirby,83,4
4,1,43633,5,1,Lightly Smoked Sardines in Olive Oil,95,15


In [23]:
# Confirm the merge introduced no nulls (every order line found its product).
merged_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384617 entries, 0 to 1384616
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   order_id           1384617 non-null  int64 
 1   product_id         1384617 non-null  int64 
 2   add_to_cart_order  1384617 non-null  int64 
 3   reordered          1384617 non-null  int64 
 4   product_name       1384617 non-null  object
 5   aisle_id           1384617 non-null  int64 
 6   department_id      1384617 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 73.9+ MB


In [26]:
# Grouping items according to order: start with a frame that has one row per
# distinct order_id (the group keys of the per-order aggregation).
order_ids = merged_dataset.groupby("order_id")["product_name"].nunique().index
dataset = order_ids.to_frame(index=False)

In [30]:
# Attach each order's array of distinct product names. The values are assigned
# positionally, so this relies on the groupby here yielding orders in the same
# sequence as the groupby that produced the rows of `dataset`.
dataset["products"] = merged_dataset.groupby('order_id')['product_name'].unique().values

In [31]:
# Preview the per-order baskets.
dataset.head()

Unnamed: 0,order_id,products
0,1,"[Bulgarian Yogurt, Organic 4% Milk Fat Whole M..."
1,36,"[Grated Pecorino Romano Cheese, Spring Water, ..."
2,38,"[Shelled Pistachios, Organic Biologique Limes,..."
3,96,"[Roasted Turkey, Organic Cucumber, Organic Gra..."
4,98,"[Natural Spring Water, Organic Orange Juice Wi..."


In [32]:
# Collect the per-order product arrays into a plain Python list of transactions.
transactions = list(dataset['products'])

In [34]:
# Peek at the first two transactions.
transactions[:2]

[array(['Bulgarian Yogurt',
        'Organic 4% Milk Fat Whole Milk Cottage Cheese',
        'Organic Celery Hearts', 'Cucumber Kirby',
        'Lightly Smoked Sardines in Olive Oil', 'Bag of Organic Bananas',
        'Organic Hass Avocado', 'Organic Whole String Cheese'],
       dtype=object),
 array(['Grated Pecorino Romano Cheese', 'Spring Water',
        'Organic Half & Half', 'Super Greens Salad',
        'Cage Free Extra Large Grade AA Eggs', 'Prosciutto, Americano',
        'Organic Garnet Sweet Potato (Yam)', 'Asparagus'], dtype=object)]

### Apriori Algorithm

In [35]:
from apyori import apriori

In [36]:
# Mine pairwise association rules with apyori.
# - `min_confidence` was previously misspelled as `min_confidance`. apyori reads
#   its thresholds from **kwargs, so the misspelled key was silently ignored
#   and the confidence threshold stayed at its default instead of 0.01.
# - `max_length=2` restricts the search to item pairs. apyori has no
#   `min_length` option (it was silently ignored as well), so it is not passed.
rules = apriori(
    transactions=transactions,
    min_support=0.00030,
    min_confidence=0.01,
    min_lift=3,
    max_length=2,
)

# apriori() yields its records lazily; materialise them into a list so they
# can be traversed more than once.
results = list(rules)

def inspect(results):
    '''
    Flatten apriori relation records into row tuples ready for a DataFrame.

    Each record is indexed as (items, support, ordered_statistics). Only the
    first ordered statistic of a record is used, and only the first item of
    its left-hand side and of its right-hand side.

    `results` is traversed exactly once, so any iterable works, including a
    one-shot iterator or generator. The earlier version iterated it five
    times and silently returned an empty list for such inputs.

    Returns a list of (lhs, rhs, support, confidence, lift) tuples.
    '''
    rows = []
    for record in results:
        first_stat = record[2][0]
        lhs = tuple(first_stat[0])[0]
        rhs = tuple(first_stat[1])[0]
        rows.append((lhs, rhs, record[1], first_stat[2], first_stat[3]))
    return rows

# Tabulate the flattened rules, one row per rule, and preview the first few.
rule_columns = ['Item #1', 'Item #2', 'Support', 'Confidence', 'Lift']
resultsinDataFrame = pd.DataFrame(inspect(results), columns=rule_columns)
resultsinDataFrame.head()

Unnamed: 0,Item #1,Item #2,Support,Confidence,Lift
0,0% Greek Strained Yogurt,Clementines,0.000602,0.191283,18.577416
1,0% Greek Strained Yogurt,Hass Avocados,0.000343,0.108959,6.281362
2,0% Greek Strained Yogurt,Raspberries,0.000312,0.099274,3.972428
3,0% Greek Strained Yogurt,Soda,0.00045,0.142857,12.438051
4,0% Greek Strained Yogurt,Trail Mix,0.000389,0.123487,30.398807


In [37]:
# Show the ten rules with the largest lift, strongest association first.
top_by_lift = resultsinDataFrame.nlargest(10, 'Lift')
top_by_lift

Unnamed: 0,Item #1,Item #2,Support,Confidence,Lift
2880,Unsweetened Whole Milk Peach Greek Yogurt,Unsweetened Whole Milk Strawberry Yogurt,0.000381,0.531915,606.887142
2878,Unsweetened Whole Milk Blueberry Greek Yogurt,Unsweetened Whole Milk Strawberry Yogurt,0.000328,0.5,570.473913
2879,Unsweetened Whole Milk Mixed Berry Greek Yogurt,Unsweetened Whole Milk Strawberry Yogurt,0.000312,0.493976,563.600733
2877,Unsweetened Blackberry Water,Unsweetened Watermelon Water,0.000305,0.449438,440.077142
1300,Organic 4 Months Butternut Squash Carrots Appl...,Stage 1 Apples Sweet Potatoes Pumpkin & Bluebe...,0.000335,0.453608,407.654004
1538,Organic Blended Raspberry Whole Milk Greek Yogurt,Organic Greek Whole Milk Blended Strawberry Yo...,0.000305,0.416667,364.469444
57,Almond Milk Blueberry Yogurt,Almond Milk Peach Yogurt,0.000594,0.430939,353.394406
2433,"Organic Pears, Peas and Broccoli Puree Stage 1",Stage 1 Apples Sweet Potatoes Pumpkin & Bluebe...,0.00032,0.365217,328.217868
37,Acai Berry Chia Bar,Coconut Chia Bar,0.000381,0.409836,316.318708
58,Almond Milk Blueberry Yogurt,Almond Milk Strawberry Yogurt,0.00061,0.441989,310.122611


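From the result, we can observe that the highest-lift rules pair different flavours of the same product line: for example, the peach, blueberry and mixed berry Unsweetened Whole Milk Greek yogurts are each frequently bought together with the strawberry variety.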

### FP-Growth Algorithm

In [38]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [39]:
# One-hot encode the transactions (one row per order, one column per product).
# A dense boolean matrix needs a byte for every (order, product) cell, which is
# several gigabytes for a table of this size (see the row count in the output
# below). Asking the encoder for a SciPy sparse matrix and wrapping it in a
# sparse DataFrame keeps only the True entries; fpgrowth accepts sparse frames.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions, sparse=True)
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
df

Unnamed: 0,#2 Coffee Filters,#2 Cone White Coffee Filters,#2 Mechanical Pencils,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Acai Raspberry Water Beverage,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,...,with Sweet Cinnamon Bunches Cereal,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Unwrapped Spearmint 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131204,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131205,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131206,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
131207,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [45]:
# Mine frequent itemsets of up to three products with FP-Growth. Despite its
# name, `fp_rules` holds itemsets with their support, not association rules.
fp_rules = fpgrowth(
    df,
    use_colnames=True,
    max_len=3,
    min_support=0.00030,
)

In [46]:
# Display the frequent itemsets together with their support.
fp_rules

Unnamed: 0,support,itemsets
0,0.117980,(Bag of Organic Bananas)
1,0.055583,(Organic Hass Avocado)
2,0.018391,(Cucumber Kirby)
3,0.015190,(Organic Whole String Cheese)
4,0.008094,(Organic Celery Hearts)
...,...,...
25778,0.000412,"(Yogurt, Lowfat, Strawberry, Organic Peach Low..."
25779,0.000358,"(Bag of Organic Bananas, 100% Pure Apple Juice)"
25780,0.000351,"(Banana, 100% Pure Apple Juice)"
25781,0.000312,"(Bag of Organic Bananas, Lactose Free Fat Free..."


In [47]:
def filter_item(items):
    """Return True unless the itemset holds exactly one item."""
    return len(items) != 1

In [48]:
# Keep only the itemsets accepted by filter_item and show the first ten.
itemset_mask = fp_rules["itemsets"].apply(filter_item)
fp_rules[itemset_mask].head(10)

Unnamed: 0,support,itemsets
5480,0.018444,"(Organic Hass Avocado, Bag of Organic Bananas)"
5481,0.011729,"(Organic Strawberries, Organic Hass Avocado)"
5482,0.009542,"(Organic Hass Avocado, Organic Baby Spinach)"
5483,0.007217,"(Banana, Organic Hass Avocado)"
5484,0.005587,"(Organic Hass Avocado, Large Lemon)"
5485,0.005411,"(Organic Strawberries, Organic Hass Avocado, B..."
5486,0.002515,"(Organic Strawberries, Organic Hass Avocado, O..."
5487,0.003788,"(Organic Hass Avocado, Organic Baby Spinach, B..."
5488,0.001623,"(Banana, Organic Strawberries, Organic Hass Av..."
5489,0.001341,"(Banana, Organic Hass Avocado, Organic Baby Sp..."


In [49]:
def filter_item(items, size=3):
    """Return True when the itemset holds exactly `size` items.

    `size` defaults to 3, so one-argument callers (such as `Series.apply`)
    keep selecting three-item sets.

    Note: this reuses the name of the earlier `filter_item`, so it replaces
    that definition for every cell executed after this one.
    """
    return len(items) == size

In [59]:
# Widen text columns to 100 characters so long itemsets are not truncated.
pd.options.display.max_colwidth = 100

# Keep only the itemsets accepted by filter_item and show the first ten.
selected_mask = fp_rules["itemsets"].apply(filter_item)
fp_rules[selected_mask].head(10)

Unnamed: 0,support,itemsets
5485,0.005411,"(Organic Strawberries, Organic Hass Avocado, Bag of Organic Bananas)"
5486,0.002515,"(Organic Strawberries, Organic Hass Avocado, Organic Baby Spinach)"
5487,0.003788,"(Organic Hass Avocado, Organic Baby Spinach, Bag of Organic Bananas)"
5488,0.001623,"(Banana, Organic Strawberries, Organic Hass Avocado)"
5489,0.001341,"(Banana, Organic Hass Avocado, Organic Baby Spinach)"
5490,0.001532,"(Organic Hass Avocado, Large Lemon, Bag of Organic Bananas)"
5491,0.00128,"(Organic Hass Avocado, Large Lemon, Organic Baby Spinach)"
5492,0.001219,"(Organic Strawberries, Organic Hass Avocado, Large Lemon)"
5493,0.001319,"(Banana, Organic Hass Avocado, Large Lemon)"
5539,0.000358,"(Bag of Organic Bananas, Organic Baby Spinach, Cucumber Kirby)"


From the result, we can observe that a lot of organic products are bought together.