# Association Rules: Market Basket Analysis Instacart

* **Practical Business Python**: https://pbpython.com/market-basket-analysis.html
* **Market Basket Analysis Notebook**: https://github.com/chris1610/pbpython/blob/master/notebooks/Market_Basket_Intro.ipynb

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import time

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.model_selection import train_test_split
import random

In [2]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
products = pd.read_csv('../../data/01_raw/instacart_2017_05_01/products.csv')
aisles = pd.read_csv('../../data/01_raw/instacart_2017_05_01/aisles.csv')
departments = pd.read_csv('../../data/01_raw/instacart_2017_05_01/departments.csv')

order_products__prior = pd.read_csv('../../data/01_raw/instacart_2017_05_01/order_products__prior.csv')
order_test = pd.read_csv('../../data/01_raw/instacart_2017_05_01/orders.csv')

In [4]:
print('Products Row & Column Count: ', products.shape)
print('Aisles Row & Column Count: ', aisles.shape)
print('Department Row & Column Count: ', departments.shape)
print('Order Row & Column Count: ', order_products__prior.shape)

Products Row & Column Count:  (49688, 4)
Aisles Row & Column Count:  (134, 2)
Department Row & Column Count:  (21, 2)
Order Row & Column Count:  (32434489, 4)


### Combine Datasets

We need to make sure that we can match carts with the names of products as well as users. In order to do this we will need to combine the dataset on a common key. Let's first start by combining product and ailes dataset.

In [5]:
prod_ailes = products.merge(aisles, 
              how='outer', 
              on='aisle_id', 
               suffixes=('_x', '_y')
              )

In [6]:
product_dataset = prod_ailes.merge(departments, 
                how='outer', 
                on='department_id')

In [7]:
specific_orders = order_products__prior.merge(product_dataset, 
                how='left', 
                on='product_id')

In [8]:
baskets = specific_orders.merge(order_test, 
                     how='left', 
                     on='order_id')

There are some interesting columns that we don't need
1. add to cart order
1. aile_id
1. depaertment_id
1. aisle
1. eval_set
1. order_number
1. order_dow
1. hour of the day
 

In [9]:
baskets

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,Organic Egg Whites,86,16,eggs,dairy eggs,202279,prior,3,5,9,8.0
1,2,28985,2,1,Michigan Organic Kale,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0
2,2,9327,3,0,Garlic Powder,104,13,spices seasonings,pantry,202279,prior,3,5,9,8.0
3,2,45918,4,1,Coconut Butter,19,13,oils vinegars,pantry,202279,prior,3,5,9,8.0
4,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry,202279,prior,3,5,9,8.0
5,2,17794,6,1,Carrots,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0
6,2,40141,7,1,Original Unflavored Gelatine Mix,105,13,doughs gelatins bake mixes,pantry,202279,prior,3,5,9,8.0
7,2,1819,8,1,All Natural No Stir Creamy Almond Butter,88,13,spreads,pantry,202279,prior,3,5,9,8.0
8,2,43668,9,0,Classic Blend Cole Slaw,123,4,packaged vegetables fruits,produce,202279,prior,3,5,9,8.0
9,3,33754,1,1,Total 2% with Strawberry Lowfat Greek Strained...,120,16,yogurt,dairy eggs,205970,prior,16,5,17,12.0


In [8]:
baskets.drop(columns = ['add_to_cart_order', 'reordered', 'aisle_id', 
                        'department_id', 'aisle', 'eval_set', 'order_number', 
                        'order_dow', 'order_hour_of_day'], inplace=True)

In [9]:
baskets.shape

(32434489, 6)

In [10]:
baskets.product_name.nunique()

49677

In [11]:
baskets.department.unique()

array(['dairy eggs', 'produce', 'pantry', 'meat seafood', 'bakery',
       'personal care', 'snacks', 'breakfast', 'beverages', 'deli',
       'household', 'international', 'dry goods pasta', 'frozen',
       'canned goods', 'babies', 'pets', 'alcohol', 'bulk', 'missing',
       'other'], dtype=object)

In [12]:
baskets_food = baskets.loc[(baskets['department']!='personal care')&
                           (baskets['department']!='household')&
                           (baskets['department']!='babies')&
                           (baskets['department']!='pets')&
                           (baskets['department']!='other')&
                           (baskets['department']!='alcohol')&
                           (baskets['department']!='missing')&
                           (baskets['department']!='beverages')&
                           (baskets['department']!='snacks')]

In [13]:
baskets_food.department.unique()

array(['dairy eggs', 'produce', 'pantry', 'meat seafood', 'bakery',
       'breakfast', 'deli', 'international', 'dry goods pasta', 'frozen',
       'canned goods', 'bulk'], dtype=object)

In [14]:
baskets_food.product_name.nunique()

24495

In [15]:
baskets_food.drop(columns=['product_id', 'days_since_prior_order', 'department'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
baskets_food.head()

Unnamed: 0,order_id,product_name,user_id
0,2,Organic Egg Whites,202279
1,2,Michigan Organic Kale,202279
2,2,Garlic Powder,202279
3,2,Coconut Butter,202279
4,2,Natural Sweetener,202279


In [17]:
baskets_food['all_ones'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# baskets_food.to_csv('../../data/02_intermediate/baskets_spark.csv')

## Make our Association Rules

### First let's make a subsample of data

In [18]:
baskets_food.head()

Unnamed: 0,order_id,product_name,user_id,all_ones
0,2,Organic Egg Whites,202279,1
1,2,Michigan Organic Kale,202279,1
2,2,Garlic Powder,202279,1
3,2,Coconut Butter,202279,1
4,2,Natural Sweetener,202279,1


In [19]:
baskets_food.order_id.nunique()

3048137

In [20]:
insta_order_lst = list(baskets_food.order_id.unique())

In [22]:
random_usrids_100k = random.sample(insta_order_lst, 100000)

In [23]:
mask = baskets_food['user_id'].isin(random_usrids_100k)

In [24]:
baskets_100k = baskets_food.loc[mask]

In [25]:
baskets_100k.drop(columns=['user_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
baskets_100k.reset_index(inplace=True)

break things up into 10k different products

In [27]:
product_list = list(baskets_100k.product_name.unique())

In [29]:
len(product_list)

16994

In [30]:
product_list_1 = product_list[0:10000]

In [31]:
len(product_list)

16994

In [32]:
mask_prod1 = baskets_100k['product_name'].isin(product_list_1)

In [33]:
baskets_prod1 = baskets_100k.loc[mask_prod1]
baskets_prod1.product_name.nunique()

10000

In [34]:
basket_matrix_1 = (baskets_prod1.groupby(['order_id', 'product_name'])['all_ones']
          .sum().unstack().reset_index().fillna(0)
          .set_index('order_id'))

In [37]:
product_list_2 = product_list[10000:]
mask_prod2 = baskets_100k['product_name'].isin(product_list_2)
baskets_prod2 = baskets_100k.loc[mask_prod2]
# pivot the dataset
basket_matrix_2 = (baskets_prod2.groupby(['order_id', 'product_name'])['all_ones']
          .sum().unstack().reset_index().fillna(0)
          .set_index('order_id'))

In [38]:
basket_matrix_order = basket_matrix_1.merge(basket_matrix_2, 
                      how='outer', 
                      on='order_id')

In [40]:
basket_matrix_order.replace(np.nan, 0, inplace=True)

## Make our Association Rules

In [None]:
# df['product_name'] = df['product_name'].str.strip()

In [None]:
# df['order_id'] = df['order_id'].astype('str')

In [None]:
# basket = (df.groupby(['order_id', 'product_name'])['all_ones']
#           .sum().unstack().reset_index().fillna(0)
#           .set_index('order_id'))

In [None]:
# basket.head()

In [None]:
# Convert the units to 1 hot encoded values
# def encode_units(x):
#     if x <= 0:
#         return 0
#     if x >= 1:
#         return 1
    
# basket_sets = basket.applymap(encode_units)

In [None]:
# basket_sets.head()

In [49]:
frequent_itemsets = apriori(basket_matrix_order, min_support=0.01, use_colnames=True)

In [50]:
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.019862,(100% Whole Wheat Bread)
1,0.011423,(2% Reduced Fat Milk)
2,0.027049,(Apple Honeycrisp Organic)
3,0.024327,(Asparagus)
4,0.121788,(Bag of Organic Bananas)


In [None]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()

**Sources**: 

1. How to build your own algorithm: https://surprise.readthedocs.io/en/stable/building_custom_algo.html
1. Association Rule Wikipedia: https://en.wikipedia.org/wiki/Association_rule_learning
1. Rule-based collaborative filtering: Recommendor Systems: The Textbook (pg. 160) 