In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy as sql
from tqdm import tqdm

%matplotlib notebook

# Data Dictionary
- Remember: `order_products` contains no rows for each customer's last order.

#### orders (415k rows): fact table, one record per order, includes useful time dimensions
- order_id: order identifier
- user_id: customer identifier
- order_number: the order sequence number for this user (1 = first, n = nth)
- order_dow: the day of the week the order was placed on
- order_hour_of_day: hour of day the order was placed
- days_since_prior_order: number of days since that customer placed their previous order (NA for order_number = 1)

#### order_products (3.9M rows): fact table, one record per order, per product
_except for last order by a customer_
- order_id: foreign key
- product_id: foreign key
- add_to_cart_order: order in which each product was added to cart
- reordered: 1 if this product has been ordered by this user in the past, 0 otherwise

#### products (50k rows): Dimension table for product info
- product_id: product identifier
- product_name: name of the product
- aisle_id: foreign key
- department_id: foreign key

#### aisles (134 rows): Dimension table for aisle info
- aisle_id: aisle identifier
- aisle: the name of the aisle

#### departments (21 rows): Dimension table for department info
- department_id: department identifier
- department: the name of the department


In [2]:
# SQLAlchemy engine for the SQLite file `instacart.db` (relative path).
engine = sql.create_engine('sqlite:///instacart.db')
# Table name -> DataFrame; starts empty and is filled by the loading cell.
db = {}

In [3]:
# Read every table from the database into `db`, keyed by table name.
TABLE_NAMES = ('orders', 'order_products', 'products', 'aisles', 'departments')
for table in TABLE_NAMES:
    db[table] = pd.read_sql_table(table_name=table, con=engine, index_col=0)

In [4]:
# Short aliases for the most-used tables. Each alias is the same DataFrame
# object that is stored in `db`, so the in-place changes below are visible
# through both names.
orders = db['orders']
ops = db['order_products']   # keeps its default index rather than a multi-index on order_id + add_to_cart_order
products = db['products']

# Orders are keyed by their own identifier.
orders.set_index('order_id', inplace=True)

# The products table's 'index' column is one off from product_id (confusing),
# so it is removed and the table is keyed by its id column instead; the
# departments and aisles tables get the same treatment.
for dim_table, id_column in [('products', 'product_id'),
                             ('departments', 'department_id'),
                             ('aisles', 'aisle_id')]:
    del db[dim_table]['index']
    db[dim_table].set_index(id_column, inplace=True)

In [5]:
# Group the product table by aisle and by department.
products_by_aisle = products.groupby('aisle_id')
products_by_dept = products.groupby('department_id')
# Series indexed by aisle_id; each value is the array of distinct department_ids among that aisle's products.
get_dept_from_aisle = products_by_aisle['department_id'].unique()

In [6]:
# Series indexed by department_id; each value is the array of distinct aisle_ids among that department's products.
get_aisles_from_dept = products_by_dept['aisle_id'].unique()

In [7]:
# Confirm correspondence for one department: every aisle listed under `dept`
# must map back to exactly that department and no other.
dept = 1
for aisle in get_aisles_from_dept[dept]:
    depts_for_aisle = list(get_dept_from_aisle[aisle])
    # Compare plain lists. Comparing the numpy array to [dept] directly yields an
    # array, and asserting on a multi-element array raises ValueError (not
    # AssertionError) when an aisle belongs to more than one department.
    assert depts_for_aisle == [dept], (
        'aisle {} maps to departments {}, expected only [{}]'.format(aisle, depts_for_aisle, dept))

In [8]:
len(orders)

414772

In [9]:
orders.index[14]  # no data in ops

1492625

In [12]:
orders.head(16)

Unnamed: 0_level_0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2168274,2,1,2,11,
1501582,2,2,5,10,10.0
1901567,2,3,1,10,3.0
738281,2,4,2,10,8.0
1673511,2,5,3,11,8.0
1199898,2,6,2,9,13.0
3194192,2,7,2,12,14.0
788338,2,8,1,15,27.0
1718559,2,9,2,9,8.0
1447487,2,10,1,11,6.0


# Product segmentation

### Can products be separated into groups?
- the store they come from
- the customers who buy them
- properties? i.e. name, department, aisle?
- the products that they are co-ordered with

Hypothesis 1:
Orders come from discrete retailers; product assortment will be largely or entirely separate depending on store.

Test method:
Build a graph of all products, connecting each product to every other product it shares an order with. If the graph splits into entirely disjoint sections, those sections would correspond to separate retailers.

In [13]:
# Add product details to order_products: name, aisle_id, department_id.
# The merge starts from db['order_products'] rather than from `ops` itself and
# selects the detail columns explicitly, so re-running this cell (or running it
# after extra columns have been added to `products`) does not merge the details
# a second time and produce suffixed duplicate columns.
product_details = products[['product_name', 'aisle_id', 'department_id']]
ops = (
    db['order_products']
    .merge(product_details, left_on='product_id', right_index=True)
    .sort_values('order_id')
)

In [14]:
ops.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
8,2,43668,9,0,Classic Blend Cole Slaw,123,4
7,2,1819,8,1,All Natural No Stir Creamy Almond Butter,88,13
5,2,17794,6,1,Carrots,83,4
4,2,30035,5,0,Natural Sweetener,17,13


In [15]:
# Ten most frequently ordered products.
# note for me - series.value_counts() is nice: it replaces the longer
# groupby('product_name').size().sort_values(ascending=False). That longer form
# used to run here as well, but only the last expression of a cell is displayed,
# so its result was computed and thrown away.
ops.product_name.value_counts()[:10]

Banana                    58231
Bag of Organic Bananas    45699
Organic Strawberries      32203
Organic Baby Spinach      29380
Organic Hass Avocado      25958
Organic Avocado           21642
Large Lemon               18581
Limes                     17399
Strawberries              17217
Organic Whole Milk        16468
Name: product_name, dtype: int64

In [16]:
# Set of product ids that appear in at least one row of ops.
real_products = set(ops.product_id.unique())   # there are products in db['products'] that are not in any orders
len(real_products)

42814

In [17]:
# NOTE(review): this mutates `products` in place, so cells that ran earlier saw a frame without this column.
products['product_id'] = products.index.values  # can't apply function to index, so duplicate it

In [18]:
# Series indexed by order_id; each value is the array of distinct product_ids in that order.
products_in_order = ops.groupby('order_id')['product_id'].unique()  # takes ~1 min to build

In [19]:
ops.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
8,2,43668,9,0,Classic Blend Cole Slaw,123,4
7,2,1819,8,1,All Natural No Stir Creamy Almond Butter,88,13
5,2,17794,6,1,Carrots,83,4
4,2,30035,5,0,Natural Sweetener,17,13


In [23]:
from collections import defaultdict

class ProductGraph:
    '''
    Co-occurrence graph over products, with a distance derived from shared orders.

    ProductGraph provides a measure of distance between products based on the
    number of orders in which A and B overlap.
    Note that this is NOT a measure of similarity, i.e. bananas and bag-of-bananas
    are likely far apart.
    Low distance <-> high likelihood of overlap

    Attributes:
        records: Series indexed by order_id; each value is the array of distinct
            product_ids in that order.
        graph: dict mapping product A -> {B: number of orders containing both
            A and B}. Every product has an (initially empty) entry; the counts
            are filled by build_graph() or the add_order_* methods.
        orders_for_product: Series mapping product_id -> number of rows of
            order_records that mention that product.
    '''

    def __init__(self, order_records):
        '''
        order_records: DataFrame with 'order_id' and 'product_id' columns,
        one row per product per order.
        '''
        # records[i] => distinct items in order i
        self.records = order_records.groupby('order_id')['product_id'].unique()

        # graph is from a product to other products that are connected by an order
        # graph[A] = {B: number of orders containing both A & B}
        self.graph = {i: defaultdict(int) for i in order_records.product_id.unique()}

        self.orders_for_product = order_records.product_id.value_counts()

    def add_order_items(self, order_items):
        '''
        Add one order's items to the network: the count between every ordered
        pair of distinct items is incremented by one.

        Quadratic in the number of items, but each order is only added once.
        Raises KeyError if an item was not present in the records passed to
        the constructor.
        '''
        for active_item in order_items:
            neighbours = self.graph[active_item]
            for second_item in order_items:
                if second_item != active_item:
                    neighbours[second_item] += 1

    def add_order_number(self, order_number):
        '''Add the items of the order with id `order_number` to the network.'''
        order_items = self.records[order_number]
        self.add_order_items(order_items)

    def build_graph(self):
        '''
        Count co-occurrences for every order in self.records.

        Existing counts are cleared first, so calling this again (e.g. when a
        notebook cell is re-run) rebuilds the graph instead of doubling it.
        '''
        for neighbours in self.graph.values():
            neighbours.clear()
        for order_no in tqdm(self.records.index):
            self.add_order_number(order_no)

    def find_all_connections_from(self, start_node):
        '''
        Return the set of products reachable from start_node, directly or
        indirectly, through shared orders. The set includes start_node itself.

        Raises KeyError if start_node is not in the graph.
        '''
        # Stack-based (depth-first) traversal; the visiting order does not
        # affect the resulting set.
        visited = set()
        stack = [start_node]
        while stack:
            node = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            # Only follow edges backed by at least one shared order.
            stack.extend(other for other, shared in self.graph[node].items()
                         if shared and other not in visited)
        return visited

    def distance(self, node_1, node_2, oddity=.75):
        '''
        Distance from node_1 to node_2, with an oddity factor so it's not all
        bananas: (orders containing node_2) ** oddity / (orders containing both).
        Increasing oddity devalues popular products.

        Returns 0 when both nodes are the same product, and None when the two
        products share no order (or node_1 is not in the graph).
        '''
        if node_1 == node_2:
            return 0
        # Use .get() rather than indexing: indexing the defaultdict would insert
        # a zero-count edge on every query, growing the graph and adding
        # spurious links for find_all_connections_from to follow.
        connections = self.graph.get(node_1, {}).get(node_2, 0)
        if not connections:
            return None
        pop = self.orders_for_product[node_2] ** oddity
        return pop / connections

In [24]:
# Slow-ish: the constructor groups ops by order_id and counts rows per product.
# It only prepares the per-order item lists; the edges are counted by build_graph().
g = ProductGraph(ops)

In [25]:
g.build_graph()

100%|██████████| 389772/389772 [01:00<00:00, 6481.55it/s]


In [29]:
connected_to_ = g.find_all_connections_from(2)

In [30]:
print(len(connected_to_))
# number of products reachable from product 2: the graph is one connected component, except for products only purchased alone

42783


### Hypothesis - discrete retailers:

False. The product graph forms a single connected component (every product can be reached from every other through some chain of shared orders), except for products that have only ever been ordered on their own.

In [31]:
singletons = [p for p in real_products if p not in connected_to_]

In [32]:
len(singletons)

31

In [34]:
connected_to_ = None  # drop the reference to the large set (presumably to free memory); re-run the traversal cell before using it again
products.loc[singletons[:10]]  # weirdo product

Unnamed: 0_level_0,product_name,aisle_id,department_id,product_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1470,Elastic Bandage With Clips for Customized Comp...,118,11,1470
1533,Organic 100% Juice Variety,98,7,1533
3058,Sunscreen Lotion Face Oil-Free SPF 70+,73,11,3058
4882,Tape,6,2,4882
6206,Calcium Magnesium Zinc Tablets,47,11,6206
10381,Chocolate Mousse Cake,8,3,10381
10382,Spirulina Pacifica Powder,47,11,10382
10870,Blue Label Year of the Ram,124,5,10870
12698,Stress B-Complex with Vitamin C & Zinc,47,11,12698
13185,"Vitamin C, 500 mg, Chewable Wafers, Mixed Fruit",47,11,13185


### Now we have something like a distance metric

So, we can pick out the closest product to a basket. This may be a reasonable guess for something a customer would buy.

### Hypothesis:

The value of recommending a product is inversely proportional to the popularity of that product to some power <= 1.

Bananas will often have the highest number of shared orders with a product, but it seems that bananas are not the best recommendation to make in all cases.

Consider niche product X.
Of a million people who bought bananas, some will buy X. This reflects more on the ubiquity of bananas than a strong affinity of banana buyers to product X.

Although a percentage of buyers of X will go on to buy bananas, there are probably higher-value recommendations available.

In [35]:
ops.product_id.value_counts()[:3]   # ID, number of orders

24852    58231
13176    45699
21137    32203
Name: product_id, dtype: int64

In [36]:
# Product ids used in the distance examples. The first three are the three most
# frequently ordered product ids from the value_counts() output shown above.
banana = 24852
org_bananas = 13176
org_strawberries = 21137
almond_milk = 432
products.product_name.loc[almond_milk]  # display the product name for id 432

'Vanilla Almond Breeze Almond Milk'

In [37]:
# NOTE(review): this line was annotated ".15", which does not match the displayed output; presumably one of the two comes from an earlier version of distance() -- confirm by re-running.
g.distance(org_strawberries, org_bananas)

7.353984770169453e-05

In [38]:
# NOTE(review): this line was annotated ".12", which does not match the displayed output; presumably one of the two comes from an earlier version of distance() -- confirm by re-running.
g.distance(org_strawberries, banana)

5.8832682369398116e-05

In [42]:
g.distance(org_bananas, almond_milk)

2.108858601475143e-05

In [43]:
g.distance(banana, almond_milk)

3.58127095133025e-05

In [None]:
# Candidate pool for recommendations: ids of the 10,000 most popular items.
# value_counts() sorts by count in descending order, so the first index
# entries are the most frequently ordered products.
product_popularity = ops['product_id'].value_counts()
top_ten_k = product_popularity.index[:10000]

In [None]:
def recommend(order_items, dist=g.distance, options=top_ten_k, oddity=.75):
    '''
    Pick the candidate product that is closest to a basket.

    order_items: iterable of product ids already in the basket
    dist: callable dist(item, option, oddity=...) returning a distance, or None
        when the two products share no order (ProductGraph.distance by default)
    options: candidate product ids to choose from (by default the 10,000 most
        popular products)
    oddity: popularity exponent, forwarded to dist

    Returns (best_item, best_score): the candidate with the lowest mean distance
    to the basket items it shares at least one order with, and that mean
    distance. Candidates already in the basket, and candidates that share no
    order with any basket item, are never returned. If no candidate qualifies
    the result is (None, float('inf')).
    '''
    basket = list(order_items)
    in_basket = set(basket)

    best_item = None
    best_score = float('inf')
    for option in options:
        if option in in_basket:
            # never recommend something the customer already has
            continue

        scores = []
        for my_item in basket:
            score = dist(my_item, option, oddity=oddity)
            if score is not None:
                scores.append(score)
        if not scores:
            # no shared order with anything in the basket: nothing to rank on
            continue

        # Average rather than sum: a sum over connected items only would make a
        # candidate look closer simply because it is connected to fewer items.
        option_score = sum(scores) / len(scores)

        # lower distance <-> higher likelihood of overlap, so keep the minimum
        if option_score < best_score:
            best_score = option_score
            best_item = option
    return best_item, best_score

In [None]:
recommend(products_in_order[2])

In [None]:
def generate_recs(i):
    '''
    Hold out the final element of the i-th order and print recommendations
    computed from the remaining items.

    i: position of the order in orders.index

    Prints the remaining basket, the recommended product for each of several
    oddity factors, and the held-out product. Returns the recommendation made
    for the last oddity factor tried (which may be None), or None when the
    order has no entry in products_in_order.
    '''
    order_id = orders.index[i]
    try:
        order_products = products_in_order[order_id]
    except KeyError:
        print('order_id {} exists in orders.index, but contains no items. \n'.format(order_id))
        return None

    # split into the basket shown to recommend() and the held-out final item
    basket, last = order_products[:-1], order_products[-1]

    separator = '\n' + '+ ' * 30 + '\n'
    print(separator)
    print(products.loc[basket].product_name)

    for oddity in (0, .5, .7, 1, 1.5):
        print('\n oddity factor: {}'.format(oddity))
        item, _score = recommend(basket, oddity=oddity)
        if item:
            print(products.loc[item].product_name)

    actual_name = products.loc[last].product_name
    print()
    print('actual final item: {}'.format(actual_name))

    print()
    return item

In [None]:
# Show recommendations for three randomly chosen orders.
# numpy is used directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
for _ in range(3):
    o = np.random.randint(0, 100000)  # random position in orders.index
    generate_recs(o)

# Hypothesis:

There is some support for an inverse factor for item popularity.