In [1]:
import pandas as pd
import sqlalchemy as sql
import matplotlib.pyplot as plt
%matplotlib

from tqdm import tqdm

Using matplotlib backend: TkAgg


# Build a recommender using PCA tools

Mostly to see how they work

In [7]:
pca_dimension = 16   # number of coordinates generated previously

In [2]:
engine = sql.create_engine('sqlite:///instacart.db')
db = {}

In [3]:
for table in ['order_products', 'departments']:
    db[table] = pd.read_sql_table(table, engine, index_col=0,)

In [4]:
orders = db['order_products'].groupby('order_id')['product_id'].unique()   # orders = list of products in each order

In [8]:
pca_frame = pd.read_csv('product_pca_coords.csv', index_col=0)
pca_frame.values[:, :pca_dimension] = pca_frame.values[:, :pca_dimension].astype('float')

In [9]:
def vector_best_distance(product_id, output='scores'):
    try:
        print('best match for:', pca_frame.product_name.loc[product_id])
        my_coords = pca_frame.loc[product_id].values[:pca_dimension].astype('float32')
        actual_index = pca_frame.index.get_loc(product_id)
        # print(actual_index)
    except KeyError:
        print('invalid item')
        return None
    
    diff = pca_frame.values[:, :pca_dimension].astype('float32') - my_coords
    # print(diff.shape, diff.dtype)
    scores = pd.np.linalg.norm(diff, axis=1)
    scores[actual_index] = pd.np.Inf
    # print(scores[:5])

    best = pd.np.argmin(scores)
    if output == 'scores':
        print(pca_frame.product_name.iloc[best])
        return scores
    else:
        return best, pca_frame.product_name.iloc[best], pca_frame.index[best], pd.np.min(scores)

In [11]:
def vector_distance(product_id, target_id):
    try:
        print('distance from:', pca_frame.product_name.loc[product_id], ' to ', pca_frame.product_name.loc[target_id])
        my_coords = pca_frame.loc[product_id].values[:pca_dimension].astype('float32')
        target = pca_frame.loc[target_id].values[:pca_dimension].astype('float32')
        # print(actual_index)
    except KeyError:
        print('invalid item')
        return None
    
    return pd.np.linalg.norm(my_coords - target)

In [47]:
def multi_vector_best_distance(basket):
    point_cloud = pd.np.empty((len(basket), pca_dimension), dtype='float32')
    indices = []
    row = 0
    for product_id in basket:
        try:
            print('best match for:', pca_frame.product_name.loc[product_id])
            coords = pca_frame.loc[product_id].values[:pca_dimension].astype('float32')
            point_cloud[row] = coords
            indices.append(pca_frame.index.get_loc(product_id))
            # print(actual_index)
            row += 1
        except KeyError:
            print('invalid item')
            return None
    
    my_coords = pd.np.mean(point_cloud, axis=0)
    print(my_coords.shape)

    diff = pca_frame.values[:, :pca_dimension].astype('float32') - my_coords
    # print(diff.shape, diff.dtype)
    scores = pd.np.linalg.norm(diff, axis=1)
    for i in indices:
        scores[i] = pd.np.Inf
    # print(scores[:5])

    best = pd.np.argmin(scores)

    return best, pca_frame.product_name.iloc[best], pca_frame.index[best], pd.np.min(scores)

In [38]:
vector_best_distance(1, output=False)

best match for: Chocolate Sandwich Cookies


(26437, 'Veggie Cheese & Pretzel Snack Pac Prepacked', 30633, 1.316407)

In [39]:
pca_frame.index[best]vector_distance(1, 32)

distance from: Chocolate Sandwich Cookies  to  Nacho Cheese White Bean Chips


12.406472

In [41]:
multi_vector_best_distance([1, 2])

best match for: Chocolate Sandwich Cookies
best match for: All-Seasons Salt
(16,)


(2100, 'Peppermint Gum', 2458, 1.294741)

In [44]:
l = []
for i in pca_frame.index.values[:10]:
    l.append(i)
    print(multi_vector_best_distance(l))
    print()

best match for: Chocolate Sandwich Cookies
(16,)
(26437, 'Veggie Cheese & Pretzel Snack Pac Prepacked', 30633, 1.316407)

best match for: Chocolate Sandwich Cookies
best match for: All-Seasons Salt
(16,)
(2100, 'Peppermint Gum', 2458, 1.294741)

best match for: Chocolate Sandwich Cookies
best match for: All-Seasons Salt
best match for: Robust Golden Unsweetened Oolong Tea
(16,)
(39674, 'Double Chocolate Muffins', 46065, 1.6965892)

best match for: Chocolate Sandwich Cookies
best match for: All-Seasons Salt
best match for: Robust Golden Unsweetened Oolong Tea
best match for: Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce
(16,)
(25659, 'Caffeine Free Diet Coke', 29735, 1.8516276)

best match for: Chocolate Sandwich Cookies
best match for: All-Seasons Salt
best match for: Robust Golden Unsweetened Oolong Tea
best match for: Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce
best match for: Green Chile Anytime Sauce
(16,)
(31644, 'Organic Agave Nectar', 36

In [52]:
def generate_recs(i):
    try:
        basket = orders.iloc[i]
        actual_last = basket[-1]
        basket = basket[:-1]
    except KeyError:
        print('order_id {} may exist in orders.index, but contains no items. \n'.format(orders.index[i]))
        return None
    
    print('\n' + '+ ' * 30 + '\n')
    print(pca_frame.loc[basket].product_name)

    ind, name, product_id, score = multi_vector_best_distance(basket)
    
    print()
    print('we chose:')
    print(name)

    print()
    print('actual final item:')
    print(pca_frame.loc[actual_last].product_name)
    
    print()
    print('distance between items:', vector_distance(product_id, actual_last))
        
    print()

In [53]:
for _ in range(20):
    generate_recs(pd.np.random.randint(0, 10000))


+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 

product_id
19660                    Spring Water
22035     Organic Whole String Cheese
24852                          Banana
18918         Organic Chopped Spinach
18224    Organic Low Fat Plain Yogurt
42450               Macaroni & Cheese
Name: product_name, dtype: object
best match for: Spring Water
best match for: Organic Whole String Cheese
best match for: Banana
best match for: Organic Chopped Spinach
best match for: Organic Low Fat Plain Yogurt
best match for: Macaroni & Cheese
(16,)

we chose:
I Heart Baby Kale

actual final item:
Organic Whole Milk

distance from: I Heart Baby Kale  to  Organic Whole Milk
distance between items: 5.478399


+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 

product_id
48223                                 Peanut Oil
17862                              Minced Garlic
16726                    Roasted Red Chili Paste
49128       Organic Tamari Gluten-Free Soy Sauce
30196        


we chose:
Sparkling Blackberry Juice

actual final item:
Unsalted Butter

distance from: Sparkling Blackberry Juice  to  Unsalted Butter
distance between items: 8.689316


+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 

product_id
47977                           Grapefruit Sparkling Water
29616    Direct Trade Black Cat Classic Espresso Roast ...
15925                                  Macaroni and Cheese
42719                        Natural Premium Coconut Water
8022                                         Spinach Pizza
34862                                 Tiny Twists Pretzels
27284                 Organic Fruit and Nut Granola Cereal
39275                                  Organic Blueberries
5322     Gluten Free Dark Chocolate Chunk Chewy with a ...
Name: product_name, dtype: object
best match for: Grapefruit Sparkling Water
best match for: Direct Trade Black Cat Classic Espresso Roast Whole Bean Coffee
best match for: Macaroni and Cheese
best match for: Natural Premiu

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  return umr_minimum(a, axis, None, out, keepdims)



we chose:
Chocolate Sandwich Cookies

actual final item:
Raspberries

distance from: Chocolate Sandwich Cookies  to  Raspberries
distance between items: 29.384392


+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 

product_id
7035                  Peanut Butter Ice Cream Cup
22828                  Sea Salt Caramel Ice Cream
9741                                 Cheese Pizza
24799                  Vanilla Skyr Nonfat Yogurt
8309     Nonfat Icelandic Style Strawberry Yogurt
23794                           1% Chocolate Milk
40486                             Chicken Tenders
Name: product_name, dtype: object
best match for: Peanut Butter Ice Cream Cup
best match for: Sea Salt Caramel Ice Cream
best match for: Cheese Pizza
best match for: Vanilla Skyr Nonfat Yogurt
best match for: Nonfat Icelandic Style Strawberry Yogurt
best match for: 1% Chocolate Milk
best match for: Chicken Tenders
(16,)

we chose:
Olive Oil & Vinegar Dressing

actual final item:
Honey & Maple Turkey Breast



# Recommender works... ok!

Recommendations are generally reasonable and fall in the space of products that these consumers may purchse.

### Things to improve:

This system only uses our PCA distance metric. It could definitely be improved by using other available data:
0. The way distances are dealt with here is dumb! Average vs. cluster
1. Department/Aisle info
2. Product add-to-order sequence data
3. Time of day/week information
4. (customer purchase history if this was for real deployment)
5. Text from product description
6. Most of product graph was thrown out (for RAM / time / complexity)
7. Internal parameters (# dimensions, 'oddity') have not been tuned

# Ultimately, this system is only measuring fake distances

It could be more nuanced I suppose.