# Import

In [912]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from collections import Counter
import time
from random import choices
from scipy.sparse import csr_matrix
from sklearn.externals import joblib
from pickle import dump

# Data

## Load

In [305]:
order_products__train = pd.read_csv("../instacart_2017_05_01/order_products__train.csv")
order_products__prior = pd.read_csv("../instacart_2017_05_01/order_products__prior.csv")
products = pd.read_csv("../instacart_2017_05_01/products.csv")
orders = pd.read_csv("../instacart_2017_05_01/orders.csv")

## Train

In [306]:
orders_train = orders[orders['eval_set'] == 'train'].reset_index()
orders_train = orders_train[['user_id', 'order_id']]

In [307]:
order_products__train = order_products__train[['order_id', 'product_id', 'reordered']]
order_products__train = order_products__train.groupby('order_id')['product_id'].apply(list).reset_index()
order_products__train.head()

Unnamed: 0,order_id,product_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [308]:
user_products__train = pd.merge(order_products__train, orders_train, on='order_id')
user_products__train = user_products__train[['user_id', 'product_id']]

In [309]:
user_products__train[user_products__train.duplicated(['user_id'], keep=False)]

Unnamed: 0,user_id,product_id


In [310]:
user_products__train.set_index('user_id', inplace=True)

In [311]:
user_products__train.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
112108,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
79431,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
42756,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
17227,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
56463,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


### Save/Load dataframes

In [52]:
user_products__train.to_csv("user_products__train.csv")

In [53]:
try:
    user_products__train
except NameError:
    user_products__train = pd.read_csv("user_products__train.csv")
else:
    print("user_products__train already exists")

user_products__train already exists


In [176]:
test_subjects = user_products__train.index

## Prior

In [6]:
orders_prior = orders[orders['eval_set'] == 'prior'].reset_index()
orders_prior = orders_prior[['user_id', 'order_id']]

In [7]:
order_products__prior = order_products__prior[['order_id', 'product_id']]

In [14]:
user_products__prior = pd.merge(orders_prior, order_products__prior, on='order_id', how='inner')
user_products__prior = user_products__prior[['user_id', 'product_id']]
user_products__prior = user_products__prior.groupby('user_id')['product_id'].apply(list).reset_index()
user_products__prior.head()

Unnamed: 0,user_id,product_id
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [312]:
prior_orders_lookup = user_products__prior.set_index('user_id')

### Save/Load dataframes

In [16]:
try:
    user_products__prior
except NameError:
    user_products__prior = pd.read_csv('user_products__prior.csv')
else:
    print("user_products__prior already exists")

user_products__prior already exists


In [42]:
user_products_lookup = user_products__prior.set_index('user_id', drop=True)
user_products_lookup.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [920]:
user_products_lookup.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [921]:
# Dump for program
user_products_lookup.to_csv('user_products_lookup.csv')
testdf = pd.read_csv('user_products_lookup.csv').set_index('user_id')
testdf.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [949]:
selectiontest = testdf.loc[[1,4,5]]

In [950]:
selectiontest['product_id'] = selectiontest['product_id'].apply(literal_eval)

In [952]:
selectiontest.iloc[0]['product_id']

[196,
 14084,
 12427,
 26088,
 26405,
 196,
 10258,
 12427,
 13176,
 26088,
 13032,
 196,
 12427,
 10258,
 25133,
 30450,
 196,
 12427,
 10258,
 25133,
 26405,
 196,
 12427,
 10258,
 25133,
 10326,
 17122,
 41787,
 13176,
 196,
 12427,
 10258,
 25133,
 196,
 10258,
 12427,
 25133,
 13032,
 12427,
 196,
 10258,
 25133,
 46149,
 49235,
 49235,
 46149,
 25133,
 196,
 10258,
 12427,
 196,
 46149,
 39657,
 38928,
 25133,
 10258,
 35951,
 13032,
 12427]

## Prior sentences

In [17]:
def make_sentences(product_id_list):
    string_list = [str(product_id) for product_id in product_id_list]
    return " ".join(string_list)

In [18]:
user_products_sentence__prior = user_products__prior.copy()
user_products_sentence__prior['product_id'] = user_products_sentence__prior['product_id'].apply(make_sentences)
user_products_sentence__prior.head()

Unnamed: 0,user_id,product_id
0,1,196 14084 12427 26088 26405 196 10258 12427 13...
1,2,32792 47766 20574 12000 48110 22474 16589 3591...
2,3,9387 17668 15143 16797 39190 47766 21903 39922...
3,4,36606 7350 35469 2707 42329 7160 1200 17769 43...
4,5,15349 21413 48775 28289 8518 11777 31717 26604...


In [917]:
user_products_sentence__prior.to_csv('user_products_sentence__prior.csv', index=False)

In [20]:
try:
    user_products__prior
except NameError:
    user_products_sentence__prior = pd.read_csv('user_products_sentence__prior.csv')
else:
    print("user_products_sentence__prior already exists")

user_products_sentence__prior already exists


In [936]:
# Dump for website

user_products_sentence__prior.drop(columns=['product_id']).to_csv('user_products_sentence__prior(web).csv', index=False)

In [937]:
testdf2 = pd.read_csv('user_products_sentence__prior(web).csv')
testdf2.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


# Model

In [804]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(user_products_sentence__prior['product_id'])

In [905]:
joblib.dump(tfidf, 'tfidf.pkl')
joblib.dump(tfidf_matrix, 'tfidx_matrix.pkl')

['tfidx_matrix.pkl']

In [33]:
# This will convert your user_ids to the matrix indices
indices = pd.Series(user_products_sentence__prior.index, index=user_products_sentence__prior['user_id']).drop_duplicates()

In [34]:
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Not enough memory for this unfortunately

In [None]:
# This approach also doesn't work due to memory issues

# # Function that takes in user_id as input and outputs most similar movies
# def get_recommendations(user_id, cosine_sim=cosine_sim, k):
#     # Get the index of the user that matches the user_id
#     idx = indices[user_id]

#     # Get the pairwsie similarity scores of all movies with that movie
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the users based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     # Get the scores of the k most similar movies, skip the first one since it will be itself
#     sim_scores = sim_scores[1:k]

#     # Get the user indices
#     user_indices = [i[0] for i in sim_scores]

#     # Return the top 10 most similar users
#     return user_products_sentence__prior['user_id'].iloc[user_indices]

In [897]:
# Function that takes in user_id as input and outputs recommended products
def get_recommendations(user_id, k, n=10, include_prev=True):
    # Get the index of the user that matches the user_id
    idx = indices[user_id]
    
    cosine_sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix)[0]

    # Get the pairwsie similarity scores of all users
    sim_scores = list(enumerate(cosine_sim))

    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the k most similar users, skip the first one since it will be itself
    sim_scores = sim_scores[1:(k+1)]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # The top k most similar users
    top_k_users = user_products_sentence__prior['user_id'].iloc[user_indices]
    
    # The products purchased from these top k users
    top_k_user_products = user_products_lookup.loc[top_k_users]
    product_list = [i for j in top_k_user_products['product_id'].tolist() for i in j]
    
    # Count the frequency of the items purchased and sort by popularity
    product_counts = dict(Counter(product_list))
    popular_products = sorted(product_counts, key=product_counts.get, reverse=True)
    
    if include_prev:
        # Return the top n purchaed product by the k most similar users
        return popular_products[:n]
    
    else:
        # In this case we filter out items the user has already purchased
        prev_purchases = get_previously_purchased_products(user_id)
        popular_new_products = [product for product in popular_products if product not in prev_purchases]
        
        return popular_new_products[:n]

In [250]:
# Recommener function will output product_ids. Use this dict to then get human names for food
product_id_to_name_dict = pd.Series(products['product_name'].values, index=products['product_id']).to_dict()

def product_id_to_name(product_ids):
    return [product_id_to_name_dict[id] for id in product_ids]

In [915]:
# Export this for website
dump(product_id_to_name_dict, open('product_id_to_name_dict.pickle', 'wb'))

In [343]:
def get_products_really_purcased(user_id, include_prev=True):
    purchased_products = user_products__train.loc[user_id].tolist()[0]
    if include_prev:
        return purchased_products
    else:
        prev_purchase = get_previously_purchased_products(user_id)
        
        return [product for product in purchased_products if product not in prev_purchase]

In [313]:
def get_previously_purchased_products(user_id):
    return prior_orders_lookup.loc[user_id].tolist()[0]

## Example

In [319]:
product_id_to_name(get_previously_purchased_products(1))

['Soda',
 'Organic Unsweetened Vanilla Almond Milk',
 'Original Beef Jerky',
 'Aged White Cheddar Popcorn',
 'XL Pick-A-Size Paper Towel Rolls',
 'Soda',
 'Pistachios',
 'Original Beef Jerky',
 'Bag of Organic Bananas',
 'Aged White Cheddar Popcorn',
 'Cinnamon Toast Crunch',
 'Soda',
 'Original Beef Jerky',
 'Pistachios',
 'Organic String Cheese',
 'Creamy Almond Butter',
 'Soda',
 'Original Beef Jerky',
 'Pistachios',
 'Organic String Cheese',
 'XL Pick-A-Size Paper Towel Rolls',
 'Soda',
 'Original Beef Jerky',
 'Pistachios',
 'Organic String Cheese',
 'Organic Fuji Apples',
 'Honeycrisp Apples',
 'Bartlett Pears',
 'Bag of Organic Bananas',
 'Soda',
 'Original Beef Jerky',
 'Pistachios',
 'Organic String Cheese',
 'Soda',
 'Pistachios',
 'Original Beef Jerky',
 'Organic String Cheese',
 'Cinnamon Toast Crunch',
 'Original Beef Jerky',
 'Soda',
 'Pistachios',
 'Organic String Cheese',
 'Zero Calorie Cola',
 'Organic Half & Half',
 'Organic Half & Half',
 'Zero Calorie Cola',
 'Organ

In [876]:
product_id_to_name(get_recommendations(1, 200))

['Soda',
 'Original Beef Jerky',
 'Zero Calorie Cola',
 'Bag of Organic Bananas',
 'Pistachios',
 'Trail Mix',
 'Clementines',
 '0% Greek Strained Yogurt',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts']

In [877]:
product_id_to_name(get_recommendations(1, 200, include_prev=False))

['Trail Mix',
 'Clementines',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts',
 "Crunchy Oats 'n Honey Granola Bars",
 'Mineral Water',
 'Mixed Fruit Fruit Snacks',
 'Dried Mangos',
 'Hass Avocados',
 'Mozzarella String Cheese']

Takes around a second for each prediction—which is very long when I think of all the test data I have

In [875]:
product_id_to_name(get_products_really_purcased(1))

['Soda',
 'Organic String Cheese',
 '0% Greek Strained Yogurt',
 'XL Pick-A-Size Paper Towel Rolls',
 'Milk Chocolate Almonds',
 'Pistachios',
 'Cinnamon Toast Crunch',
 'Aged White Cheddar Popcorn',
 'Organic Whole Milk',
 'Organic Half & Half',
 'Zero Calorie Cola']

# Hyperparameter optimization

In [346]:
def get_model_accuracy(user_ids, k, n=10, include_prev=True):
    accuracies = []
    for user_id in user_ids:
        recommended_items = set(get_recommendations(user_id, k, n, include_prev))
        real_items = set(get_products_really_purcased(user_id, include_prev))
        if len(real_items) != 0:
            accuracy = len(recommended_items.intersection(real_items)) / len(real_items)
            accuracies.append(accuracy)
    model_accuracy = np.mean(accuracies)
    return model_accuracy

## Including previous purchases

In [None]:
# While it would be nice to do this, it simply takes too long
k_values = [50, 100, 200, 300]

accuracies = {}
for k in k_values:
    accuracy = get_model_accuracy(test_subjects, k, n=10)
    accuracies[k] = accuracy
    print(f"{k}: {accuracy * 100}%")

In [290]:
random_users = choices(test_subjects, k=10000)

In [None]:
start = time.time()
k_values = [50, 100, 200]

accuracies = {}
for k in k_values:
    accuracy = get_model_accuracy(random_users, k, n=10)
    print(f"{k}: {accuracy * 100}%")
    accuracies[k] = accuracy
print(f"This took {time.time() - start}s")

In [292]:
accuracies

{50: 0.2207089333842549, 100: 0.22730919134017813, 200: 0.22774171324152867}

{50: 0.2207089333842549, 100: 0.22730919134017813, 200: 0.22774171324152867}

In [294]:
random_users2 = choices(test_subjects, k=10000)

I used another random sampling to see by around how much these accuracies vary depending on the sampling

50: 21.91890341054359%
This took 9648.1039788723s
100: 22.69994431324306%
This took 9667.498548269272s

In [None]:
k_values = [10, 25, 50]

accuracies = {}
for k in k_values:
    start = time.time()
    accuracy = get_model_accuracy(random_users2, k, n=10)
    print(f"{k}: {accuracy * 100}%")
    accuracies[k] = accuracy
    print(f"This took {time.time() - start}s")

10: 18.32212244437322%
This took 9851.811056137085s


In [298]:
accuracies

{10: 0.1832212244437322, 25: 0.20579167290986902, 50: 0.2191890341054359}

{10: 0.1832212244437322, 25: 0.20579167290986902, 50: 0.2191890341054359}

## Only new purchases

In [None]:
k_valuse = [50, 100, 300]

accuracies = {}
for k in k_values:
    start = time.time()
    accuracy = get_model_accuracy(random_users2, k, n=10, include_prev=False)
    print(f"{k}: {accuracy * 100}%")
    accuracies[k] = accuracy
    print(f"This took {time.time() - start}s")

# Recommendation engine

## Sparse matrix constructor

In [851]:
def make_sparse(user_selection_sentence, tfidf_model=tfidf):
    # Input is "sentence" of a string of product ids. First convert to a list of string product ids
    selected_products = (user_selection_sentence.split())
    unique_products = list(set(selected_products))
    
    # Now make this into a sparse matrix
    
    # Make matrix coordinates
    sparse_row = [0 for product in unique_products] # Just one row—so all values 0
    product_col = {} # Take coordinates from previous product vectorization
    for product in unique_products:
        product_col[product] = tfidf_model.vocabulary_[product]
    
    # Make a dictionary to look up established idf weights by their terms
    idf_weights = dict(zip(tfidf_model.get_feature_names(),tfidf_model.idf_))
    
    # Get data from above dict pased on user selected products
    product_weight = {}
    for product in selected_products:
        if product in product_weight:
            product_weight[product] += idf_weights[product]
        else:
            product_weight[product] = idf_weights[product]
    
    # Get eveything in the same order
    sparse_col = []
    sparse_data = []
    for product in unique_products:
        sparse_col.append(product_col[product])
        sparse_data.append(product_weight[product])
    
    
    vector_mag = np.linalg.norm(sparse_data) # Normalize data
    normed_sparse_data = [weight / vector_mag for weight in sparse_data]
    
    
    # Construct matrix
    sparse_mtrx = csr_matrix((normed_sparse_data, (sparse_row, sparse_col)), shape=(1, len(tfidf_model.get_feature_names())))
    return sparse_mtrx

### Example

In [852]:
test_sparse = make_sparse(user_selection[0])

In [853]:
print(test_sparse)

  (0, 288)	0.5568375119899436
  (0, 364)	0.050791457361779715
  (0, 2698)	0.5200530407919848
  (0, 3371)	0.1631715705027685
  (0, 3530)	0.03897656945587028
  (0, 4539)	0.04677673659345217
  (0, 7915)	0.04317823250405664
  (0, 10666)	0.3803507238381845
  (0, 16815)	0.41904552560272096
  (0, 17874)	0.11057929962464196
  (0, 18227)	0.12022824569228034
  (0, 22720)	0.03667593396478088
  (0, 28831)	0.03301374192895708
  (0, 32134)	0.04926795594368525
  (0, 32944)	0.05521308097106601
  (0, 35310)	0.0341141619775235
  (0, 40155)	0.1572255428828079
  (0, 43583)	0.06290912449056552


In [843]:
print(tfidf_matrix[indices[1]])

  (0, 10666)	0.3803507238381845
  (0, 4539)	0.046776736593452176
  (0, 2698)	0.5200530407919849
  (0, 17874)	0.11057929962464197
  (0, 18227)	0.12022824569228036
  (0, 288)	0.5568375119899436
  (0, 3530)	0.03897656945587029
  (0, 3371)	0.16317157050276854
  (0, 16815)	0.419045525602721
  (0, 22720)	0.03667593396478088
  (0, 364)	0.050791457361779715
  (0, 7915)	0.04317823250405664
  (0, 35310)	0.0341141619775235
  (0, 40155)	0.1572255428828079
  (0, 43583)	0.06290912449056552
  (0, 32944)	0.05521308097106602
  (0, 32134)	0.04926795594368526
  (0, 28831)	0.03301374192895708


## Recommender

In [898]:
# Function that takes in products a user likes and outputs recommended products
def get_recommended_products(user_selection_sentence, k, n=10, include_prev=False, tfidf_model=tfidf, debug=False):
    
    product_mtrx = make_sparse(user_selection_sentence, tfidf_model=tfidf)
    cosine_sim = linear_kernel(product_mtrx, tfidf_matrix)[0]

    # Get the pairwsie similarity scores of all users
    sim_scores = list(enumerate(cosine_sim))

    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top k scores
    if debug:
        sim_scores = sim_scores[1:(k+1)] # In debug mode I'll want to skip the best match since it will just be the input user
    else:
        sim_scores = sim_scores[:k]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # The top k most similar users
    top_k_users = user_products_sentence__prior['user_id'].iloc[user_indices]
    
    # The products purchased from these top k users
    top_k_user_products = user_products_lookup.loc[top_k_users]
    product_list = [i for j in top_k_user_products['product_id'].tolist() for i in j]
    
    # Count the frequency of the items purchased and sort by popularity
    product_counts = dict(Counter(product_list))
    popular_products = sorted(product_counts, key=product_counts.get, reverse=True)
    
    if include_prev:
        # Return the top n purchaed product by the k most similar users
        return popular_products[:n]
    
    else:
        # In this case we filter out items the user has already purchased
        prev_purchases = user_selection_sentence.split()
        prev_purchases = list(map(int, prev_purchases))
        popular_new_products = [product for product in popular_products if product not in prev_purchases]
        
        return popular_new_products[:n]

### Example

In [900]:
product_id_to_name(get_recommended_products(user_selection[0], 200, include_prev=True, debug=True))

['Soda',
 'Original Beef Jerky',
 'Zero Calorie Cola',
 'Bag of Organic Bananas',
 'Pistachios',
 'Trail Mix',
 'Clementines',
 '0% Greek Strained Yogurt',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts']

Actual recommended items from prev model

['Soda',
 'Original Beef Jerky',
 'Zero Calorie Cola',
 'Bag of Organic Bananas',
 'Pistachios',
 'Trail Mix',
 'Clementines',
 '0% Greek Strained Yogurt',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts']

In [901]:
product_id_to_name(get_recommended_products(user_selection[0], 200, include_prev=False, debug=True))

['Trail Mix',
 'Clementines',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts',
 "Crunchy Oats 'n Honey Granola Bars",
 'Mineral Water',
 'Mixed Fruit Fruit Snacks',
 'Dried Mangos',
 'Hass Avocados',
 'Mozzarella String Cheese']

Actual recommended items from prev model

['Trail Mix',
 'Clementines',
 'Almonds',
 'Extra Fancy Unsalted Mixed Nuts',
 "Crunchy Oats 'n Honey Granola Bars",
 'Mineral Water',
 'Mixed Fruit Fruit Snacks',
 'Dried Mangos',
 'Hass Avocados',
 'Mozzarella String Cheese']

In [907]:
tfidf = joblib.load('tfidf.pkl')
tfidf_matrix = joblib.load('tfidx_matrix.pkl')

Use these pickle loaders for actual program