# Import

In [280]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from collections import Counter
import time
from random import choices

# Data

## Load

In [86]:
order_products__train = pd.read_csv("../instacart_2017_05_01/order_products__train.csv")
order_products__prior = pd.read_csv("../instacart_2017_05_01/order_products__prior.csv")
products = pd.read_csv("../instacart_2017_05_01/products.csv")
orders = pd.read_csv("../instacart_2017_05_01/orders.csv")

## Train

In [87]:
orders_train = orders[orders['eval_set'] == 'train'].reset_index()
orders_train = orders_train[['user_id', 'order_id']]

In [88]:
order_products__train = order_products__train[['order_id', 'product_id', 'reordered']]
order_products__train = order_products__train.groupby('order_id')['product_id'].apply(list).reset_index()
order_products__train.head()

Unnamed: 0,order_id,product_id
0,1,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
1,36,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
2,38,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
3,96,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
4,98,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


In [89]:
user_products__train = pd.merge(order_products__train, orders_train, on='order_id')
user_products__train = user_products__train[['user_id', 'product_id']]

In [90]:
user_products__train[user_products__train.duplicated(['user_id'], keep=False)]

Unnamed: 0,user_id,product_id


In [91]:
user_products__train.set_index('user_id', inplace=True)

In [92]:
user_products__train.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
112108,"[49302, 11109, 10246, 49683, 43633, 13176, 472..."
79431,"[39612, 19660, 49235, 43086, 46620, 34497, 486..."
42756,"[11913, 18159, 4461, 21616, 23622, 32433, 2884..."
17227,"[20574, 30391, 40706, 25610, 27966, 24489, 39275]"
56463,"[8859, 19731, 43654, 13176, 4357, 37664, 34065..."


### Save/Load dataframes

In [52]:
user_products__train.to_csv("user_products__train.csv")

In [53]:
try:
    user_products__train
except NameError:
    user_products__train = pd.read_csv("user_products__train.csv")
else:
    print("user_products__train already exists")

user_products__train already exists


In [176]:
test_subjects = user_products__train.index

## Prior

In [6]:
orders_prior = orders[orders['eval_set'] == 'prior'].reset_index()
orders_prior = orders_prior[['user_id', 'order_id']]

In [7]:
order_products__prior = order_products__prior[['order_id', 'product_id']]

In [14]:
user_products__prior = pd.merge(orders_prior, order_products__prior, on='order_id', how='inner')
user_products__prior = user_products__prior[['user_id', 'product_id']]
user_products__prior = user_products__prior.groupby('user_id')['product_id'].apply(list).reset_index()
user_products__prior.head()

Unnamed: 0,user_id,product_id
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


### Save/Load dataframes

In [16]:
try:
    user_products__prior
except NameError:
    user_products__prior = pd.read_csv('user_products__prior.csv')
else:
    print("user_products__prior already exists")

user_products__prior already exists


In [42]:
user_products_lookup = user_products__prior.set_index('user_id', drop=True)
user_products_lookup.head()

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


## Prior sentences

In [17]:
def make_sentences(product_id_list):
    string_list = [str(product_id) for product_id in product_id_list]
    return " ".join(string_list)

In [18]:
user_products_sentence__prior = user_products__prior.copy()
user_products_sentence__prior['product_id'] = user_products_sentence__prior['product_id'].apply(make_sentences)
user_products_sentence__prior.head()

Unnamed: 0,user_id,product_id
0,1,196 14084 12427 26088 26405 196 10258 12427 13...
1,2,32792 47766 20574 12000 48110 22474 16589 3591...
2,3,9387 17668 15143 16797 39190 47766 21903 39922...
3,4,36606 7350 35469 2707 42329 7160 1200 17769 43...
4,5,15349 21413 48775 28289 8518 11777 31717 26604...


In [19]:
user_products_sentence__prior.to_csv('user_products_sentence__prior.csv')

In [20]:
try:
    user_products__prior
except NameError:
    user_products_sentence__prior = pd.read_csv('user_products_sentence__prior.csv')
else:
    print("user_products_sentence__prior already exists")

user_products_sentence__prior already exists


# Model

In [24]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(user_products_sentence__prior['product_id'])

In [33]:
# This will convert your user_ids to the matrix indices
indices = pd.Series(user_products_sentence__prior.index, index=user_products_sentence__prior['user_id']).drop_duplicates()

In [34]:
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Not enough memory for this unfortunately

In [None]:
# This approach also doesn't work due to memory issues

# # Function that takes in user_id as input and outputs most similar movies
# def get_recommendations(user_id, cosine_sim=cosine_sim, k):
#     # Get the index of the user that matches the user_id
#     idx = indices[user_id]

#     # Get the pairwsie similarity scores of all movies with that movie
#     sim_scores = list(enumerate(cosine_sim[idx]))

#     # Sort the users based on the similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     # Get the scores of the k most similar movies, skip the first one since it will be itself
#     sim_scores = sim_scores[1:k]

#     # Get the user indices
#     user_indices = [i[0] for i in sim_scores]

#     # Return the top 10 most similar users
#     return user_products_sentence__prior['user_id'].iloc[user_indices]

In [286]:
# Function that takes in user_id as input and outputs most similar movies
def get_recommendations(user_id, k, n=10):
    # Get the index of the user that matches the user_id
    idx = indices[user_id]
    
    cosine_sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix)[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim))

    start = time.time()
    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the k most similar movies, skip the first one since it will be itself
    sim_scores = sim_scores[1:k]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # The top k most similar users
    top_k_users = user_products_sentence__prior['user_id'].iloc[user_indices]
    
    # The products purchased from these top k users
    top_k_user_products = user_products_lookup.loc[top_k_users]
    product_list = [i for j in top_k_user_products['product_id'].tolist() for i in j]
    
    # Count the frequency of the items purchased and sort by popularity
    product_counts = dict(Counter(product_list))
    popular_products = sorted(product_counts, key=product_counts.get, reverse=True)
    
    # Return the top n purchaed product by the k most similar users
    return popular_products[:n]
    

In [250]:
# Recommener function will output product_ids. Use this dict to then get human names for food
product_id_to_name_dict = pd.Series(products['product_name'].values, index=products['product_id']).to_dict()

def product_id_to_name(product_ids):
    return [product_id_to_name_dict[id] for id in product_ids]

In [251]:
def get_products_really_purcased(user_id):
    return user_products__train.loc[user_id].tolist()[0]

## Example

In [287]:
start = time.time()
recommended_items = get_recommendations(13, 20)
print(product_id_to_name(recommended_items))

print(f"Took {time.time() - start}s")

['Whole Milk', 'Half & Half', 'Fat Free Milk', 'Almond Breeze Original Almond Milk', 'Original Pure Creamy Almond Milk', 'Semi-Sweet Chocolate Premium Baking Chips', 'Original Coconut Milk Beverage', 'Ice Bag', 'Lemonade', 'Original Chai Tea Latte']
Took 0.9555344581604004s


Takes around a second for each prediction—which is very long when I think of all the test data I have

In [230]:
real_items = get_products_really_purcased(13)
product_id_to_name(real_items)

['Whole Wheat Pita Bread Loaves',
 'Half & Half',
 'Whole Milk',
 'Lavender & Witch Hazel Alcohol-Free Toner',
 'Fat Free Smooth & Creamy Plain Organic Yogurt']

In [232]:
# Accuracy
len(set(recommended_items).intersection(set(real_items))) / len(set(real_items))

0.4

# Hyperparameter optimization

In [240]:
len(test_subjects)

131209

In [182]:
progress = len(test_subjects)

In [190]:
def get_model_accuracy(user_ids, k, n=10):
    accuracies = []
    for user_id in user_ids:
        recommended_items = set(get_recommendations(user_id, k, n))
        real_items = set(get_products_really_purcased(user_id))
        accuracy = len(recommended_items.intersection(real_items)) / len(real_items)
        accuracies.append(accuracy)
    model_accuracy = np.mean(accuracies)
    return model_accuracy

In [None]:
# While it would be nice to do this, it simply takes too long
k_values = [50, 100, 200, 300]

accuracies = {}
for k in k_values:
    accuracy = get_model_accuracy(test_subjects, k, n=10)
    accuracies[k] = accuracy
    print(f"{k}: {accuracy * 100}%")

In [290]:
random_users = choices(test_subjects, k=10000)

In [289]:
start = time.time()
k_values = [50, 100, 200]

accuracies = {}
for k in k_values:
    accuracy = get_model_accuracy(random_users, k, n=10)
    print(f"{k}: {accuracy * 100}%")
    accuracies[k] = accuracy
print(f"This took {time.time() - start}s")

NameError: name 'accuracies_50' is not defined