In [1]:
### Imports
### ------------------------> Necessary Block
from pathlib import Path
import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import operator
import ast
from scipy.spatial.distance import cosine
import collections
import heapq
import pickle
import joblib

In [2]:
# Functions

def make_prior_data():
    # Read prior order csv
    df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
    current_order_user_df = df_orders.loc[(df_orders.eval_set == "prior")].reset_index()
    current_order_user_df = current_order_user_df[["order_id", "user_id"]]

    assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_prior["order_id"].unique())

    # Group product_id for each order into products
    df_order_products_prior = df_order_products_prior[["order_id", "product_id"]]
    df_product_frequency = df_order_products_prior['product_id'].value_counts()
    df_order_products_prior = df_order_products_prior.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})
    
    
    assert current_order_user_df.size == df_order_products_prior.size

    df_prior_user_products = pd.merge(current_order_user_df, df_order_products_prior, on="order_id")
    df_prior_user_products = df_prior_user_products[["user_id", "products"]]
    df_prior_user_products = df_prior_user_products.groupby("user_id")["products"].agg(sum).reset_index()

    return df_prior_user_products, df_product_frequency

def save_data_to_disk(dataframe, df_name):
    filepath = "../dataframes/cb/cb_{}.pkl".format(df_name)
    dataframe.to_pickle(filepath)

## Load datasets

In [22]:
### ------------------------> Necessary Block
# Order datasets
df_orders = pd.read_csv("../data/orders.csv")
df_aisles = pd.read_csv("../data/aisles.csv")
df_departments = pd.read_csv("../data/departments.csv")
df_products = pd.read_csv("../data/products.csv")

In [9]:
############################################ Start of Skipping ############################################
# Running time: 3 min
df_prior_user_products, df_product_frequency = make_prior_data()

# Save data to disk, running time : 2 min
save_data_to_disk(df_prior_user_products, "user_products")
save_data_to_disk(df_product_frequency, "product_frequency")
#===========================================  End of Skipping  ============================================

In [9]:
### ------------------------> Necessary Block
# Read pickle from the disk
df_prior_user_products = pd.read_pickle("../dataframes/cb/cb_user_products.pkl")
df_product_frequency = pd.read_pickle("../dataframes/cb/cb_product_frequency.pkl")
df_product_frequency = pd.DataFrame(df_product_frequency).rename(columns={"product_id": "frequency"})

In [14]:
df_prior_user_products.head()

Unnamed: 0,user_id,products
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [15]:
df_product_frequency.head()

Unnamed: 0,frequency
24852,472565
13176,379450
21137,264683
21903,241921
47209,213584


In [14]:
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [44]:
df_aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [16]:
df_departments.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [17]:
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


## Build tf-idf

In [8]:
# Total users (rows/documents) in the utility matrix
print('Total users: {}'.format(len(df_prior_user_products)))

Total users: 206209


In [15]:
############################################ Start of Skipping ############################################
# Calculate term (item) frequency

# Running time: 19 min for the first 100,000 rows
dummy_row = pd.DataFrame([[0, [i+1 for i in range(49688)]]], columns=['user_id','products'])
tf1 = pd.concat([dummy_row, df_prior_user_products[:100000]], ignore_index=True).products.apply(pd.value_counts, dropna=False).fillna(0)

# Save the rest of rows of tf to the disk
# Running time: 
joblib.dump(tf1,'../dataframes/cb/cb_tf1_compressed.pkl', compress=3)
#===========================================  End of Skipping  ============================================

In [5]:
############################################ Start of Skipping ############################################
# Running time: 19 min for the rest of rows
dummy_row = pd.DataFrame([[0, [i+1 for i in range(49688)]]], columns=['user_id','products'])
tf2 = pd.concat([dummy_row, df_prior_user_products[:10]], ignore_index=True).products.apply(pd.value_counts, dropna=False).fillna(0)

# Save the rest of rows of tf to the disk
# Running time: 15 min
joblib.dump(tf2,'../dataframes/cb/cb_tf2_compressed.pkl', compress=3)
#===========================================  End of Skipping  ============================================

In [2]:
# Load tf1 from the disk
# Reading time: 7 min
tf1 = joblib.load("../dataframes/cb/cb_tf1_compressed.pkl")

In [3]:
# Load tf2 from the disk
# Reading time: 9 min
tf2 = joblib.load("../dataframes/cb/cb_tf2_compressed.pkl")

In [6]:
############################################ Start of Skipping ############################################
# Calculate Inverse Document Frequency
# Running time: 11 min
# using simple smoothing
# gt(num) = greater than num
idf = np.log((len(df_prior_user_products) + 1 ) / ((tf1.gt(0).sum() + tf2.gt(0).sum()) + 1)).fillna(0)

# Save idf to the disk

joblib.dump(idf,'../dataframes/cb/cb_idf.pkl')
#===========================================  End of Skipping  ============================================

In [3]:
# Load idf from the disk
idf = joblib.load("../dataframes/cb/cb_idf.pkl")

In [None]:
############################################ Start of Skipping ############################################
# Calculate tf_idf

tf1_idf = tf1 * idf
tf1_idf.head()

# Save tf1_idf to the disk
joblib.dump(tf1_idf,'../dataframes/cb/cb_tf1_idf_compressed.pkl', compress=2)

In [None]:
tf2_idf = tf2 * idf
tf2_idf.head()

# Save tf2_idf to the disk
joblib.dump(tf2_idf,'../dataframes/cb/cb_tf2_idf_compressed.pkl', compress=2)
#===========================================  End of Skipping  ============================================

In [2]:
### ------------------------> Necessary Block
# Load tf1_idf and tf2_idf from the disk
# Running time: 8 min
tf1_idf = joblib.load("../dataframes/cb/cb_tf1_idf_compressed.pkl")

In [3]:
### ------------------------> Necessary Block
# Running time: 9 min
tf2_idf = joblib.load("../dataframes/cb/cb_tf2_idf_compressed.pkl")

In [5]:
# Calculate Cosine Distance
# Running time: 17 min
def fetchMostSimilarUserTo(target_user, tf_idf):
    cos_vec = tf_idf.apply(lambda other_user: 1 - cosine(target_user, other_user), axis=1).fillna(0)
    return cos_vec

def generateCosineSimilarities(target_user, tf1_idf, tf2_idf):
    cos_vec1 = fetchMostSimilarUserTo(target_user, tf1_idf)

    # Save cos_vec1 to the disk
    joblib.dump(cos_vec1,'../dataframes/cb/cb_cos_vec1_target_user_' + str(target_user_id) + '.pkl')
    
    cos_vec2 = fetchMostSimilarUserTo(target_user, tf2_idf)

    # Save cos_vec2 to the disk
    joblib.dump(cos_vec2,'../dataframes/cb/cb_cos_vec2_target_user_' + str(target_user_id) + '.pkl')

In [24]:
def generateRecommendations(target_user_id, K, N, df_prior_user_products, df_product_frequency):

    # Load cos_vec1 and cos_vec2 from the disk
    cos_vec1 = joblib.load('../dataframes/cb/cb_cos_vec1_target_user_' + str(target_user_id) + '.pkl')
    cos_vec2 = joblib.load('../dataframes/cb/cb_cos_vec2_target_user_' + str(target_user_id) + '.pkl')

    # Drop dummy rows
    cos_vec1 = cos_vec1.drop(cos_vec1.index[0])
    cos_vec2 = cos_vec2.drop(cos_vec1.index[0])
    
    # Concat two cos_vecs
    cos_vec = df = pd.concat([cos_vec1, cos_vec2], ignore_index=True)
    
    # Select top K similar users
    top_K_similar_uers = cos_vec.nlargest(K)
    
    # Exclude the user with same purchase history (1.00000) as the target user and implement set-minus
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == target_user_id].products

    # Initialize the result for recommendations
    recommendations = []

    # Products of Target User
    productset_target_user = set(products_target_user.tolist()[0])

    # Fetch the preliminary recommendations
    for i, similarity in top_K_similar_uers.iteritems():

        products_similar_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == i + 1].products

        # Recommend the products bought by the user who firstly differs in the purchase history from A.
        candidate_recommendation = set(products_similar_user.tolist()[0]) - set(products_target_user.tolist()[0])

        # If similarity equals to 1 (namely same purchase history) or the candidate_recommendation is empty,
        # skip current user
        if similarity // 1 == 1 or not candidate_recommendation: continue

        # One candidate_recommendation found, and extend it to the result
        recommendations.extend(candidate_recommendation)

        # If length of recommendations exceed N, break
        if len(recommendations) > N: break
    
    # Pick the top N popularity (overall sales) to recommend
    h = []
    for rec in recommendations:
        heapq.heappush(h, (df_product_frequency.loc[rec]['frequency'], rec))
        if len(h) > N:
            heapq.heappop(h)
            
    return productset_target_user, h

In [7]:
### ------------------------> Necessary Block
# Change target_user_id HERE.
target_user_id = 1
# How many similar users?
K = 10
# How many products to be recommended?
N = 10

if target_user_id <= 100000:
    target_user = tf1_idf.loc[target_user_id]
else:
    target_user = tf2_idf.loc[target_user_id - 100000]

In [None]:
# If the similarity scores of target_user_id already saved to disk, skip this method
generateCosineSimilarities(target_user, tf1_idf, tf2_idf)

In [25]:
### ------------------------> Necessary Block
productset_target_user, recommendations = generateRecommendations(target_user_id, K, N, df_prior_user_products, df_product_frequency)

In [26]:
### ------------------------> Necessary Block
# Output the product_name of Target User's products as well as Recommendations
print('Actual products bought by User {}:'.format(target_user_id))
print([df_products.iloc[product_id]['product_name'] for product_id in productset_target_user])
print()
print('Recommended products for User {}:'.format(target_user_id))
print([df_products.iloc[item[1]]['product_name'] for item in recommendations])

Actual products bought by User 1:
['Organic Honeydew', 'Cold Brew Coffee Tahitian Vanilla', "Spot's Pate Cat Grain Free Ground Whitefish", 'Epic Fruit & Yogurt Filled Pouches', 'Beet Kombucha', 'Broccoli Squash Carrots Onion Red Pepper Steamables', 'Grape Nut Flakes Cereal', 'Triple Distilled Irish Whiskey', "Steam'ables Green Peas", 'Cilantro Bunch', 'Peachtree Schnapps', 'Original Pretzel Crisps', 'Chocolate Caramel Pudding Snack Pack', 'Pasta & Enchilada Sauce, Organic, 7 Veggie', '80% Lean Ground Beef', 'Creamy Chicken & Shrimp in a Parmesan Alfredo Sauce', 'Warrior Blend Vanilla Dietary Supplement', 'Organic Creamy Cashewmilk']

Recommended products for User 1:
['Organic Hot Dog Buns', 'Curly Waves Potato Chips', 'Premium Genoa', 'Gluten Free Pecan Shortbread Cookies', 'Original Sweet Chili Dipping Sauce', 'CleanWear Ultra Thin Regular with Wings Pads', 'Valdosta Pecans With Cranberries, Black Pepper & Orange Zest', 'Original Antacid & Pain Relief Tablets 36 Ct', 'Old Fashioned Ve

# Playground

In [26]:
df1 = pd.DataFrame([
        [1, 196],
        [1, 196],
        [1, 134],
        [1, 196],
        [1, 134]
    ], columns=['order_id', 'product_id'])

df1.head()

Unnamed: 0,order_id,product_id
0,1,196
1,1,196
2,1,134
3,1,196
4,1,134


In [3]:
df2 = pd.DataFrame([
        [2, 196],
        [2, 196],
        [2, 134],
        [2, 196],
        [2, 134]
    ], columns=['order_id', 'products'])

df2.head()

Unnamed: 0,order_id,product_id
0,2,196
1,2,196
2,2,134
3,2,196
4,2,134


In [30]:
df = pd.concat([df1, df2], ignore_index=True)

In [6]:
df2['product_id'][:1].value_counts(dropna=False)

196    1
Name: product_id, dtype: int64

In [5]:
df_products.iloc[0]['product_name']

'Chocolate Sandwich Cookies'

In [29]:

df_prior_user_products[:2].products.apply(pd.value_counts, dropna=False).fillna(0)

0    [196, 14084, 12427, 26088, 26405, 196, 10258, ...
1    [32792, 47766, 20574, 12000, 48110, 22474, 165...
Name: products, dtype: object

In [51]:
dummy_row = pd.DataFrame([[0, [i+1 for i in range(49688)]]], columns=['user_id','products'])
temp = df_prior_user_products[:2]
temp = pd.concat([dummy_row, temp], ignore_index=True)
temp.products.apply(pd.value_counts, dropna=False).fillna(0)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,49679,49680,49681,49682,49683,49684,49685,49686,49687,49688
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
