In [58]:
# Imports

from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [2]:
# Helper Functions

def sparsity(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)


def timestamp():
    now = datetime.now()
    return "{}_{}_{}_{}{}".format(now.year, now.month, now.day, now.hour, now.minute)


def get_k_popular(k, df_merged_order_products_prior):
    popular_products = list(df_merged_order_products_prior["product_id"].value_counts().head(k).index)
    return popular_products

# Load datasets

In [43]:
# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_orders = pd.read_csv("../data/orders.csv") 

# Products
df_products = pd.read_csv("../data/products.csv")
# Merge prior orders and products
df_merged_order_products_prior = pd.merge(df_order_products_prior, df_products, on="product_id", how="left")

# Read user_products and product_frequency from the disk
df_prior_user_products = pd.read_pickle("../dataframes/cb/cb_user_products.pkl")

In [4]:
def make_test_data(test_data_path, df_orders, df_order_products_train):
    start = time.time()
    print("Creating test data ...")

    # Read train csv
    df_order_user_current = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
    df_order_user_current = df_order_user_current[["order_id", "user_id"]]
    
    # Sanity check #1: `current_order_user_df` and `df_order_products_train` should have the same number of 
    # unique order ids
    assert len(df_order_user_current["order_id"].unique()) == len(df_order_products_train["order_id"].unique())

    # Convert train dataframe to a similar format
    df_order_products_test = df_order_products_train[["order_id", "product_id"]]
    df_order_products_test = df_order_products_test.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})

    # Sanity check #2: `df_order_products_test` and `df_order_user_current` should have the same number of 
    # records before attempting to merge them
    assert df_order_products_test.size == df_order_user_current.size

    # Merge on order id
    df_user_products_test = pd.merge(df_order_user_current, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "products"]]

    # Write to disk
    df_user_products_test.to_csv(test_data_path, index_label=False)
    
    print("Completed in {:.2f}s".format(time.time() - start))


# Get test data
REBUILD_TEST_DATA = False
test_data_path = "../data/user_products__test.csv"
if REBUILD_TEST_DATA or not Path(test_data_path).is_file():
    make_test_data(test_data_path, df_orders, df_order_products_train)
df_user_products_test = pd.read_csv(test_data_path)

In [87]:
df_user_products_test.head()

Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [89]:
def func(row):
    print(row["user_id"])

df_user_products_test[:3].apply(func, axis=1)

1
2
5


0    None
1    None
2    None
dtype: object

# Load Product Item Matrix

In [5]:
def get_user_product_prior_df(filepath, df_orders, df_order_products_prior):
    start = time.time()
    print("Creating prior user product data frame ...")
    
    # Consider ony "prior" orders and remove all columns except `user_id` from `df_orders`
    df_order_user_prior = df_orders.loc[df_orders.eval_set == "prior"]
    df_order_user_prior = df_order_user_prior[["order_id", "user_id"]]
    
    # Remove all columns except order_id and user_id from df_orders and 
    # merge the above on `order_id` and remove `order_id`
    df_merged = pd.merge(df_order_user_prior, df_order_products_prior[["order_id", "product_id"]], on="order_id")
    df_user_product_prior = df_merged[["user_id", "product_id"]]
    df_user_product_prior = df_user_product_prior.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})
    
    # Write to disk
    df_user_product_prior.to_csv(filepath, index_label=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Build dataframe of users, products and quantity bought using prior datasets
REBUILD_MATRIX_DF = False
matrix_df_path = "../data/user_products__prior.csv"
if REBUILD_MATRIX_DF or not Path(matrix_df_path).is_file():
    get_user_product_prior_df(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path)
df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")

In [6]:
def build_product_user_matrix(matrix_path, df_user_product_prior):
    start = time.time()
    print("Creating product user matrix ...")
    
    # Make the dataframe a sparse matrix
    df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
    df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")
    product_user_matrix = sparse.coo_matrix((df_user_product_prior["quantity"],
                                            (df_user_product_prior["product_id"].cat.codes.copy(),
                                             df_user_product_prior["user_id"].cat.codes.copy())))
    
    sparse.save_npz(matrix_path, product_user_matrix)
    
    print("Completed in {:.2f}s".format(time.time() - start))


# Get the `product x user` matrix
REBUILD_MATRIX = False
matrix_path = "../data/product_user_matrix.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_user_matrix(matrix_path, df_user_product_prior)
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

In [7]:
# User=1 bought product=196 10 times
assert product_user_matrix[195, 0] == 10

In [8]:
sparsity(product_user_matrix)

99.8700882953749

# TF-IDF

In [9]:
user_product_matrix = product_user_matrix.T

In [10]:
def tfidf_weight(X):
    """ Weights a Sparse Matrix by TF-IDF Weighted """
    X = coo_matrix(X)

    # calculate IDF
    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # apply TF-IDF adjustment
    X.data = sqrt(X.data) * idf[X.col]
    return X

tf_idf = tfidf_weight(user_product_matrix)

In [11]:
tf_idf = tf_idf.tocsr()

In [93]:
target_user_id = 1
target_user = tf_idf[target_user_id - 1]

In [148]:
tf_idf.shape

(206209, 49677)

In [149]:
start = time.time()
similarities = cosine_similarity(tf_idf, target_user, False)
print("Completed in {:.2f}s".format(time.time() - start))

Completed in 0.26s


In [172]:
# 
cos_vec = similarities.toarray()

In [188]:
def generateRecommendations(target_user, cos_vec, K, N, df_prior_user_products):
    # Select top K similar users
    top_K_similar_users = heapq.nlargest(K+1, range(len(cos_vec)), cos_vec.take)
    
    # Exclude the user with same purchase history (1.00000) as the target user and implement set-minus
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == target_user_id].products

    # Initialize the result for recommendations
    recommendations = []

    # Products of Target User
    productset_target_user = set(products_target_user.tolist()[0])

    # Fetch the preliminary recommendations
    for similar_user_id in top_K_similar_users:
        
        products_similar_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == similar_user_id + 1].products

        # Recommend the products bought by the user who firstly differs in the purchase history from A.
        candidate_recommendation = set(products_similar_user.tolist()[0]).intersection(productset_target_user)

        # If similar_user_id equals to target_user_id or the candidate_recommendation is empty,
        # skip current user
        if similar_user_id == target_user_id or not candidate_recommendation: continue

        # One candidate_recommendation found, and extend it to the result
        recommendations.extend(candidate_recommendation)

#         If length of recommendations exceed N, break
        if len(recommendations) > N: break
            
    return productset_target_user, recommendations

In [189]:
productset_target_user, recommendations = generateRecommendations(target_user, similarities.toarray(), 20, 20, df_prior_user_products)

In [190]:
# Output the product_name of Target User's products as well as Recommendations
print('Actual products bought by User {}:'.format(target_user_id))
print([df_products.iloc[product_id]['product_name'] for product_id in productset_target_user])
print()
print('Recommended products for User {}:'.format(target_user_id))
print([df_products.iloc[item]['product_name'] for item in recommendations])

Actual products bought by User 1:
['Organic Honeydew', 'Cold Brew Coffee Tahitian Vanilla', "Spot's Pate Cat Grain Free Ground Whitefish", 'Epic Fruit & Yogurt Filled Pouches', 'Beet Kombucha', 'Broccoli Squash Carrots Onion Red Pepper Steamables', 'Grape Nut Flakes Cereal', 'Triple Distilled Irish Whiskey', "Steam'ables Green Peas", 'Cilantro Bunch', 'Peachtree Schnapps', 'Original Pretzel Crisps', 'Chocolate Caramel Pudding Snack Pack', 'Pasta & Enchilada Sauce, Organic, 7 Veggie', '80% Lean Ground Beef', 'Creamy Chicken & Shrimp in a Parmesan Alfredo Sauce', 'Warrior Blend Vanilla Dietary Supplement', 'Organic Creamy Cashewmilk']

Recommended products for User 1:
['Organic Honeydew', 'Cold Brew Coffee Tahitian Vanilla', 'Beet Kombucha', "Spot's Pate Cat Grain Free Ground Whitefish", 'Epic Fruit & Yogurt Filled Pouches', 'Broccoli Squash Carrots Onion Red Pepper Steamables', 'Grape Nut Flakes Cereal', 'Triple Distilled Irish Whiskey', "Steam'ables Green Peas", 'Cilantro Bunch', 'Peac

# Evaluation

In [182]:
# Get the 10 most popular products
popular_products = get_k_popular(10, df_merged_order_products_prior)

# Maps user_id: user_cat_code
u_dict = {uid:i for i, uid in enumerate(df_user_product_prior["user_id"].cat.categories)}

# Maps product_cat_code: product_id
p_dict = dict(enumerate(df_user_product_prior["product_id"].cat.categories))

# Transpose of the product_user_matrix
user_product_matrix = product_user_matrix.T

In [186]:
# How many users in the test?
total = 500
# Counter
count = 0

def recall_score(actual, pred):
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(actual)

def popular_recommend(row):
    actual = row["products"][1:-1]
    actual = [int(p.strip()) for p in actual.strip().split(",")]
    return recall_score(actual, popular_products)

def tfidf_recommend(row):
    actual = row["products"][1:-1]
    actual = [int(p.strip()) for p in actual.strip().split(",")]
    target_user = tf_idf[row["user_id"] - 1]
    similarities = cosine_similarity(tf_idf, target_user, False)
    cos_vec = similarities.toarray()
    productset_target_user, recommended = generateRecommendations(target_user, cos_vec, 50, 20, df_prior_user_products, df_product_frequency)
    global count
    count += 1
    print("{:.2f}% completed, recall_score = {:.2f}".format(count / total * 100.0, recall_score(actual, recommended)))    
    return recall_score(actual, recommended)

def build_eval_df(filepath, df_user_products_test, subset=None):
    start = time.time()
    print("Building dataframe with recall values ...")
    
    df_eval = df_user_products_test.copy()
    if subset:
        df_eval = df_eval[:subset].copy()
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["tfidf_score"] = df_eval.apply(tfidf_recommend, axis=1)
    df_eval.to_csv(filepath) #, index_label=False)
    
    print("Completed in {:.2f}s".format(time.time() - start))    


# Get the dataframe with recall values of the baseline and the model
REBUILD_EVAL_DF = False
subset = 500
eval_path = "../data/eval/eval_{}.csv".format(subset if subset is not None else "full")
if REBUILD_EVAL_DF or not Path(eval_path).exists():
    build_eval_df(eval_path, df_user_products_test, subset=subset)
df_eval = pd.read_csv(eval_path)

Building dataframe with recall values ...
0.20% completed, recall_score = 0.91
0.40% completed, recall_score = 0.03
0.60% completed, recall_score = 0.00
0.80% completed, recall_score = 0.00
1.00% completed, recall_score = 0.00
1.20% completed, recall_score = 0.00
1.40% completed, recall_score = 0.00
1.60% completed, recall_score = 0.00
1.80% completed, recall_score = 0.00
2.00% completed, recall_score = 0.00
2.20% completed, recall_score = 0.00
2.40% completed, recall_score = 0.00
2.60% completed, recall_score = 0.00
2.80% completed, recall_score = 0.00
3.00% completed, recall_score = 0.00
3.20% completed, recall_score = 0.00
3.40% completed, recall_score = 0.00
3.60% completed, recall_score = 0.00
3.80% completed, recall_score = 0.00
4.00% completed, recall_score = 0.00
4.20% completed, recall_score = 0.00
4.40% completed, recall_score = 0.00
4.60% completed, recall_score = 0.00
4.80% completed, recall_score = 0.00
5.00% completed, recall_score = 0.00
5.20% completed, recall_score = 0

43.40% completed, recall_score = 0.00
43.60% completed, recall_score = 0.00
43.80% completed, recall_score = 0.00
44.00% completed, recall_score = 0.00
44.20% completed, recall_score = 0.00
44.40% completed, recall_score = 0.00
44.60% completed, recall_score = 0.00
44.80% completed, recall_score = 0.00
45.00% completed, recall_score = 0.12
45.20% completed, recall_score = 0.06
45.40% completed, recall_score = 0.00
45.60% completed, recall_score = 0.05
45.80% completed, recall_score = 0.00
46.00% completed, recall_score = 0.00
46.20% completed, recall_score = 0.00
46.40% completed, recall_score = 0.00
46.60% completed, recall_score = 0.00
46.80% completed, recall_score = 0.00
47.00% completed, recall_score = 0.50
47.20% completed, recall_score = 0.00
47.40% completed, recall_score = 0.00
47.60% completed, recall_score = 0.00
47.80% completed, recall_score = 0.00
48.00% completed, recall_score = 0.00
48.20% completed, recall_score = 0.00
48.40% completed, recall_score = 0.00
48.60% compl

86.60% completed, recall_score = 0.00
86.80% completed, recall_score = 0.04
87.00% completed, recall_score = 0.00
87.20% completed, recall_score = 0.00
87.40% completed, recall_score = 0.00
87.60% completed, recall_score = 0.00
87.80% completed, recall_score = 0.00
88.00% completed, recall_score = 0.00
88.20% completed, recall_score = 0.20
88.40% completed, recall_score = 0.00
88.60% completed, recall_score = 0.00
88.80% completed, recall_score = 0.00
89.00% completed, recall_score = 0.00
89.20% completed, recall_score = 0.05
89.40% completed, recall_score = 0.00
89.60% completed, recall_score = 0.00
89.80% completed, recall_score = 0.00
90.00% completed, recall_score = 0.00
90.20% completed, recall_score = 0.00
90.40% completed, recall_score = 0.00
90.60% completed, recall_score = 0.00
90.80% completed, recall_score = 0.00
91.00% completed, recall_score = 0.00
91.20% completed, recall_score = 0.00
91.40% completed, recall_score = 0.00
91.60% completed, recall_score = 0.00
91.80% compl

In [187]:
# Mean recall scores
model_mean_recall, baseline_mean_recall = np.mean(df_eval["tfidf_score"]), np.mean(df_eval["popular_score"])
print("Model: {}".format(model_mean_recall))
print("Baseline: {}".format(baseline_mean_recall))

Model: 0.031384215308010936
Baseline: 0.06807380013600955
