In [1]:
# Imports
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from datetime import datetime
from pathlib import Path

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time

In [2]:
# Helper Functions

def sparsity(matrix):
    """Return the sparsity of a 2-D matrix as a percentage.

    ``matrix.size`` is taken as the number of populated entries (for scipy
    sparse matrices this is the stored-entry count) and is compared against
    the total number of cells given by ``matrix.shape``.
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    filled_fraction = matrix.size / (n_rows * n_cols)
    return (1 - filled_fraction) * 100


def timestamp(now=None):
    """Return a zero-padded timestamp string such as ``"2020_01_02_0304"``.

    Every field has a fixed width, so different times can never collapse
    into the same string (unpadded, 01:23 and 12:03 both end in ``_123``)
    and the strings sort chronologically.

    Args:
        now: Optional ``datetime`` to format. Defaults to the current local
            time, which keeps the original zero-argument call working.

    Returns:
        str: ``YYYY_MM_DD_HHMM``.
    """
    if now is None:
        now = datetime.now()
    return now.strftime("%Y_%m_%d_%H%M")


def get_k_popular(k, df_merged_order_products_prior):
    """Return the ids of the ``k`` most frequently occurring products.

    Popularity is the number of rows in which a ``product_id`` appears;
    the result is ordered from most to least frequent.
    """
    purchase_counts = df_merged_order_products_prior["product_id"].value_counts()
    top_k = purchase_counts.head(k)
    return top_k.index.tolist()

## Load datasets

In [3]:
# Raw order tables: line items of the "prior" and "train" orders, plus the order headers
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_orders = pd.read_csv("../data/orders.csv")

# Product catalogue
df_products = pd.read_csv("../data/products.csv")

# Attach the product columns to every prior line item; the left join keeps
# line items even when their product_id has no match in the catalogue
df_merged_order_products_prior = df_order_products_prior.merge(
    df_products, on="product_id", how="left"
)

In [4]:
def make_test_data(test_data_path, df_orders, df_order_products_train):
    """Build the per-user test set from the "train" orders and write it to CSV.

    Each order with ``eval_set == "train"`` is matched to the products it
    contains, giving one row per order with the ``user_id`` and the list of
    purchased ``product_id`` values. The list is written as its string form
    (e.g. ``"[5, 6]"``), so readers of the CSV have to parse it back.

    Args:
        test_data_path: Destination CSV path.
        df_orders: Order headers with ``order_id``, ``user_id``, ``eval_set``.
        df_order_products_train: Line items with ``order_id``, ``product_id``.

    Raises:
        AssertionError: If the two inputs do not describe the same orders.
    """
    start = time.time()
    print("Creating test data ...")

    # Map every "train" order to the user who placed it
    df_order_user_current = df_orders.loc[df_orders.eval_set == "train", ["order_id", "user_id"]]

    # Sanity check #1: both inputs should have the same number of unique order ids
    assert len(df_order_user_current["order_id"].unique()) == len(df_order_products_train["order_id"].unique())

    # Collapse the line items into one list of product ids per order
    df_order_products_test = (
        df_order_products_train[["order_id", "product_id"]]
        .groupby("order_id")["product_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"product_id": "products"})
    )

    # Sanity check #2: same number of records (rows, not cells) on both sides
    # before attempting to merge them
    assert len(df_order_products_test) == len(df_order_user_current)

    # Merge on order id and keep only what the evaluation needs
    df_user_products_test = pd.merge(df_order_user_current, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "products"]]

    # Write to disk without the positional index: `index_label=False` used to
    # emit an unlabeled index column, leaving the header one field short
    df_user_products_test.to_csv(test_data_path, index=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Get test data: regenerate the cached CSV when it is missing or when a
# rebuild is forced, then always load it back from disk
REBUILD_TEST_DATA = False
test_data_path = "../data/user_products__test.csv"
if not Path(test_data_path).is_file() or REBUILD_TEST_DATA:
    make_test_data(test_data_path, df_orders, df_order_products_train)
df_user_products_test = pd.read_csv(test_data_path)

Creating test data ...
Completed in 13.63s


In [5]:
# Guard against a truncated or corrupted cache: the test set must contain
# exactly this many rows
assert df_user_products_test.shape[0] == 131209

## Load Product-User Matrix

In [6]:
def get_user_product_prior_df(filepath, df_orders, df_order_products_prior):
    """Count how often each user bought each product in "prior" orders and save it.

    Despite the ``get_`` prefix, nothing is returned: the resulting table
    (``user_id``, ``product_id``, ``quantity``) is written to ``filepath``.

    Args:
        filepath: Destination CSV path.
        df_orders: Order headers with ``order_id``, ``user_id``, ``eval_set``.
        df_order_products_prior: Line items with ``order_id``, ``product_id``.
    """
    start = time.time()
    print("Creating prior user-product data frame ...")

    # Consider only "prior" orders and keep just the order -> user mapping
    df_order_user_prior = df_orders.loc[df_orders.eval_set == "prior", ["order_id", "user_id"]]

    # Attach the user to every line item, then drop `order_id`
    df_merged = pd.merge(df_order_user_prior, df_order_products_prior[["order_id", "product_id"]], on="order_id")

    # One row per (user, product) pair; `quantity` is the number of line items
    # in which the pair occurs
    df_user_product_prior = (
        df_merged[["user_id", "product_id"]]
        .groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    # Write to disk without the positional index: `index_label=False` used to
    # emit an unlabeled index column, leaving the header one field short
    df_user_product_prior.to_csv(filepath, index=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Build dataframe of users, products and quantity bought using prior datasets.
# The ids are stored as categoricals so that their category codes can serve
# as matrix row / column indices later on.
REBUILD_MATRIX_DF = False
matrix_df_path = "../data/user_products__prior.csv"
if not Path(matrix_df_path).is_file() or REBUILD_MATRIX_DF:
    get_user_product_prior_df(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path).astype(
    {"user_id": "category", "product_id": "category"}
)

Creating prior user - product data frame ...
Completed in 61.91s


In [7]:
def build_product_user_matrix(matrix_path, df_user_product_prior):
    """Build the sparse ``product x user`` quantity matrix and save it as ``.npz``.

    Rows are the category codes of ``product_id``, columns the category codes
    of ``user_id``, and values come from the ``quantity`` column. The input
    frame is left untouched: the categorical encoding is done on local
    Series instead of being written back into the caller's columns.

    Args:
        matrix_path: Destination path handed to ``scipy.sparse.save_npz``.
        df_user_product_prior: Frame with ``user_id``, ``product_id``, ``quantity``.
    """
    start = time.time()
    print("Creating product user matrix ...")

    # Encode the ids as category codes without mutating the caller's frame
    product_codes = df_user_product_prior["product_id"].astype("category").cat.codes
    user_codes = df_user_product_prior["user_id"].astype("category").cat.codes

    # Make the dataframe a sparse matrix
    product_user_matrix = sparse.coo_matrix(
        (df_user_product_prior["quantity"].values, (product_codes.values, user_codes.values))
    )

    sparse.save_npz(matrix_path, product_user_matrix)

    print("Completed in {:.2f}s".format(time.time() - start))


# Get the `product x user` matrix: rebuild the cached .npz when forced or
# missing, then load it in CSR form
REBUILD_MATRIX = True
matrix_path = "../data/product_user_matrix.npz"
if not Path(matrix_path).is_file() or REBUILD_MATRIX:
    build_product_user_matrix(matrix_path, df_user_product_prior)
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

Creating product user matrix ...
Completed in 11.04s


In [8]:
# Spot check: user 1 bought product 196 ten times.
# The matrix is indexed by category codes (row = product code, column = user
# code); using 195 / 0 here assumes the ids are contiguous and start at 1 so
# that code == id - 1 — TODO confirm.
assert product_user_matrix[195, 0] == 10

In [9]:
# Percentage of (product, user) cells with no stored entry
sparsity(product_user_matrix)

99.8700882953749

## Build Model

In [11]:
from implicit.nearest_neighbours import TFIDFRecommender


def build_tfidf(prod_user_matrix):
    """Fit a ``TFIDFRecommender`` on the given matrix and return it.

    Prints the wall-clock time spent constructing and fitting the model.
    """
    start = time.time()

    # Build model
    print("Building TFIDF model ...")
    recommender = TFIDFRecommender()
    recommender.fit(prod_user_matrix)

    elapsed = time.time() - start
    print("Completed in {:.2f}s".format(elapsed))
    return recommender


tfidf_model = build_tfidf(product_user_matrix)

Building TFIDF model ...
Completed in 11.11s


In [46]:
start = time.time()

# The user x product view is the same for every user, so transpose and convert
# it once; doing it inside the loop repeated the conversion of the full matrix
# on each of the 100 iterations.
u_p_m = product_user_matrix.T.tocsr()

for i in range(100):
    # Score all products for user `i`: their purchase row times the
    # model's similarity matrix
    target_user = u_p_m[i]
    recommendations = target_user.dot(tfidf_model.similarity)

print("Completed in {:.2f}s".format(time.time() - start))

Completed in 64.87s


In [10]:
def build_imf(prod_user_matrix, **kwargs):
    """Train an ``AlternatingLeastSquares`` model and pickle it to disk.

    Keyword Args:
        alpha: Confidence weighting. Any real number (int, float, numpy
            scalar) multiplies the matrix before fitting; any other value
            selects ``bm25_weight`` instead.
        path: File the fitted model is pickled to. Missing parent
            directories are created.

    Raises:
        KeyError: If ``alpha`` or ``path`` is not supplied. This is raised
            before any training starts.
    """
    import numbers  # local import: only needed for the alpha type check

    alpha, model_path = kwargs["alpha"], kwargs["path"]
    start = time.time()

    # Build model
    print("Building IMF model with alpha: {}".format(alpha))

    # Create the output directory before fitting so that a long training run
    # is not lost to a missing folder at save time
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)

    model = AlternatingLeastSquares()
    model.approximate_similar_items = False

    # `isinstance(alpha, int)` sent float alphas (e.g. 0.5, 15.0) down the
    # BM25 branch even though they were logged as the alpha in use
    if isinstance(alpha, numbers.Real):
        model.fit((prod_user_matrix * alpha).astype("double"))
    else:
        model.fit(bm25_weight(prod_user_matrix))

    # Save model to disk
    with open(model_path, "wb") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    print("Completed in {:.2f}s".format(time.time() - start))

# Specify model params and build it; the model file is named after alpha
model_params = {"alpha": 15}
model_params.update(path="../models/imf/{}.imf".format(model_params["alpha"]))

REBUILD_MODEL = False
if not Path(model_params["path"]).exists() or REBUILD_MODEL:
    build_imf(product_user_matrix, **model_params)

# NOTE: unpickling executes code from the file; only load model files that
# were produced by this notebook
with open(model_params["path"], "rb") as model_file:
    imf_model = pickle.load(model_file)

Building IMF model with alpha: 15
Completed in 139.15s


## Try out recommendations

In [11]:
# user_id -> user category code (the user's column in product_user_matrix)
u_dict = dict(zip(
    df_user_product_prior["user_id"].cat.categories,
    range(len(df_user_product_prior["user_id"].cat.categories)),
))

# product category code -> product_id
p_dict = {
    code: pid
    for code, pid in enumerate(df_user_product_prior["product_id"].cat.categories)
}

In [16]:
# Top-10 recommendations for one user; the model is given the user's category
# code and the user x product (transposed) matrix
user_id = 1
recommendations = imf_model.recommend(
    u_dict[user_id],
    product_user_matrix.T.tocsr(),
    N=10,
)

In [17]:
# Actual: the test CSV stores the product list as text such as "[196, 25133]",
# so strip the brackets and parse the comma-separated ids
row = df_user_products_test.loc[df_user_products_test.user_id == user_id]
actual = list(row["products"])[0][1:-1]
id_tokens = [p.strip() for p in actual.strip().split(",")]
actual = list(np.array(id_tokens).astype(np.int64))
act_products = [
    name
    for pid in actual
    for name in df_products.loc[df_products.product_id == pid, "product_name"].tolist()
]
print("Actual products bought by user {}\n{}".format(user_id, act_products))

# Recommended: take the first element of each recommendation (the product
# category code) and map it back to a product_id
r = [p_dict[rec[0]] for rec in recommendations]
rec_products = [
    name
    for pid in r
    for name in df_products.loc[df_products.product_id == pid, "product_name"].tolist()
]
print("\nRecommendations for user {}\n{}".format(user_id, rec_products))

Actual products bought by user 1
['Soda', 'Organic String Cheese', '0% Greek Strained Yogurt', 'XL Pick-A-Size Paper Towel Rolls', 'Milk Chocolate Almonds', 'Pistachios', 'Cinnamon Toast Crunch', 'Aged White Cheddar Popcorn', 'Organic Whole Milk', 'Organic Half & Half', 'Zero Calorie Cola']

Recommendations for user 1
['Clementines', 'Trail Mix', 'Extra Fancy Unsalted Mixed Nuts', "Crunchy Oats 'n Honey Granola Bars", 'Whole Grain Cheddar Baked Snack Crackers', 'Mineral Water', 'Organic Simply Naked Pita Chips', 'Apples', 'Popcorn', 'Drinking Water']


## Evaluation

In [18]:
# Baseline recommendations: the 10 most popular products
popular_products = get_k_popular(10, df_merged_order_products_prior)

# user_id -> user category code
u_dict = dict(zip(
    df_user_product_prior["user_id"].cat.categories,
    range(len(df_user_product_prior["user_id"].cat.categories)),
))

# product category code -> product_id
p_dict = {
    code: pid
    for code, pid in enumerate(df_user_product_prior["product_id"].cat.categories)
}

# user x product view (transpose of the product_user_matrix) in CSR form
user_product_matrix = product_user_matrix.T.tocsr()

In [19]:
# Stored entries of row 0 (the first user code): (row, product code) -> quantity
print(user_product_matrix[0])

  (0, 195)	10
  (0, 10254)	9
  (0, 10322)	1
  (0, 12423)	10
  (0, 13028)	3
  (0, 13172)	2
  (0, 14080)	1
  (0, 17118)	1
  (0, 25129)	8
  (0, 26083)	2
  (0, 26400)	2
  (0, 30444)	1
  (0, 35945)	1
  (0, 38920)	1
  (0, 39649)	1
  (0, 41779)	1
  (0, 46139)	3
  (0, 49224)	2


In [20]:
# Product category codes with a stored entry for the first user; the row of a
# CSR matrix is already CSR, so no further conversion is needed
user_product_matrix[0].indices

array([  195, 10254, 10322, 12423, 13028, 13172, 14080, 17118, 25129,
       26083, 26400, 30444, 35945, 38920, 39649, 41779, 46139, 49224], dtype=int32)

In [49]:
def recall_score(actual, pred):
    """Return the fraction of distinct ``actual`` items that appear in ``pred``.

    Duplicates on either side are ignored. An empty ``actual`` yields ``0``.
    """
    if len(actual) < 1:
        return 0
    relevant = set(actual)
    hits = relevant & set(pred)
    return len(hits) / len(relevant)


def new_products(row):
    """Return the product ids in ``row["products"]`` that the user has no prior entry for.

    ``row["products"]`` is the textual list stored in the test CSV (e.g.
    ``"[196, 25133]"``). Prior purchases are read from the module-level
    ``user_product_matrix`` through the ``u_dict`` / ``p_dict`` lookups.
    """
    id_tokens = row["products"][1:-1].strip().split(",")
    bought_now = {int(token.strip()) for token in id_tokens}

    user_code = u_dict[row["user_id"]]
    bought_before = {p_dict[code] for code in user_product_matrix[user_code].indices}

    return bought_now - bought_before


def popular_recommend(row):
    """Recall of the global ``popular_products`` list against the user's new products."""
    return recall_score(new_products(row), popular_products)

             
def imf_recommend(row):
    """Recall of the IMF model's top-10 recommendations against the user's new products."""
    actual = new_products(row)
    user_code = u_dict[row["user_id"]]
    top_n = imf_model.recommend(user_code, user_product_matrix, N=10)

    # The first element of each recommendation is the product category code
    product_ids = []
    for rec in top_n:
        product_ids.append(p_dict[rec[0]])
    return recall_score(actual, product_ids)

             
def build_eval_df(product_user_matrix, df_user_products_test, filepath=None, subset=None):
    """Score the baseline and the IMF model per test user and return the table.

    Adds ``popular_score`` and ``imf_score`` (recall values) to a copy of the
    test frame, optionally on a random sample of it.

    Args:
        product_user_matrix: Unused; kept so existing callers keep working.
            The scoring functions read the module-level matrices instead.
        df_user_products_test: Test frame with ``user_id`` and ``products``.
        filepath: Optional CSV destination. Missing parent directories are
            created. When ``None`` nothing is written and the result is only
            returned (it used to be discarded).
        subset: Optional fraction of rows to evaluate, sampled with a fixed
            ``random_state`` so repeated runs score the same users.

    Returns:
        pandas.DataFrame: The evaluated rows with both score columns.
    """
    start = time.time()
    print("Building dataframe with recall values ...")

    df_eval = df_user_products_test.copy()
    if subset:
        df_eval = df_eval.sample(n=int(len(df_eval) * subset), random_state=7)
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["imf_score"] = df_eval.apply(imf_recommend, axis=1)

    if filepath is not None:
        # Scoring is the expensive part; make sure the target folder exists
        # so the result is not lost when writing
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        df_eval.to_csv(filepath)

    print("Completed in {:.2f}s".format(time.time() - start))
    return df_eval


# Get the dataframe with recall values of the baseline and the model.
# `subset` is the fraction of test users to score; None scores all of them.
REBUILD_EVAL_DF = True
subset = 0.05
eval_path = "../data/eval/eval_discovery_{}.csv".format("full" if subset is None else subset)
if not Path(eval_path).exists() or REBUILD_EVAL_DF:
    build_eval_df(product_user_matrix, df_user_products_test, filepath=eval_path, subset=subset)
df_eval = pd.read_csv(eval_path)

Building dataframe with recall values ...
Completed in 23.43s


In [51]:
# Mean recall over the evaluated users: IMF model vs. popularity baseline
model_mean_recall = np.mean(df_eval["imf_score"])
baseline_mean_recall = np.mean(df_eval["popular_score"])
print("Model: {}".format(model_mean_recall))
print("Baseline: {}".format(baseline_mean_recall))

Model: 0.0418546512876242
Baseline: 0.02702136841701362
