In [1]:
# Imports
from implicit.als import AlternatingLeastSquares
from datetime import datetime
from pathlib import Path

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time

In [168]:
# Helper Functions

def sparsity(matrix):
    """Return how empty ``matrix`` is, as a percentage in the range 0-100.

    The value is ``(1 - matrix.size / (rows * cols)) * 100``, where the row
    and column counts come from ``matrix.shape`` and ``matrix.size`` is taken
    as the number of occupied cells (for SciPy sparse matrices this is the
    number of stored entries).
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    fill_ratio = matrix.size / (n_rows * n_cols)
    return (1 - fill_ratio) * 100


def timestamp():
    """Return the current local time as a ``YYYY_MM_DD_HHMM`` string.

    Every field is zero-padded. Without padding, the hour/minute group is
    ambiguous (hour=1, minute=23 and hour=12, minute=3 would both render as
    ``123``) and the strings would not sort chronologically.

    Returns:
        str: e.g. ``"2019_03_07_0905"``.
    """
    return datetime.now().strftime("%Y_%m_%d_%H%M")


def get_k_popular(k, df_merged_order_products_prior):
    """Return the ``k`` most frequent ``product_id`` values, most frequent first.

    Frequencies are the number of rows in ``df_merged_order_products_prior``
    carrying each ``product_id``.
    """
    purchase_counts = df_merged_order_products_prior["product_id"].value_counts()
    return list(purchase_counts.index[:k])

## Load datasets

In [3]:
# Order tables: order metadata plus the "prior" and "train" order lines
df_orders = pd.read_csv("../data/orders.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")

# Product catalogue
df_products = pd.read_csv("../data/products.csv")

# Prior order lines with the product columns attached (left join keeps every order line)
df_merged_order_products_prior = df_order_products_prior.merge(df_products, on="product_id", how="left")

In [129]:
def make_test_data(test_data_path, df_orders, df_order_products_train):
    """Build one basket per "train" order and write the result to CSV.

    Args:
        test_data_path: destination of the CSV file.
        df_orders: frame with at least ``order_id``, ``user_id`` and
            ``eval_set`` columns; only rows with ``eval_set == "train"`` are used.
        df_order_products_train: frame with at least ``order_id`` and
            ``product_id`` columns (one row per product in an order).

    Raises:
        AssertionError: if the two inputs disagree on the number of orders.

    The file has the columns ``user_id`` and ``products``; ``products`` holds
    the list of product ids of the order, serialised by ``DataFrame.to_csv``.
    """
    started_at = time.time()
    print("Creating test data ...")

    # (order_id, user_id) pairs of the orders flagged as "train"
    is_train_order = df_orders["eval_set"] == "train"
    train_orders = df_orders.loc[is_train_order, ["order_id", "user_id"]].reset_index(drop=True)

    # Sanity check #1: both inputs must reference the same number of distinct orders
    n_orders_in_meta = len(train_orders["order_id"].unique())
    n_orders_in_lines = len(df_order_products_train["order_id"].unique())
    assert n_orders_in_meta == n_orders_in_lines

    # Collapse the order lines into one row per order holding the list of its products
    baskets = (
        df_order_products_train[["order_id", "product_id"]]
        .groupby("order_id")["product_id"]
        .apply(list)
        .rename("products")
        .reset_index()
    )

    # Sanity check #2: both frames have exactly two columns here, so equal row
    # counts is the same condition as equal `.size`
    assert baskets.shape[0] == train_orders.shape[0]

    # Attach the user to every basket and keep only what the evaluation needs
    user_baskets = train_orders.merge(baskets, on="order_id")[["user_id", "products"]]

    # Write to disk
    user_baskets.to_csv(test_data_path, index_label=False)

    print("Completed in {:.2f}s".format(time.time() - started_at))


# Get test data.
# The CSV acts as an on-disk cache: it is (re)built only when REBUILD_TEST_DATA is
# set or the file does not exist yet, and is then always read back from disk.
REBUILD_TEST_DATA = False
test_data_path = "../data/user_products__test.csv"
if REBUILD_TEST_DATA or not Path(test_data_path).is_file():
    make_test_data(test_data_path, df_orders, df_order_products_train)
# After the CSV round-trip the `products` column holds strings such as "[196, 25133]",
# not lists; later cells parse that text back into integers.
df_user_products_test = pd.read_csv(test_data_path)

## Load Product-User Matrix

In [132]:
def build_prior_user_product_df(df_orders, df_order_products_prior):
    """Count how often each user bought each product across their "prior" orders.

    Args:
        df_orders: frame with at least ``order_id``, ``user_id`` and
            ``eval_set`` columns; only rows with ``eval_set == "prior"`` are used.
        df_order_products_prior: frame with at least ``order_id`` and
            ``product_id`` columns (one row per product in an order).

    Returns:
        DataFrame with the columns ``user_id``, ``product_id`` (both
        categorical) and ``quantity`` (number of matching order lines).
    """
    started_at = time.time()
    print("Creating prior user product data frame ...")

    # Only "prior" orders matter, and of those only the order -> user mapping
    is_prior_order = df_orders["eval_set"] == "prior"
    prior_orders = df_orders.loc[is_prior_order, ["order_id", "user_id"]]

    # Attach the user to every order line; `order_id` is only the join key
    order_lines = df_order_products_prior[["order_id", "product_id"]]
    user_product_pairs = prior_orders.merge(order_lines, on="order_id")[["user_id", "product_id"]]

    # One row per (user, product) with the number of times that pair occurs
    pair_counts = user_product_pairs.groupby(["user_id", "product_id"]).size()
    df_user_product_prior = pair_counts.reset_index().rename(columns={0: "quantity"})

    # Categorical ids provide the integer codes used as sparse-matrix indices
    for id_column in ("user_id", "product_id"):
        df_user_product_prior[id_column] = df_user_product_prior[id_column].astype("category")

    print("Completed in {:.2f}s".format(time.time() - started_at))
    return df_user_product_prior


def build_product_user_matrix(matrix_path, df_user_product_prior):
    """Build the sparse ``product x user`` quantity matrix and save it as ``.npz``.

    Args:
        matrix_path: destination passed to ``scipy.sparse.save_npz``.
        df_user_product_prior: frame with ``user_id``, ``product_id`` and
            ``quantity`` columns. The id columns may or may not already be
            categorical. The frame is NOT modified (the previous version
            re-assigned both id columns on the caller's frame).

    Rows of the matrix are product category codes and columns are user
    category codes; the shape is inferred by ``coo_matrix`` from the largest
    code on each axis.
    """
    start = time.time()
    print("Creating product user matrix ...")

    # Derive the integer codes from local categorical Series so the caller's
    # DataFrame is left untouched.
    product_codes = df_user_product_prior["product_id"].astype("category").cat.codes
    user_codes = df_user_product_prior["user_id"].astype("category").cat.codes

    product_user_matrix = sparse.coo_matrix((df_user_product_prior["quantity"],
                                            (product_codes, user_codes)))

    sparse.save_npz(matrix_path, product_user_matrix)

    print("Completed in {:.2f}s".format(time.time() - start))


# Build dataframe of users, products and quantity bought using prior datasets
df_user_product_prior = build_prior_user_product_df(df_orders, df_order_products_prior)

# Get the `product x user` matrix.
# The .npz file acts as an on-disk cache: it is (re)built only when REBUILD_MATRIX is
# set or the file does not exist yet, and is then always read back from disk as CSR.
# NOTE(review): the frame above is recomputed on every run while the matrix may come
# from an older cache; if the input CSVs changed in between, the category codes used
# by later cells may no longer line up with the cached matrix - set REBUILD_MATRIX = True
# in that case.
REBUILD_MATRIX = False
matrix_path = "../data/product_user_matrix.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_user_matrix(matrix_path, df_user_product_prior)
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

Creating prior user product data frame ...
Completed in 22.41s
Creating product user matrix ...
Completed in 10.83s


In [133]:
# Spot check: user=1 bought product=196 10 times.
# The matrix is indexed [product code, user code]; row 195 / column 0 are taken to be
# the category codes of product_id 196 and user_id 1 (presumably because the ids are
# contiguous and start at 1 - TODO confirm).
assert product_user_matrix[195, 0] == 10

In [134]:
# Percentage of (product, user) cells without a stored quantity
sparsity(product_user_matrix)

99.8700882953749

## Build Model

In [135]:
# Scratch check: multiplying the sparse matrix by a scalar scales every stored quantity.
# 15 is the same value later used as `alpha` in `model_params`.
temp = product_user_matrix * 15
print(temp[195, 0])

150


In [143]:
def build_imf(prod_user_matrix, **kwargs):
    """Train an ``AlternatingLeastSquares`` model and pickle it to disk.

    Args:
        prod_user_matrix: matrix that is multiplied by ``alpha``, cast to
            double and handed to ``AlternatingLeastSquares.fit``.
        **kwargs: required keys
            ``alpha``   - scalar the matrix is multiplied by before fitting,
            ``factors`` - forwarded to ``AlternatingLeastSquares(factors=...)``,
            ``path``    - file the fitted model is pickled to.

    Raises:
        KeyError: if any required key is missing. This is raised before
            training so a typo does not cost a full fit.
    """
    missing = [key for key in ("alpha", "factors", "path") if key not in kwargs]
    if missing:
        raise KeyError("build_imf() missing required keyword argument(s): {}".format(", ".join(missing)))

    start = time.time()

    # Create the destination folder up front: previously a missing folder only
    # surfaced as an error after the (long) fit, discarding the trained model.
    model_path = Path(kwargs["path"])
    model_path.parent.mkdir(parents=True, exist_ok=True)

    # Build model
    print("Building model with alpha: {} and factors: {} ...".format(kwargs["alpha"], kwargs["factors"]))
    model = AlternatingLeastSquares(factors=kwargs["factors"])
    model.approximate_similar_items = False
    model.fit((prod_user_matrix * kwargs["alpha"]).astype("double"))

    # Save model to disk (write-only binary mode; the file is never read back here)
    with model_path.open("wb") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    print("Completed in {:.2f}s".format(time.time() - start))

# Specify model params and build it.
# The model file name encodes alpha and factors, so each parameter combination
# gets its own file.
model_params = {"alpha": 15, "factors": 100}
model_params["path"] = "../models/imf/imf_a{}_f{}.imf".format(model_params["alpha"], model_params["factors"])

# NOTE(review): REBUILD_MODEL is True, so the model is retrained on every run even if
# a saved file exists (the other REBUILD_* flags in this notebook default to False).
REBUILD_MODEL = True
if REBUILD_MODEL or not Path(model_params["path"]).exists():
    build_imf(product_user_matrix, **model_params)
# The model is always read back from disk. pickle.load can execute arbitrary code,
# so only load model files produced by this notebook.
with open(model_params["path"], "rb") as f:
    imf_model = pickle.load(f)

Building model with alpha: 15 and factors: 100 ...
Completed in 114.34s


## Try Out Recommendations

In [144]:
# Preview the (user_id, product_id, quantity) rows the matrix was built from
df_user_product_prior.head()

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


In [160]:
# user_id -> user category code (the position of the id within the categories)
u_dict = {
    uid: code
    for code, uid in enumerate(df_user_product_prior["user_id"].cat.categories)
}

# product category code -> product_id
p_dict = {
    code: pid
    for code, pid in enumerate(df_user_product_prior["product_id"].cat.categories)
}

In [191]:
# Recommend items for a user.
# `recommend` is given the user's category code (not the raw user_id) and the
# transposed, i.e. `user x product`, matrix; N is the number of items requested.
# NOTE(review): transposing the CSR matrix yields a CSC matrix - confirm the installed
# `implicit` version accepts that here.
user_id = 1
recommendations = imf_model.recommend(u_dict[user_id], product_user_matrix.T, N = 10)

In [195]:
# df_user_product_prior.loc[df_user_product_prior.user_id == user_id]

In [196]:
# u_p_m = product_user_matrix.T
# print(set(u_p_m[u_dict[user_id]].indices))

In [197]:
# Actual: products in the user's test basket
row = df_user_products_test.loc[df_user_products_test["user_id"] == user_id]
# The basket is stored as the text of a list ("[a, b, ...]"): drop the brackets,
# split on commas and convert every token to an integer product id
actual = row["products"].tolist()[0][1:-1]
actual = list(np.asarray([token.strip() for token in actual.strip().split(",")]).astype(np.int64))
act_products = [
    name
    for pid in actual
    for name in df_products.loc[df_products["product_id"] == pid, "product_name"].tolist()
]
print("Actual products bought by user {}\n{}".format(user_id, act_products))

# Recommended: the first element of every recommendation is a product category code,
# which `p_dict` maps back to the product_id
r = [p_dict[rec[0]] for rec in recommendations]
rec_products = [
    name
    for pid in r
    for name in df_products.loc[df_products["product_id"] == pid, "product_name"].tolist()
]
print("\nRecommendations for user {}\n{}".format(user_id, rec_products))

Actual products bought by user 1
['Soda', 'Organic String Cheese', '0% Greek Strained Yogurt', 'XL Pick-A-Size Paper Towel Rolls', 'Milk Chocolate Almonds', 'Pistachios', 'Cinnamon Toast Crunch', 'Aged White Cheddar Popcorn', 'Organic Whole Milk', 'Organic Half & Half', 'Zero Calorie Cola']

Recommendations for user 1
['Organic Half & Half', 'Trail Mix', '0% Greek Strained Yogurt', 'Clementines', 'Extra Fancy Unsalted Mixed Nuts', 'Soda', 'Original Beef Jerky', "Crunchy Oats 'n Honey Granola Bars", 'Zero Calorie Cola', 'Milk Chocolate Almonds']


## Evaluation

In [198]:
# Baseline for the evaluation: the 10 product ids that occur most often in the prior order lines
popular_products = get_k_popular(10, df_merged_order_products_prior)

In [209]:
# Start the wall-clock timer for the evaluation; the elapsed time is printed after the loop
start = time.time()
print("Evaluating model ...")
    
def recall_score(actual, pred):
    """Return the fraction of distinct ``actual`` items that also appear in ``pred``.

    Args:
        actual: iterable of the items that were really bought.
        pred: iterable of the predicted / recommended items.

    Returns:
        float in [0, 1]. An empty ``actual`` yields 0.0 instead of raising
        ``ZeroDivisionError`` (recall is undefined there; 0.0 is used so a
        single empty basket cannot abort an evaluation run).
    """
    actual, pred = set(actual), set(pred)
    if not actual:
        return 0.0
    return len(actual.intersection(pred)) / len(actual)


# `user x product` view of the matrix. Converted to CSR once, up front, because the
# loop below looks up one user (= one row) per iteration and row access is what CSR
# is efficient at; a plain transpose of a CSR matrix is CSC.
user_product_matrix = product_user_matrix.T.tocsr()
model_recalls = []
popular_recalls = []
skipped_rows = 0

# Iterate over the two columns directly instead of `iterrows()`: faster, and the
# values keep their column dtype.
for test_user_id, products_repr in zip(df_user_products_test["user_id"],
                                       df_user_products_test["products"]):
    try:
        # `products` holds the text of a list ("[a, b, ...]"): drop the brackets,
        # split on commas and convert every token to an integer product id
        actual = [int(p.strip()) for p in products_repr[1:-1].strip().split(",")]
        user_code = u_dict[test_user_id]
    except (KeyError, ValueError, TypeError):
        # KeyError: the user has no prior orders, so the model has no code for them.
        # ValueError / TypeError: the basket text is empty, malformed or missing.
        # Such rows cannot be scored; count them instead of hiding them.
        skipped_rows += 1
        continue

    # The first element of every recommendation is a product category code,
    # which `p_dict` maps back to the product_id
    recommended = imf_model.recommend(user_code, user_product_matrix, N=10)
    recommended = [p_dict[r[0]] for r in recommended]

    model_recalls.append(recall_score(actual, recommended))
    popular_recalls.append(recall_score(actual, popular_products))


model_mean_recall = np.mean(model_recalls)
popular_mean_recall = np.mean(popular_recalls)

print("Completed in {:.2f}s".format(time.time() - start))
print("Skipped rows: {}".format(skipped_rows))

print("Model: {}".format(model_mean_recall))
print("Popular: {}".format(popular_mean_recall))

Model: 0.14335396125057287
Popular: 0.06973728391982736
