In [2]:
# Imports

from pathlib import Path
from datetime import datetime

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle

In [30]:
# Helper Functions

def sparsity(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)


def timestamp():
    now = datetime.now()
    return "{}_{}_{}_{}{}".format(now.year, now.month, now.day, now.hour, now.minute)

## Load datasets

In [4]:
def make_test_data(test_data_path):
    # Read train and test order csvs
    df_order_products_train = pd.read_csv("../data/order_products__train.csv")
    df_order_products_submission = pd.read_csv("../data/sample_submission.csv")
    current_order_user_df = df_orders.loc[(df_orders.eval_set == "train") | (df_orders.eval_set == "test")].reset_index()
    current_order_user_df = current_order_user_df[["order_id", "user_id"]]

    assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_train["order_id"].unique()) + len(df_order_products_submission["order_id"].unique())

    # Convert train and submission dataframes into the same format
    df_order_products_train = df_order_products_train[["order_id", "product_id"]]
    df_order_products_train = df_order_products_train.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})
    df_order_products_submission["products"] = df_order_products_submission["products"].apply(lambda p: p.strip().split())

    test = set(df_order_products_submission["order_id"].unique().tolist())
    train = set(df_order_products_train["order_id"].unique().tolist())
    assert len(test & train) == 0

    df_test_order_products = pd.concat([df_order_products_train, df_order_products_submission])

    assert df_test_order_products.size == df_order_products_train.size + df_order_products_submission.size
    assert df_test_order_products.size == current_order_user_df.size

    df_test_user_products = pd.merge(current_order_user_df, df_test_order_products, on="order_id")
    df_test_user_products = df_test_user_products[["user_id", "products"]]

    df_test_user_products.to_csv(test_data_path, index_label=False)


# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_orders = pd.read_csv("../data/orders.csv") # If test data csv doesn't exist already, create it

test_data_path = "../data/test_user_products.csv"
if not Path(test_data_path).is_file():
    make_test_data(test_data_path)
df_test_user_products = pd.read_csv(test_data_path)

In [5]:
# Merge Prior orders and Products
df_products = pd.read_csv("../data/products.csv")

df_order_products_prior_merged = pd.merge(df_order_products_prior, df_products, on="product_id", how="left")

In [6]:
df_order_products_prior_merged.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2,33120,1,1,Organic Egg Whites,86,16
1,2,28985,2,1,Michigan Organic Kale,83,4
2,2,9327,3,0,Garlic Powder,104,13
3,2,45918,4,1,Coconut Butter,19,13
4,2,30035,5,0,Natural Sweetener,17,13


## Load Product Item Matrix

In [7]:
def build_product_item_matrix(matrix_path):
    # Consider only "prior" orders and remove all columns except order_id and user_id from df_orders
    prior_order_user = df_orders.loc[df_orders.eval_set == "prior"]
    prior_order_user = prior_order_user[["order_id", "user_id"]]

    # Remove all columns except order_id and user_id from df_orders
    prior_order_product = df_order_products_prior[["order_id", "product_id"]]
    
    merged_order_product_user = pd.merge(prior_order_user, prior_order_product, on="order_id")
    df_user_product = merged_order_product_user[["user_id", "product_id"]]
    df_user_product = df_user_product.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})
    
    users = list(np.sort(df_user_product.user_id.unique()))
    products = list(df_user_product.product_id.unique())
    quantity = list(df_user_product.quantity)

    col_indices = df_user_product.user_id.astype('category', categories = users).cat.codes 
    row_indices = df_user_product.product_id.astype('category', categories = products).cat.codes 
    product_user_matrix = sparse.csr_matrix((quantity, (row_indices, col_indices)), shape=(len(products), len(users)))
    sparse.save_npz(matrix_path, product_user_matrix)

    
matrix_path = "../data/product_item_matrix.npz"
if not Path(matrix_path).is_file():
    build_product_item_matrix(matrix_path)
product_user_matrix = sparse.load_npz(matrix_path)
product_user_matrix

<49677x206209 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in Compressed Sparse Row format>

In [8]:
sparsity(product_user_matrix)

99.8700882953749

## Model

In [34]:
def build_als(factors, regularization, iterations):
    # build model
    model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
    # save model to disk
    filepath = "../models/als/als_{}".format(timestamp())
    with open(filepath, "wb+") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
    return filepath

# Build Model
REBUILD_MODEL = False
model_path = "../models/als/als_2017_11_21_050.model"
if REBUILD_MODEL or not Path(model_path).exists():
    model_path = build_als(factors=64, regularization=0.1, iterations=50)
with open(model_path, "rb") as f:
    als_model = pickle.load(f)

In [35]:
# Fit model
alpha = 40
als_model.approximate_similar_items = False
als_model.fit(product_user_matrix.astype("double"))

In [36]:
# recommend items for a user
user_id = 3
recommendations = als_model.recommend(user_id, product_user_matrix.T, N = 10)

In [37]:
print("Recommendations for user: {}".format(user_id))
[r[0] for r in recommendations]

Recommendations for user: 3


[93, 55, 968, 157, 1845, 68, 199, 468, 111, 1649]

In [39]:
print("Actual products bought by user: {}".format(user_id))
df_test_user_products.loc[df_test_user_products.user_id == user_id]

Actual products bought by user: 3


Unnamed: 0,user_id,products
2,3,"['39276', '29259']"


## Evaluation

In [50]:
def recall_score(actual, pred):
    if len(actual) != len(pred):
        raise ValueError("Input Vector size mismatch")
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(pred)

model_recalls = []
popular_recalls = []
popular_products = df_order_products_prior_merged["product_id"].value_counts().head(50).index
user_product_matrix = product_user_matrix.T.tocsr()

for index, row in df_test_user_products.iterrows():
    try:
        actual = row["products"][1:-1]
        actual = [p.strip() for p in actual.strip().split(",")]
    
        recommended = model.recommend(row["user_id"], user_product_matrix, N=len(actual))
        recommended = [str(r[0]) for r in recommended]
        
        model_recalls.append(recall_score(actual, recommended))
        popular_recalls.append(recall_score(actual, popular_products[:len(actual)]))
        if index == 100:
            break
            
    except IndexError as e:
        print("Index Error")
        print("index: {}".format(index))
        print("row: {}".format(row))
        print("actual: {}".format(actual))
        raise e

    
model_mean_recall = np.mean(model_recalls)
popular_mean_recall = np.mean(popular_recalls)

print("Model: {}".format(model_mean_recall))
print("Popular: {}".format(popular_mean_recall))

ValueError: Input Vector size mismatch