In [2]:
# Imports

from pathlib import Path
from datetime import datetime

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle

In [3]:
# Helper Functions

def sparsity(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)


def timestamp():
    now = datetime.now()
    return "{}_{}_{}_{}{}".format(now.year, now.month, now.day, now.hour, now.minute)


def get_k_popular(k):
    popular_products = list(df_order_products_prior_merged["product_id"].value_counts().head(k).index)
    return popular_products

## Load datasets

In [5]:
def make_test_data(test_data_path):
    # Read train and test order csvs
    df_order_products_train = pd.read_csv("../data/order_products__train.csv")
#     df_order_products_submission = pd.read_csv("../data/sample_submission.csv")
#     current_order_user_df = df_orders.loc[(df_orders.eval_set == "train") | (df_orders.eval_set == "test")].reset_index()
    current_order_user_df = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
    current_order_user_df = current_order_user_df[["order_id", "user_id"]]

    assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_train["order_id"].unique())# + len(df_order_products_submission["order_id"].unique())

    # Convert train and submission dataframes into the same format
    df_order_products_train = df_order_products_train[["order_id", "product_id"]]
    df_order_products_train = df_order_products_train.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})
#     df_order_products_submission["products"] = df_order_products_submission["products"].apply(lambda p: p.strip().split())

#     test = set(df_order_products_submission["order_id"].unique().tolist())
#     train = set(df_order_products_train["order_id"].unique().tolist())
#     assert len(test & train) == 0

#     df_test_order_products = pd.concat([df_order_products_train, df_order_products_submission])
    df_test_order_products = df_order_products_train

    assert df_test_order_products.size == df_order_products_train.size # + df_order_products_submission.size
    assert df_test_order_products.size == current_order_user_df.size

    df_test_user_products = pd.merge(current_order_user_df, df_test_order_products, on="order_id")
    df_test_user_products = df_test_user_products[["user_id", "products"]]

    df_test_user_products.to_csv(test_data_path, index_label=False)


# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_orders = pd.read_csv("../data/orders.csv") 

# If test data csv doesn't exist already, create it
test_data_path = "../data/test_user_products1.csv"
if not Path(test_data_path).is_file():
    make_test_data(test_data_path)
df_test_user_products = pd.read_csv(test_data_path)

# Merge Prior orders and Products
df_products = pd.read_csv("../data/products.csv")
df_order_products_prior_merged = pd.merge(df_order_products_prior, df_products, on="product_id", how="left")

## Load Product Item Matrix

In [7]:
def build_product_item_matrix(matrix_path):
    # Consider only "prior" orders and remove all columns except order_id and user_id from df_orders
    prior_order_user = df_orders.loc[df_orders.eval_set == "prior"]
    prior_order_user = prior_order_user[["order_id", "user_id"]]

    # Remove all columns except order_id and user_id from df_orders
    prior_order_product = df_order_products_prior[["order_id", "product_id"]]
    
    merged_order_product_user = pd.merge(prior_order_user, prior_order_product, on="order_id")
    df_user_product = merged_order_product_user[["user_id", "product_id"]]
    df_user_product = df_user_product.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})
    
    users = list(np.sort(df_user_product.user_id.unique()))
    products = list(df_user_product.product_id.unique())
    quantity = list(df_user_product.quantity)

    col_indices = df_user_product.user_id.astype('category', categories = users).cat.codes 
    row_indices = df_user_product.product_id.astype('category', categories = products).cat.codes 
    print(col_indices)
    product_user_matrix = sparse.csr_matrix((quantity, (row_indices, col_indices)), shape=(len(products), len(users)))
    sparse.save_npz(matrix_path, product_user_matrix)


REBUILD_MATRIX = False
matrix_path = "../data/product_item_matrix.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_item_matrix(matrix_path)
product_user_matrix = sparse.load_npz(matrix_path)
# product_user_matrix

In [8]:
sparsity(product_user_matrix)

99.8700882953749

## Model

In [9]:
def build_als(prod_user_matrix, alpha, factors, regularization, iterations):
    # build model
    model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
    model.approximate_similar_items = False
    model.fit((product_user_matrix * alpha).astype("double"))

    # save model to disk
    filepath = "../models/als/als_{}.model".format(timestamp())
    with open(filepath, "wb+") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
    return filepath

# Build Model
REBUILD_MODEL = False
model_path = "../models/als/als_2017_11_21_1338.model"
if REBUILD_MODEL or not Path(model_path).exists():
    model_path = build_als(product_user_matrix, alpha=40, factors=64, regularization=0.01, iterations=15)
with open(model_path, "rb") as f:
    als_model = pickle.load(f)

In [10]:
# recommend items for a user
user_id = 17
recommendations = als_model.recommend(user_id, product_user_matrix.T, N = 10)

In [29]:
# Actual 
row = df_test_user_products.loc[df_test_user_products.user_id == user_id]
actual = list(row["products"])
actual = actual[0][1:-1]
actual = list(np.array([p.strip() for p in actual.strip().split(",")]).astype(np.int64))
act_products = []
for pid in actual:
    act_products.extend((df_products.loc[df_products.product_id == pid].product_name).tolist())
print("Actual products bought by user: {}\n{}".format(user_id, act_products))

# Recommended
print("Recommendations for user: {}\n".format(user_id))
r = [r[0] for r in recommendations]
rec_products = []
for pid in r:
    rec_products.extend((df_products.loc[df_products.product_id == pid].product_name).tolist())
print("Recommended products bought by user: {}\n{}".format(user_id, rec_products))


Actual products bought by user: 17
['Grade A Extra Large Eggs', 'Select-A-Size Paper Towels, White, 2 Huge Rolls = 5 Regular Rolls  Towels/Napkins', 'Light Spread Butter Substitute', 'Strawberries', 'Raspberries', 'Ultra Soft Bathroom Tissue Double Rolls']
Recommendations for user: 17

Recommended products bought by user: 17
['Gochujang Sauce', 'Green Chile Anytime Sauce', 'Organic Honeycrisp Apples', 'XXX Acai Blueberry Pomegranate', 'Wheat Chex Cereal', 'Brioche Bachelor Loaf', 'Yogurt Strawberry Pomegranate', 'Organic Sprouted Barley Bread', 'Nutter Butter Cookie Bites Go-Pak', 'Fabric Softener, Geranium Scent']


In [158]:
user_id = 17
print("Actual products bought by user: {}\n".format(user_id))

row = df_test_user_products.loc[df_test_user_products.user_id == user_id]
actual = list(row["products"])
actual = actual[0][1:-1]
actual = list(np.array([p.strip() for p in actual.strip().split(",")]).astype(np.int64))
for pid in actual:
    print(df_products.loc[df_products.product_id == pid].product_name)

Actual products bought by user: 17

18533    Grade A Extra Large Eggs
Name: product_name, dtype: object
1216    Select-A-Size Paper Towels, White, 2 Huge Roll...
Name: product_name, dtype: object
12719    Light Spread Butter Substitute
Name: product_name, dtype: object
16796    Strawberries
Name: product_name, dtype: object
43351    Raspberries
Name: product_name, dtype: object
4373    Ultra Soft Bathroom Tissue Double Rolls
Name: product_name, dtype: object


In [97]:
r.intersection(actual)

set()

## Evaluation

In [105]:
popular_products = get_k_popular(10)

In [108]:
def recall_score(actual, pred):
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(actual)


user_product_matrix = product_user_matrix.T.tocsr()
model_recalls = []
popular_recalls = []

for index, row in df_test_user_products.iterrows():
    try:
        actual = row["products"][1:-1]
        actual = [p.strip() for p in actual.strip().split(",")]
    
        recommended = als_model.recommend(row["user_id"], user_product_matrix, N=10)
        recommended = [r[0] for r in recommended]
        
        model_recalls.append(recall_score(actual, recommended))
        popular_recalls.append(recall_score(actual, popular_products))
        
        if index == 10:
            break
            
    except Exception as e:
        print("Index Error")
        print("index: {}".format(index))
        print("user_id: {}".format(row["user_id"]))
        print("actual: {}".format(actual))
        print("popular: {}".format(popular_products[:5]))
        raise e

    
model_mean_recall = np.mean(model_recalls)
popular_mean_recall = np.mean(popular_recalls)

print("Model: {}".format(model_mean_recall))
print("Popular: {}".format(popular_mean_recall))

Index Error
index: 131208
user_id: 206209
actual: ['6846', '9405', '24852', '40603', '15655', '42606', '37966', '39216']
popular: ['24852', '13176', '21137', '21903', '47209']


IndexError: index 206209 is out of bounds for axis 0 with size 206209

In [110]:
np.mean(model_recalls)

0.00012778417197992074

In [111]:
np.mean(popular_recalls)

0.069842730885500859

In [112]:
0.069*100

6.9