In [1]:
# Imports

from pathlib import Path
import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np

In [25]:
# Helper Functions

def make_test_data(test_data_path):
    """Assemble per-user product lists for the current orders and save them as CSV.

    Reads ``../data/order_products__train.csv`` and
    ``../data/sample_submission.csv``, joins their per-order product lists to
    the orders flagged ``"train"`` or ``"test"`` in the notebook-level
    ``df_orders`` frame, and writes the resulting ``user_id`` / ``products``
    columns to ``test_data_path``.

    Parameters
    ----------
    test_data_path : str or path-like
        Destination of the CSV file.

    Raises
    ------
    AssertionError
        If the order ids of the two CSV files overlap, or if their counts or
        sizes do not add up to those of the selected ``df_orders`` rows.
    """
    # Read train and submission order csvs
    train_products = pd.read_csv("../data/order_products__train.csv")
    submission = pd.read_csv("../data/sample_submission.csv")

    # Orders whose eval_set is "train" or "test", reduced to the join columns
    is_current = (df_orders.eval_set == "train") | (df_orders.eval_set == "test")
    order_users = df_orders.loc[is_current].reset_index()[["order_id", "user_id"]]

    # Every selected order must come from exactly one of the two files
    n_current = len(order_users["order_id"].unique())
    n_train = len(train_products["order_id"].unique())
    n_submission = len(submission["order_id"].unique())
    assert n_current == n_train + n_submission

    # Bring both frames to the same layout: one row per order with a "products" list
    train_lists = (
        train_products[["order_id", "product_id"]]
        .groupby("order_id")["product_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"product_id": "products"})
    )
    # The submission file stores the products as one whitespace-separated string
    submission["products"] = submission["products"].apply(lambda raw: raw.strip().split())

    submission_ids = set(submission["order_id"].unique().tolist())
    train_ids = set(train_lists["order_id"].unique().tolist())
    assert submission_ids.isdisjoint(train_ids)

    combined = pd.concat([train_lists, submission])

    # Both frames have two columns, so comparing .size compares row counts
    assert combined.size == train_lists.size + submission.size
    assert combined.size == order_users.size

    user_products = pd.merge(order_users, combined, on="order_id")[["user_id", "products"]]
    user_products.to_csv(test_data_path, index_label=False)


def build_product_item_matrix(matrix_path, orders=None, order_products_prior=None):
    """Build the product x user purchase-count matrix and save it as ``.npz``.

    Each stored value is the number of "prior" order lines in which a user
    bought a product. Rows are products (in order of first appearance in the
    grouped frame) and columns are users (sorted by ``user_id``).

    Parameters
    ----------
    matrix_path : str or path-like
        Destination passed to ``scipy.sparse.save_npz``.
    orders : pandas.DataFrame, optional
        Frame with ``order_id``, ``user_id`` and ``eval_set`` columns.
        Defaults to the notebook-level ``df_orders``.
    order_products_prior : pandas.DataFrame, optional
        Frame with ``order_id`` and ``product_id`` columns.
        Defaults to the notebook-level ``df_order_products_prior``.

    Returns
    -------
    None
        The matrix is written to ``matrix_path``; nothing is returned.
    """
    # Fall back to the notebook-level frames so one-argument calls keep working.
    if orders is None:
        orders = df_orders
    if order_products_prior is None:
        order_products_prior = df_order_products_prior

    # Consider only "prior" orders and keep only order_id and user_id
    prior_order_user = orders.loc[orders.eval_set == "prior", ["order_id", "user_id"]]

    # Keep only order_id and product_id from the prior order lines
    prior_order_product = order_products_prior[["order_id", "product_id"]]

    merged_order_product_user = pd.merge(prior_order_user, prior_order_product, on="order_id")

    # One row per (user, product) pair with the number of matching order lines
    df_user_product = (
        merged_order_product_user[["user_id", "product_id"]]
        .groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    users = list(np.sort(df_user_product.user_id.unique()))
    products = list(df_user_product.product_id.unique())
    quantity = list(df_user_product.quantity)

    # Map ids to 0-based positions. `Series.astype('category', categories=...)`
    # is no longer accepted by pandas, so build the Categorical explicitly.
    col_indices = pd.Categorical(df_user_product.user_id, categories=users).codes
    row_indices = pd.Categorical(df_user_product.product_id, categories=products).codes

    product_user_matrix = sparse.csr_matrix(
        (quantity, (row_indices, col_indices)),
        shape=(len(products), len(users)),
    )
    sparse.save_npz(matrix_path, product_user_matrix)



def sparsity(matrix):
    """Return the percentage of cells in ``matrix`` that are not stored.

    Parameters
    ----------
    matrix : object with ``shape`` and ``size``
        A 2-D matrix. For a scipy sparse matrix ``size`` is the number of
        stored entries; for a dense NumPy array ``size`` is the total number
        of elements, so the result is always 0.

    Returns
    -------
    float
        ``(1 - size / (rows * cols)) * 100``.

    Raises
    ------
    ZeroDivisionError
        If either dimension of ``matrix`` is zero.
    """
    # Use the argument rather than a notebook-level global.
    total_size = matrix.shape[0] * matrix.shape[1]
    stored_size = matrix.size
    return (1 - (stored_size / total_size)) * 100

## Load datasets

In [21]:
# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_orders = pd.read_csv("../data/orders.csv")

# If test data csv doesn't exist already, create it.
# make_test_data reads the notebook-level df_orders, so it must be loaded first.
test_data_path = "../data/test_user_products.csv"
if not Path(test_data_path).is_file():
    make_test_data(test_data_path)
df_test_user_products = pd.read_csv(test_data_path)

## Load Product Item Matrix

In [26]:
# Build the matrix once and cache it on disk; later runs only load the file.
matrix_path = "../data/product_item_matrix.npz"
if not Path(matrix_path).is_file():
    build_product_item_matrix(matrix_path)

# The saved matrix has one row per product and one column per user, and the
# model cells below refer to it as `product_user_matrix`.
product_user_matrix = sparse.load_npz(matrix_path)
# Keep the original name as an alias for any code that still uses it.
product_item_matrix = product_user_matrix

ParserError: Error tokenizing data. C error: Expected 1 fields in line 11, saw 3


## Fit ALS model

In [213]:
# Scaling factor: the purchase counts are multiplied by alpha before fitting.
alpha = 40

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=50)

# train the model on a sparse matrix of item/user/confidence weights
# NOTE(review): which orientation fit() expects (item x user or user x item)
# depends on the installed `implicit` version — confirm against its docs.
model.fit((product_user_matrix * alpha).astype("double"))

In [453]:
# recommend items for a user
user_product = product_user_matrix.T.tocsr()
recommendations = model.recommend(3, user_product)

In [454]:
# Display the result; the saved output is a list of (integer, score) pairs.
# NOTE(review): the integers are presumably matrix row positions, not product_ids — confirm.
recommendations

[(2628, 0.64243280626946464),
 (1612, 0.63063645073686225),
 (5, 0.61472722835790572),
 (72, 0.58491845959547506),
 (131, 0.58118900190071177),
 (132, 0.57183228826313393),
 (1174, 0.56919853368919515),
 (243, 0.5662215853151904),
 (1018, 0.56375476302698635),
 (2162, 0.55582760599714343)]