In [447]:
### Imports
from pathlib import Path
import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np

In [446]:
# Functions

def make_test_data(test_data_path):
    # Read train and test order csvs
    df_order_products_train = pd.read_csv("../data/order_products__train.csv")
    df_order_products_submission = pd.read_csv("../data/sample_submission.csv")
    current_order_user_df = df_orders.loc[(df_orders.eval_set == "train") | (df_orders.eval_set == "test")].reset_index()
    current_order_user_df = current_order_user_df[["order_id", "user_id"]]

    assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_train["order_id"].unique()) + len(df_order_products_submission["order_id"].unique())

    # Convert train and submission dataframes into the same format
    df_order_products_train = df_order_products_train[["order_id", "product_id"]]
    df_order_products_train = df_order_products_train.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})
    df_order_products_submission["products"] = df_order_products_submission["products"].apply(lambda p: p.strip().split())

    test = set(df_order_products_submission["order_id"].unique().tolist())
    train = set(df_order_products_train["order_id"].unique().tolist())
    assert len(test & train) == 0

    df_test_order_products = pd.concat([df_order_products_train, df_order_products_submission])

    assert df_test_order_products.size == df_order_products_train.size + df_order_products_submission.size
    assert df_test_order_products.size == current_order_user_df.size

    df_test_user_products = pd.merge(current_order_user_df, df_test_order_products, on="order_id")
    df_test_user_products = df_test_user_products[["user_id", "products"]]

    df_test_user_products.to_csv(test_data_path, index_label=False)

## Load datasets

In [448]:
# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_orders = pd.read_csv("../data/orders.csv")

In [449]:
# If test data csv doesn't exist already, create it
test_data_path = "../data/test_user_products.csv"
if not Path(test_data_path).is_file():
    make_test_data(test_data_path)
    df_test_user_products = pd.read_csv(test_data_path)

In [450]:
df_test_user_products.head()

Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,3,"['39276', '29259']"
3,4,"['39276', '29259']"
4,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."


In [136]:
df_order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [137]:
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


## Clean data

In [138]:
# Consider only "prior" orders and remove all columns except order_id and user_id from df_orders
prior_order_user = df_orders.loc[df_orders.eval_set == "prior"]
prior_order_user = prior_order_user[["order_id", "user_id"]]

In [139]:
prior_order_user.head()

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [140]:
# Remove all columns except order_id and user_id from df_orders
prior_order_product = df_order_products_prior[["order_id", "product_id"]]

In [141]:
prior_order_product.head()

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035


## Build user product matrix

In [178]:
merged_order_product_user = pd.merge(prior_order_user, prior_order_product, on="order_id")

In [179]:
merged_order_product_user.head()

Unnamed: 0,order_id,user_id,product_id
0,2539329,1,196
1,2539329,1,14084
2,2539329,1,12427
3,2539329,1,26088
4,2539329,1,26405


In [193]:
df_user_product = merged_order_product_user[["user_id", "product_id"]]

Unnamed: 0,user_id,product_id
0,1,196
1,1,14084
2,1,12427
3,1,26088
4,1,26405


In [194]:
df_user_product = df_user_product.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})
df_user_product.head()

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


In [195]:
df_user_product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13307953 entries, 0 to 13307952
Data columns (total 3 columns):
user_id       int64
product_id    int64
quantity      int64
dtypes: int64(3)
memory usage: 304.6 MB


In [201]:
users = list(np.sort(df_user_product.user_id.unique()))
products = list(df_user_product.product_id.unique())
quantity = list(df_user_product.quantity)

col_indices = df_user_product.user_id.astype('category', categories = users).cat.codes 
row_indices = df_user_product.product_id.astype('category', categories = products).cat.codes 

product_user_matrix = sparse.csr_matrix((quantity, (row_indices, col_indices)), shape=(len(products), len(users)))
product_user_matrix

<49677x206209 sparse matrix of type '<class 'numpy.int64'>'
	with 13307953 stored elements in Compressed Sparse Row format>

In [210]:
# Compute Sparsity
total_size = product_user_matrix.shape[0] * product_user_matrix.shape[1]
actual_size = product_user_matrix.size
sparsity = (1 - (actual_size / total_size)) * 100
sparsity

99.8700882953749

## Fit ALS model

In [213]:
alpha = 40

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=50)

# train the model on a sparse matrix of item/user/confidence weights
model.fit((product_user_matrix * alpha).astype("double"))

In [453]:
# recommend items for a user
user_product = product_user_matrix.T.tocsr()
recommendations = model.recommend(3, user_product)

In [454]:
recommendations

[(2628, 0.64243280626946464),
 (1612, 0.63063645073686225),
 (5, 0.61472722835790572),
 (72, 0.58491845959547506),
 (131, 0.58118900190071177),
 (132, 0.57183228826313393),
 (1174, 0.56919853368919515),
 (243, 0.5662215853151904),
 (1018, 0.56375476302698635),
 (2162, 0.55582760599714343)]