In [1]:
### Imports
import pandas as pd
import numpy as np
import sys
import scipy.sparse as sparse
import scipy.sparse.linalg as linalg
from scipy.sparse import coo_matrix, csr_matrix
from numpy import bincount, log, sqrt
import itertools
import time
from pathlib import Path

In [2]:
# path for data files
base_path="../data/"
base_path_factors='../factors/'
product_user_matrix_path=base_path+"product_user_matrix.npz"
product_factors_svd_path=base_path_factors+"product_factors_svd"
user_factors_svd_path=base_path_factors+"user_factors_svd"
test_data_path = base_path+'user_products__test.csv'


# Load datasets

In [3]:
# Order datasets
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_orders = pd.read_csv("../data/orders.csv") 

# Products
df_products = pd.read_csv("../data/products.csv")

In [4]:
def make_test_data(test_data_path, df_orders, df_order_products_train):
    start = time.time()
    print("Creating test data ...")

    # Read train csv
    df_order_user_current = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
    df_order_user_current = df_order_user_current[["order_id", "user_id"]]
    
    # Sanity check #1: `current_order_user_df` and `df_order_products_train` should have the same number of 
    # unique order ids
    assert len(df_order_user_current["order_id"].unique()) == len(df_order_products_train["order_id"].unique())

    # Convert train dataframe to a similar format
    df_order_products_test = df_order_products_train[["order_id", "product_id"]]
    df_order_products_test = df_order_products_test.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})

    # Sanity check #2: `df_order_products_test` and `df_order_user_current` should have the same number of 
    # records before attempting to merge them
    assert df_order_products_test.size == df_order_user_current.size

    # Merge on order id
    df_user_products_test = pd.merge(df_order_user_current, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "products"]]

    # Write to disk
    df_user_products_test.to_csv(test_data_path, index_label=False)
    
    print("Completed in {:.2f}s".format(time.time() - start))


# Get test data
REBUILD_TEST_DATA = False
test_data_path = "../data/user_products__test.csv"
if REBUILD_TEST_DATA or not Path(test_data_path).is_file():
    make_test_data(test_data_path, df_orders, df_order_products_train)
df_user_products_test = pd.read_csv(test_data_path)


# Loading product User Matrix

In [5]:
product_user_matrix=sparse.load_npz(product_user_matrix_path).tocsr()

# BM_25 weight of each row

In [8]:
def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of a sparse matrix X  by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)

    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # calculate length_norm per document (product)
    row_sums = np.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X

# Calculating User and product factors

In [9]:
#Calculating the product and user factors
product_factors, _, user_factors = linalg.svds(bm25_weight(product_user_matrix), 50)

In [10]:
# saving the user and product factors
np.save(product_factors_svd_path, product_factors)
np.save(user_factors_svd_path, user_factors)
           

In [11]:
# Helper Functions

def sparsity(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)

In [12]:
# Checking the sparsity
sparsity(product_user_matrix)

99.8700882953749

In [13]:
# To find the top related items
class TopRelated(object):
    def __init__(self, product_factors):
        # fully normalize artist_factors, so can compare with only the dot product
        norms = np.linalg.norm(product_factors, axis=-1)
        self.factors = product_factors / norms[:, np.newaxis]

    def get_related(self, product_id, N=10):
        scores = self.factors.dot(self.factors[product_id])
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

In [14]:
# To find the top recommended items
class TopRecommended(object):
    def __init__(self, product_factors,user_factors):
        self.product_factors =product_factors
        self.user_factors =user_factors

    def get_recommended(self, user_id, N=10):
        scores =  self.user_factors.T[user_id].dot(self.product_factors.T)
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])
    
    def recommend_new(self, userid, N=10):        
        user = self.user_factors.T[userid]
#         calculate the top N items, removing the users own liked items from the results
        liked = product_user_matrix.T[userid].indices
        scores =  user.dot(self.product_factors.T)
        count = N + len(liked)
        if count < len(scores):
            ids = np.argpartition(scores, -count)[-count:]
            best = sorted(zip(ids, scores[ids]), key=lambda x: -x[1])
        else:
            best = sorted(enumerate(scores), key=lambda x: -x[1])
        return list(itertools.islice((rec for rec in best if rec[0] not in liked), N))    

In [15]:
# Initializing class which returns top recommended items for a user_id
tp_recm=TopRecommended(product_factors,user_factors)

In [16]:
# calculate top new recommended Item for a user
tp_recm.recommend_new(2)

[(17536, 4.8472453265396875e-05),
 (23507, 3.7501003169646856e-05),
 (18421, 3.3659225008292046e-05),
 (5754, 3.2425897178474256e-05),
 (19314, 3.2371654889907893e-05),
 (19111, 3.215674286486141e-05),
 (46219, 2.7867475059536311e-05),
 (1711, 2.7027152450202514e-05),
 (46368, 2.6843099919577224e-05),
 (30652, 2.6331188399805843e-05)]

In [17]:
# calculate top recommended Item for a user
tp_recm.get_recommended(2)

[(17536, 4.8472453265396875e-05),
 (23507, 3.7501003169646856e-05),
 (18421, 3.3659225008292046e-05),
 (5754, 3.2425897178474256e-05),
 (19314, 3.2371654889907893e-05),
 (19111, 3.215674286486141e-05),
 (46219, 2.7867475059536311e-05),
 (1711, 2.7027152450202514e-05),
 (46368, 2.6843099919577224e-05),
 (30652, 2.6331188399805843e-05)]

# Evaluation

In [18]:
#Helper Functions
def get_k_popular(k, df_order_products_prior):
    popular_products = list(df_order_products_prior["product_id"].value_counts().head(k).index)
    return popular_products

In [19]:
popular_products = get_k_popular(10, df_order_products_prior)

In [22]:
# Iterrows approach

start = time.time()
print("Evaluating model ...")
    
def recall_score(actual, pred):
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(actual)


user_product_matrix = product_user_matrix.T 
model_recalls = []
popular_recalls = []

for index, row in df_user_products_test.head(1000).iterrows():
    actual = row["products"][1:-1]
    actual = [int(p.strip()) for p in actual.strip().split(",")]
    recommended = tp_recm.recommend_new(row["user_id"], N=10)
    recommended = [r[0] for r in recommended]
    model_recalls.append(recall_score(actual, recommended))
    popular_recalls.append(recall_score(actual, popular_products))

    
model_mean_recall = np.mean(model_recalls)
popular_mean_recall = np.mean(popular_recalls)

print("Completed in {:.2f}s".format(time.time() - start))

print("Model: {}".format(model_mean_recall))
print("Popular: {}".format(popular_mean_recall))


Evaluating model ...
Completed in 70.76s
Model: 7.142857142857142e-05
Popular: 0.06694641505105668
