In [27]:
### Imports
import pandas as pd
import numpy as np
import sys
import scipy.sparse as sparse
import scipy.sparse.linalg as linalg
from scipy.sparse import coo_matrix, csr_matrix
from numpy import bincount, log, sqrt
import itertools

In [64]:
# path for data files
base_path="../data/"
user_product_matrix_path=base_path+"user_product_matrix.npz"
product_factors_svd_path=base_path+"product_factors_svd"
user_factors_svd_path=base_path+"user_factors_svd"

In [65]:
product_user_matrix=sparse.load_npz(user_product_matrix_path)

In [32]:
def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of a sparse matrix X  by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)

    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # calculate length_norm per document (product)
    row_sums = np.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X

In [68]:
#Calculating the product and user factors
product_factors, _, user_factors = linalg.svds(bm25_weight(product_user_matrix), 50)

In [67]:
# saving the user and product factors
np.save(product_factors_svd_path, product_factors)
np.save(user_factors_svd_path, user_factors)
           

In [109]:
# Helper Functions

def sparsity(matrix):
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)

In [69]:
# To find the top related items
class TopRelated(object):
    def __init__(self, product_factors):
        # fully normalize artist_factors, so can compare with only the dot product
        norms = np.linalg.norm(product_factors, axis=-1)
        self.factors = product_factors / norms[:, np.newaxis]

    def get_related(self, product_id, N=10):
        scores = self.factors.dot(self.factors[product_id])
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])

In [137]:
# To find the top recommended items
class TopRecommended(object):
    def __init__(self, product_factors,user_factors):
        self.product_factors =product_factors
        self.user_factors =user_factors

    def get_recommended(self, user_id, N=10):
        scores =  self.user_factors.T[user_id].dot(self.product_factors.T)
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])
    
    def recommend_new(self, userid, N=10):        
        user = self.user_factors.T[userid]
        print(user.shape)
        print(self.product_factors.T.shape)
#         calculate the top N items, removing the users own liked items from the results
        liked = product_user_matrix[userid].indices
        print(len(liked))
        scores =  user.dot(self.product_factors.T)
        print(scores.shape)
        count = N + len(liked)
        if count < len(scores):
            ids = np.argpartition(scores, -count)[-count:]
            best = sorted(zip(ids, scores[ids]), key=lambda x: -x[1])
        else:
            best = sorted(enumerate(scores), key=lambda x: -x[1])
        return list(itertools.islice((rec for rec in best if rec[0] not in liked), N))    

In [138]:
# Initializing class which returns top recommended items for a user_id
tp_recm=TopRecommended(product_factors,user_factors)

In [139]:
# calculate top new recommended Item for a user
tp_recm.recommend_new(2)

(50,)
(50, 49677)
1923
(49677,)


NameError: name 'itertools' is not defined

In [92]:
# calculate top recommended Item for a user
tp_recm.get_recommended(2)

[(31792, 4.8472453265396231e-05),
 (35271, 3.7501003169646341e-05),
 (40554, 3.3659225008291585e-05),
 (5205, 3.2425897178474229e-05),
 (26868, 3.237165488990731e-05),
 (3184, 3.2156742864861356e-05),
 (3209, 2.7867475059536274e-05),
 (3780, 2.7027152450202484e-05),
 (15325, 2.6843099919577234e-05),
 (1222, 2.6331188399805826e-05)]

In [None]:
#Analyzing the results