In [1]:
import os
import json
import pickle
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
from scipy.sparse import csr_matrix as sparse_matrix
from scipy.sparse import find
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics.pairwise import cosine_similarity

# Load the functions
import sys
sys.path.insert(1, '../src')
from preprocessing_eda import *

In [3]:
# Load the cleaned training ratings; index_col=0 uses the first CSV column as the row index.
df_train = pd.read_csv("../data/clean/df_train.csv", index_col=0)

In [4]:
# Preview the first rows of the training ratings.
df_train.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
691810,A2O5EXPH8UKUS3,B001FSK8PS,5.0,1372204800
515687,A3I64V4AR136KY,B00121UVU0,1.0,1352246400
366115,A2N56YII67PSHS,B000P20RQU,5.0,1359072000
571174,A3RV5ZUA8W67FK,B0016P4P4K,5.0,1350432000
1664068,A2XRZV63X79YSJ,B008EKY8VK,4.0,1378252800


## Recommend the items most similar to the user's highest-rated item

In [5]:
def recommend_for_user(data, user_id, N=5, user_key='UserId', item_key='ProductId'):
    """
    Use the cosine similarity between items to make recommendations for a given user.

    The user's highest-rated item (the first one in case of a tie) is used as
    the anchor. The items with non-zero similarity to the anchor are ranked
    from most to least similar, and the first N that the user has not rated
    yet are returned.

    Parameters
    ----------
    data: pandas.DataFrame
        The original dataframe that stores the users' ratings information.
        Besides `user_key` and `item_key` it must contain a "Rating" column.
    user_id: str
        The ID of the user to make recommendations
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id

    Return
    ------
    list of strings
        The list of IDs of the recommended items, most similar first.
        It may hold fewer than N items (or be empty) when the anchor item
        does not have enough unrated neighbours.

    Raises
    ------
    KeyError
        If `user_id` is not a key of the user mapper built by `create_X`
        (presumably: the user does not appear in `data[user_key]`).
    """
    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    (X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper,
     user_ind, item_ind) = create_X(data, n=num_products, d=num_users,
                                    user_key=user_key, item_key=item_key)
    # User-by-item rating matrix (rows: users, columns: items).
    Y = sparse_matrix((data["Rating"], (user_ind, item_ind)), shape=(num_users, num_products))
    # Set dense_output to False to get the sparse representation.
    # The similarity matrix is indexed by item index on both axes below.
    similarity_matrix = cosine_similarity(X, dense_output=False)

    # Slice the user's row once instead of once per rated item.
    user_row = Y[user_mapper[user_id]].tocoo()
    has_rating = user_row.data != 0
    rated_items = user_row.col[has_rating]
    rated_values = user_row.data[has_rating]
    if N <= 0 or rated_items.size == 0:
        # Nothing requested, or no rating to anchor the recommendation on.
        return []

    # Anchor on the highest-rated item (np.argmax keeps the first maximum).
    anchor_item = rated_items[np.argmax(rated_values)]

    neighbour_row = similarity_matrix[anchor_item].tocoo()
    is_similar = neighbour_row.data != 0
    candidates = neighbour_row.col[is_similar]
    scores = neighbour_row.data[is_similar]
    # Descending similarity; mergesort is stable, so ties keep their original order.
    ranking = np.argsort(-scores, kind="mergesort")

    # The anchor item itself is part of `already_rated`, so it is filtered out
    # here. Blindly dropping the top-ranked candidate instead would lose a
    # valid recommendation whenever another item ties with the anchor.
    already_rated = set(rated_items.tolist())
    res = []
    for ind in candidates[ranking].tolist():
        if ind in already_rated:
            continue
        res.append(item_inverse_mapper[ind])
        if len(res) >= N:
            break
    return res

In [22]:
# Recommend items for one example user, using the default number of recommendations.
result = recommend_for_user(df_train, user_id='A3RV5ZUA8W67FK')

In [23]:
# Show every recommended product ID as a clickable link to its Amazon page.
for item in result:
    link_markup = '<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + item, item)
    display(HTML(link_markup))

In [25]:
# Show the products the example user has rated, as links, for comparison with the recommendations.
rated_by_user = df_train[df_train['UserId']=='A3RV5ZUA8W67FK']['ProductId']
for item in rated_by_user:
    link_markup = '<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + item, item)
    display(HTML(link_markup))

In [11]:
# Build the matrix and the ID <-> index mappers at notebook level so the
# cells below can look items up by ID or by index.
num_products = len(set(df_train['ProductId']))
num_users = len(set(df_train['UserId']))
(
    X,
    user_mapper,
    item_mapper,
    user_inverse_mapper,
    item_inverse_mapper,
    user_ind,
    item_ind,
) = create_X(
    df_train,
    n=num_products,
    d=num_users,
    user_key='UserId',
    item_key='ProductId',
)

In [26]:
# Print the cosine similarity between the reference item and each recommended item.
for item in result:
    reference_vec = X[item_mapper['B0016P4P4K']]
    candidate_vec = X[item_mapper[item]]
    print(cosine_similarity(reference_vec, candidate_vec))

[[0.10166046]]
[[0.06562154]]
[[0.05750784]]
[[0.05328456]]
[[0.05083023]]


## Use similarity-weighted ratings to recommend

In [27]:
def IB_CF(data, user_id, N=5, user_key='UserId', item_key='ProductId'):
    """
    Item-based collaborative filtering using similarity-weighted ratings.

    Every item j gets the score sum_i rating(user, i) * similarity(i, j),
    where i runs over the items the user has rated. The score is a weighted
    sum; it is not normalised by the sum of the similarities. The items the
    user has not rated yet are ranked by this score.

    Parameters
    ----------
    data: pandas.DataFrame
        The original dataframe that stores the users' ratings information.
        Besides `user_key` and `item_key` it must contain a "Rating" column.
    user_id: str
        The ID of the user to make recommendations
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id

    Return
    ------
    list of int
        The item *indices* (not item IDs) of the recommended items, highest
        score first. Translate them to IDs with the `item_inverse_mapper`
        returned by `create_X` for the same data.

    Raises
    ------
    KeyError
        If `user_id` is not a key of the user mapper built by `create_X`
        (presumably: the user does not appear in `data[user_key]`).
    """
    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    (X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper,
     user_ind, item_ind) = create_X(data, n=num_products, d=num_users,
                                    user_key=user_key, item_key=item_key)
    # User-by-item rating matrix (rows: users, columns: items).
    Y = sparse_matrix((data["Rating"], (user_ind, item_ind)), shape=(num_users, num_products))
    # Set dense_output to False to get the sparse representation.
    similarity_matrix = cosine_similarity(X, dense_output=False)

    user = user_mapper[user_id]
    user_row = Y[user]  # slice the user's row once and reuse it below

    # Use the sparse matrix's own product: np.dot is not aware of scipy
    # sparse matrices and is documented as unreliable for them.
    scores = user_row.dot(similarity_matrix).tocoo()
    has_score = scores.data != 0

    _, rated_items = user_row.nonzero()
    already_rated = set(rated_items.tolist())

    res = {}
    for ind, score in zip(scores.col[has_score].tolist(), scores.data[has_score].tolist()):
        if ind not in already_rated:
            res[ind] = score

    result = sorted(res, key=res.get, reverse=True)[:N]
    return result

In [28]:
# Run the similarity-weighted recommender for the example user; the result holds item indices.
result1 = IB_CF(df_train, user_id='A3RV5ZUA8W67FK')

In [29]:
# Map each recommended item index back to its product ID and show it as an Amazon link.
for item_ind in result1:
    link_markup = '<a href="%s">%s</a>' % (
        'https://www.amazon.com/dp/' + item_inverse_mapper[item_ind],
        item_inverse_mapper[item_ind],
    )
    display(HTML(link_markup))

In [30]:
# Inspect the raw output of IB_CF: these are item indices, not product IDs.
result1

[7826, 80749, 168527, 173611, 133298]

In [32]:
# Print the cosine similarity between the reference item and each item index returned by IB_CF.
for item_ind in result1:
    reference_vec = X[item_mapper['B0016P4P4K']]
    candidate_vec = X[item_ind]
    print(cosine_similarity(reference_vec, candidate_vec))

[[0.10166046]]
[[0.06562154]]
[[0.05750784]]
[[0.05328456]]
[[0.05083023]]


## Sanity check

In [33]:
# Pairwise cosine similarity between the rows of X, kept sparse (dense_output=False).
similarity_matrix = cosine_similarity(X, dense_output=False)

In [38]:
# Column indices of the items that have a non-zero similarity to the reference item.
# The second array of nonzero() is indexed directly instead of unpacking into `_`:
# assigning `_` at notebook level shadows IPython's "last output" variable.
temp_arr = similarity_matrix[item_mapper['B0016P4P4K']].nonzero()[1]

In [39]:
# Similarity of the reference item to each of its neighbours, keyed by item index.
# The reference row is sliced once here rather than once per neighbour.
reference_row = similarity_matrix[item_mapper['B0016P4P4K']]
my_dict = {ind: reference_row[0, ind] for ind in temp_arr}
# Neighbour indices ordered from most to least similar.
sorted_ind = sorted(my_dict, key=my_dict.get, reverse=True)

In [41]:
# Five most similar neighbours; position 0 is skipped (presumably the reference item itself — confirm).
sorted_ind[1:6]

[7826, 80749, 168527, 173611, 133298]