In [33]:
import os
import json
import pickle
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from IPython.core.display import display, HTML

from datetime import datetime

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Load the functions
import sys
sys.path.insert(1, '../src')
from preprocessing_eda import *

In [2]:
# Load the cleaned train/test ratings from their pickle caches when present;
# otherwise sort the in-memory frame by Timestamp, renumber it, and write the cache.
# NOTE(review): the cache-miss branches read `df_train` / `df_test` before this
# notebook assigns them, so they must already exist in the kernel — confirm
# where they come from before relying on a fresh "Restart & Run All".
if os.path.isfile("../data/clean/TrainData.pkl"):
    df_train = pd.read_pickle("../data/clean/TrainData.pkl")
else:
    df_train.sort_values(by="Timestamp", inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_train.to_pickle("../data/clean/TrainData.pkl")

if os.path.isfile("../data/clean/TestData.pkl"):
    df_test = pd.read_pickle("../data/clean/TestData.pkl")
else:
    df_test.sort_values(by="Timestamp", inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_test.to_pickle("../data/clean/TestData.pkl")

In [3]:
# Preview the first rows of the training ratings (UserId, ProductId, Rating, Timestamp).
df_train.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A27ZIX4Y4A8M65,B000005J9Q,4.0,1998-10-18 17:00:00
1,AXUC3MVPBT57K,B000005Z5L,5.0,1998-11-28 16:00:00
2,A18XELRYWMR80B,B00000IAI4,5.0,1999-03-15 17:00:00
3,A22S1QH6GDUE1V,B000026BTH,5.0,1999-06-02 17:00:00
4,AUAZWQ8DULN43,B0000014DT,4.0,1999-09-13 17:00:00


## Recommend the items most similar to the user's highest-rated item

In [4]:
def recommend_for_user(data, user_id, N=5, user_key='UserId', item_key='ProductId'):
    """
    Recommend the items most similar to the user's highest-rated item.

    The item-item cosine similarity is computed from the matrix returned by
    ``create_X``. The user's highest-rated item (the first one, if several
    share the top rating) is used as the anchor; the items with a non-zero
    similarity to the anchor that the user has not rated yet are returned,
    most similar first.

    Parameters
    ----------
    data: pandas.DataFrame
        The original dataframe that stores the users' ratings information
    user_id: str
        The ID of the user to make recommendations
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id

    Return
    ------
    list of strings
        The list of IDs of the recommended items. It may hold fewer than N
        items when not enough unrated neighbours exist, and it is empty when
        N <= 0 or the user has no non-zero rating.

    Raises
    ------
    KeyError
        Propagated from the ``user_mapper[user_id]`` lookup when the user is
        unknown (assuming the mapper returned by ``create_X`` is a dict).
    """
    if N <= 0:
        return []

    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    X, user_mapper, _, _, item_inverse_mapper, _, _ = create_X(
        data, n=num_products, d=num_users, user_key=user_key, item_key=item_key
    )
    # X is indexed by item along its rows here, so its transpose has one row per user.
    Y = X.T
    # Set dense_output to False to get the sparse representation
    similarity_matrix = cosine_similarity(X, dense_output=False)

    # Slice the user's row once instead of re-slicing it on every lookup.
    user_row = Y[user_mapper[user_id]]
    _, rated_cols = user_row.nonzero()
    if len(rated_cols) == 0:
        # Nothing rated, so there is no anchor item to start from.
        return []

    rated_values = [user_row[0, col] for col in rated_cols]
    anchor = rated_cols[np.argmax(rated_values)]

    anchor_sims = similarity_matrix[anchor]
    _, neighbour_cols = anchor_sims.nonzero()
    similarity_by_item = {col: anchor_sims[0, col] for col in neighbour_cols}
    ranked = sorted(similarity_by_item, key=similarity_by_item.get, reverse=True)

    # The anchor itself is among the rated items, so this filter already drops
    # it. Blindly skipping ranked[0] instead would discard a genuine candidate
    # whenever ties or floating-point round-off put another item first.
    already_rated = set(rated_cols.tolist())
    res = []
    for col in ranked:
        if int(col) in already_rated:
            continue
        res.append(item_inverse_mapper[col])
        if len(res) >= N:
            break
    return res

In [5]:
# Recommendations (product IDs) for one example user, using the default N=5.
result = recommend_for_user(df_train, user_id='A3RV5ZUA8W67FK')

In [6]:
# Render every recommended product ID as a clickable Amazon link.
for item in result:
    product_url = 'https://www.amazon.com/dp/' + item
    display(HTML('<a href="%s">%s</a>' % (product_url, item)))

In [7]:
# For comparison, link every product the example user rated in the training data.
rated_products = df_train.loc[df_train['UserId'] == 'A3RV5ZUA8W67FK', 'ProductId']
for item in rated_products:
    product_url = 'https://www.amazon.com/dp/' + item
    display(HTML('<a href="%s">%s</a>' % (product_url, item)))

In [23]:
# Build the rating matrix and the ID <-> index mappers at notebook level so the
# cells below can inspect individual rows.
num_users = len(set(df_train['UserId']))
num_products = len(set(df_train['ProductId']))
(
    X,
    user_mapper,
    item_mapper,
    user_inverse_mapper,
    item_inverse_mapper,
    user_ind,
    item_ind,
) = create_X(
    df_train,
    n=num_products,
    d=num_users,
    user_key='UserId',
    item_key='ProductId'
)

In [9]:
# Cosine similarity between the reference item B0016P4P4K and each recommended item.
reference_row = X[item_mapper['B0016P4P4K']]
for item in result:
    candidate_row = X[item_mapper[item]]
    print(cosine_similarity(reference_row, candidate_row))

[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]


## Recommend using similarity-weighted ratings

In [10]:
def IB_CF(data, user_id, N=5, user_key='UserId', item_key='ProductId'):
    """
    Item-based collaborative filtering using similarity-weighted ratings.

    Each item's score is the sum, over the items the user has rated, of the
    user's rating multiplied by the cosine similarity between the two items.
    The score is a weighted sum; it is not normalised into an average. Items
    the user has already rated and items with a zero score are excluded.

    Parameters
    ----------
    data: pandas.DataFrame
        The dataframe that stores the users' ratings; besides ``user_key``
        and ``item_key`` it must contain a "Rating" column
    user_id: str
        The ID of the user to make recommendations
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id

    Return
    ------
    list of int
        Column indices of the recommended items in the user-by-item matrix,
        highest score first. These are indices, not item IDs; callers in this
        notebook translate them with the inverse item mapper from ``create_X``.
    """
    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    X, user_mapper, _, _, _, user_ind, item_ind = create_X(
        data, n=num_products, d=num_users, user_key=user_key, item_key=item_key
    )
    # NOTE(review): `sparse_matrix` is not defined in this notebook; presumably
    # it is supplied by the wildcard import from preprocessing_eda — confirm.
    Y = sparse_matrix((data["Rating"], (user_ind, item_ind)), shape=(num_users, num_products))
    # Set dense_output to False to get the sparse representation
    similarity_matrix = cosine_similarity(X, dense_output=False)

    # Slice the user's row once instead of re-slicing it on every lookup.
    user_row = Y[user_mapper[user_id]]
    # Use the sparse matrix product: np.dot is not aware of scipy sparse matrices.
    scores = user_row.dot(similarity_matrix)
    _, candidate_cols = scores.nonzero()

    res = {}
    for ind in candidate_cols:
        # Keep only the items the user has not rated yet.
        if user_row[0, ind] == 0:
            res[ind] = scores[0, ind]

    return sorted(res, key=res.get, reverse=True)[:N]

In [11]:
# Recommendations for the same example user from IB_CF (returns item indices, default N=5).
result1 = IB_CF(df_train, user_id='A3RV5ZUA8W67FK')

In [12]:
# Translate each recommended item index back to its product ID and link to it.
for item_ind in result1:
    product_id = item_inverse_mapper[item_ind]
    display(HTML('<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + product_id, product_id)))

In [13]:
# Raw IB_CF output: item indices rather than product IDs.
result1

[121081, 123545, 138436, 8723, 194851]

In [14]:
# Cosine similarity between the reference item B0016P4P4K and each IB_CF recommendation.
reference_row = X[item_mapper['B0016P4P4K']]
for item_ind in result1:
    print(cosine_similarity(reference_row, X[item_ind]))

[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]


## Sanity check

In [15]:
# Pairwise cosine similarity between the rows of X, kept sparse (dense_output=False),
# so the rankings above can be re-derived by hand.
similarity_matrix = cosine_similarity(X, dense_output=False)

In [16]:
# Column indices of the non-zero similarities in the row of reference item B0016P4P4K.
# NOTE(review): assigning to `_` shadows IPython's last-output variable in this kernel.
_, temp_arr = similarity_matrix[item_mapper['B0016P4P4K']].nonzero()

In [17]:
# Map each neighbouring item index to its similarity with the reference item,
# then order the indices from most to least similar.
reference_sims = similarity_matrix[item_mapper['B0016P4P4K']]
my_dict = {ind: reference_sims[0, ind] for ind in temp_arr}
sorted_ind = sorted(my_dict, key=my_dict.get, reverse=True)

In [18]:
# Five most similar item indices after position 0
# (position 0 is presumably the reference item itself — not verified here).
sorted_ind[1:6]

[31941, 194851, 8723, 138436, 123545]

In [20]:
# Item indices of the recommend_for_user results, for comparison with sorted_ind[1:6].
for i in result:
    print(item_mapper[i])

31941
194851
8723
138436
123545
