In [1]:
import os
import json
import pickle
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
from IPython.core.display import display, HTML

from datetime import datetime

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Load the functions
import sys
sys.path.insert(1, '../src')
from preprocessing_eda import *

In [2]:
# Load the train split from its cached pickle when it exists; otherwise sort the
# in-memory frame by "Timestamp", renumber its index and write the cache.
# NOTE(review): on a fresh kernel with no cached pickle, `df_train` must already
# be in scope. Nothing visible in this notebook defines it -- presumably it
# comes from the `preprocessing_eda` star import; confirm.
if os.path.isfile("../data/clean/TrainData.pkl"):
    df_train = pd.read_pickle("../data/clean/TrainData.pkl")
else:
    df_train.sort_values(by = "Timestamp", inplace = True)
    df_train.reset_index(drop = True, inplace = True)
    df_train.to_pickle("../data/clean/TrainData.pkl")

# Same caching scheme for the test split (same caveat about `df_test`).
if os.path.isfile("../data/clean/TestData.pkl"):
    df_test = pd.read_pickle("../data/clean/TestData.pkl")
else:
    df_test.sort_values(by = "Timestamp", inplace = True)
    df_test.reset_index(drop = True, inplace = True)
    df_test.to_pickle("../data/clean/TestData.pkl")

In [3]:
# Preview the first rows of the training ratings.
df_train.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A27ZIX4Y4A8M65,B000005J9Q,4.0,1998-10-18 17:00:00
1,AXUC3MVPBT57K,B000005Z5L,5.0,1998-11-28 16:00:00
2,A18XELRYWMR80B,B00000IAI4,5.0,1999-03-15 17:00:00
3,A22S1QH6GDUE1V,B000026BTH,5.0,1999-06-02 17:00:00
4,AUAZWQ8DULN43,B0000014DT,4.0,1999-09-13 17:00:00


## Recommend the items most similar to the user's highest-rated item

In [4]:
def recommend_for_user(data, user_id, N=5, user_key='UserId', item_key='ProductId'):
    """
    Recommend the items most similar to the user's highest-rated item.

    The user's highest-rated item (the first one on ties) is taken as the
    seed. Items are ranked by their cosine similarity to the seed, and the
    top ones the user has not rated yet are returned.

    Parameters
    ----------
    data: pandas.DataFrame
        The original dataframe that stores the users' ratings information
    user_id: str
        The ID of the user to make recommendations for
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id

    Return
    ------
    list of strings
        The IDs of the recommended items, most similar first. The list may
        hold fewer than N entries, and is empty when N <= 0 or when the user
        has no non-zero rating.

    Raises
    ------
    KeyError
        If the lookup of ``user_id`` in the user mapper fails.
    """
    if N <= 0:
        return []

    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    # X is used below with items on the rows and users on the columns
    # (its transpose is indexed by user).
    X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, _user_idx, _item_idx = create_X(
        data, n=num_products, d=num_users, user_key=user_key, item_key=item_key)
    Y = X.T

    user_ind = user_mapper[user_id]
    # Slice the user's row once; the original re-sliced it on every lookup.
    user_ratings = Y[user_ind]
    _, rated_items = user_ratings.nonzero()
    if len(rated_items) == 0:
        # Nothing to seed the recommendation with.
        return []

    rated_values = [user_ratings[0, j] for j in rated_items]
    seed_item = rated_items[np.argmax(rated_values)]

    # Only the seed item's similarities are read, so compute that single row
    # instead of the full item-item matrix.
    # Set dense_output to False to get the sparse representation
    seed_similarity = cosine_similarity(X[seed_item], X, dense_output=False)
    _, neighbour_items = seed_similarity.nonzero()

    similar_dict = {j: seed_similarity[0, j] for j in neighbour_items}
    recom_list = sorted(similar_dict, key=similar_dict.get, reverse=True)

    res = []
    for ind in recom_list:
        # Skip the seed explicitly. Dropping recom_list[0] is not safe: with
        # tied similarities another item can sort first and would be lost.
        if ind == seed_item:
            continue
        if user_ratings[0, ind] == 0:
            res.append(item_inverse_mapper[ind])
            if len(res) >= N:
                break
    return res

In [5]:
# Product IDs recommended for one hard-coded example user.
result = recommend_for_user(df_train, user_id='A3RV5ZUA8W67FK')

In [6]:
# Render each recommended product ID as a clickable Amazon product link.
for item in result:
    display(HTML('<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + item, 
                                      item)))

In [7]:
# For comparison: links to the products the same user has rated in the training data.
for item in df_train[df_train['UserId']=='A3RV5ZUA8W67FK']['ProductId']:
    display(HTML('<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + item, 
                                      item)))

In [23]:
# Build X and the ID <-> index mappers at notebook scope; later cells index X
# and use item_mapper / item_inverse_mapper directly.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# neighbours; confirm the notebook still works under Restart & Run All.
num_users = len(set(df_train['UserId']))
num_products = len(set(df_train['ProductId']))
X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind = create_X(df_train, n=num_products, d=num_users, user_key='UserId', item_key='ProductId')

In [9]:
# Cosine similarity between a fixed reference item and each recommended item.
# NOTE(review): 'B0016P4P4K' is hard-coded -- presumably the example user's
# highest-rated item; confirm.
for item in result:
    print(cosine_similarity(X[item_mapper['B0016P4P4K']], X[item_mapper[item]]))

[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]


## Recommend using similarity-weighted rating scores

In [10]:
def IB_CF(data, user_id, N=5, user_key='UserId', item_key='ProductId', rating_key='Rating'):
    """
    Item-based collaborative filtering with similarity-weighted ratings.

    Every item is scored by multiplying the user's rating row with the
    item-item cosine similarity matrix, i.e. the sum over the user's rated
    items of (rating * similarity to the candidate item). The unrated items
    with the highest non-zero scores are returned.

    Parameters
    ----------
    data: pandas.DataFrame
        The original dataframe that stores the users' ratings information
    user_id: str
        The ID of the user to make recommendations for
    N: int (default=5)
        The maximum number of recommendations
    user_key: string
        The column in ratings that contains the users id
    item_key: string
        The column in ratings that contains the items id
    rating_key: string (default='Rating')
        The column in ratings that contains the rating values

    Return
    ------
    list of int
        Item *indices* (not product IDs) of the recommended items, best score
        first. Translate them with the ``item_inverse_mapper`` returned by
        ``create_X`` to obtain product IDs.

    Raises
    ------
    KeyError
        If the lookup of ``user_id`` in the user mapper fails.
    """
    num_users = len(set(data[user_key]))
    num_products = len(set(data[item_key]))
    X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind = create_X(
        data, n=num_products, d=num_users, user_key=user_key, item_key=item_key)
    # Use the explicitly imported scipy.sparse constructor; the original relied
    # on a `sparse_matrix` name that is not defined in this notebook
    # (presumably leaked in through the star import).
    Y = sparse.csr_matrix((data[rating_key], (user_ind, item_ind)), shape=(num_users, num_products))
    # Set dense_output to False to get the sparse representation
    similarity_matrix = cosine_similarity(X, dense_output=False)

    user = user_mapper[user_id]
    # Slice the user's row once and reuse it for both the product and the filter.
    user_ratings = Y[user]
    # Sparse matrices must be multiplied with their own .dot();
    # np.dot is not aware of scipy sparse types.
    scores = user_ratings.dot(similarity_matrix)
    _, candidates = scores.nonzero()

    # Keep only the items the user has not rated yet.
    res = {ind: scores[0, ind] for ind in candidates if user_ratings[0, ind] == 0}

    return sorted(res, key=res.get, reverse=True)[:N]

In [11]:
# Item indices recommended by IB_CF for the same example user.
result1 = IB_CF(df_train, user_id='A3RV5ZUA8W67FK')

In [12]:
# IB_CF returns item indices, so map them back to product IDs with the
# notebook-level item_inverse_mapper before building the links.
for item_ind in result1:
    display(HTML('<a href="%s">%s</a>' % ('https://www.amazon.com/dp/' + item_inverse_mapper[item_ind], 
                                      item_inverse_mapper[item_ind])))

In [13]:
# Raw item indices returned by IB_CF.
result1

[121081, 123545, 138436, 8723, 194851]

In [14]:
# Cosine similarity between the hard-coded reference item and each IB_CF recommendation.
for item_ind in result1:
    print(cosine_similarity(X[item_mapper['B0016P4P4K']], X[item_ind]))

[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]
[[0.0838591]]


## Sanity check

In [15]:
# Full similarity matrix between the rows of X, kept sparse (dense_output=False).
similarity_matrix = cosine_similarity(X, dense_output=False)

In [16]:
# Column indices with non-zero similarity to the reference item.
_, temp_arr = similarity_matrix[item_mapper['B0016P4P4K']].nonzero()

In [17]:
# Slice the reference item's similarity row once; the original re-sliced the
# sparse matrix on every loop iteration.
seed_row = similarity_matrix[item_mapper['B0016P4P4K']]
# Map each neighbouring item index to its similarity with the reference item.
my_dict = {ind: seed_row[0, ind] for ind in temp_arr}
# Item indices ordered from most to least similar.
sorted_ind = sorted(my_dict, key=my_dict.get, reverse=True)

In [18]:
# Positions 1..5 of the ranking; position 0 is skipped, presumably because it
# is the reference item itself -- confirm.
sorted_ind[1:6]

[31941, 194851, 8723, 138436, 123545]

In [20]:
# Item indices of the product IDs returned by recommend_for_user, for
# comparison with the ranking above.
for i in result:
    print(item_mapper[i])

31941
194851
8723
138436
123545


### Other approaches

#### Ratings given to the same item by the top 5 similar users

In [4]:
# For the training set: build the matrix with create_X, transpose it and cache
# it on disk, or load the cached copy when it already exists.
# NOTE(review): the mappers are only defined when the matrix is rebuilt here,
# not when it is loaded from the cache.
if not os.path.isfile("../data/clean/train_sparse_UI.npz"):
    num_users_train = len(set(df_train['UserId']))
    num_products_train = len(set(df_train['ProductId']))
    (train_sparse_IU, user_mapper, item_mapper, user_inverse_mapper,
     item_inverse_mapper, user_ind, item_ind) = create_X(
        df_train, n=num_products_train, d=num_users_train,
        user_key='UserId', item_key='ProductId')
    train_sparse_UI = train_sparse_IU.T
    sparse.save_npz("../data/clean/train_sparse_UI.npz", train_sparse_UI)
else:
    train_sparse_UI = sparse.load_npz("../data/clean/train_sparse_UI.npz")

In [5]:
def get_sample_sparse_matrix(sparseMatrix, n_users, n_items, save_path=None, seed=15):
    """
    Sample a random subset of users and items from a sparse rating matrix.

    Users (rows) and items (columns) are drawn without replacement from those
    that have at least one stored non-zero entry. Only entries whose row AND
    column were both sampled are kept. The sampled matrix keeps the original
    row/column indices and is written to disk as an ``.npz`` file.

    Parameters
    ----------
    sparseMatrix: scipy sparse matrix
        The matrix to sample from (rows are treated as users, columns as items)
    n_users: int
        Number of users to sample; capped at the number of users available
    n_items: int
        Number of items to sample; capped at the number of items available
    save_path: str (default=None)
        Where to save the sampled matrix. When None, the notebook-level
        variable ``path`` is used, as the original implementation did.
    seed: int (default=15)
        Seed of the private random generator, so the sample is reproducible

    Return
    ------
    scipy.sparse.csr_matrix
        The sampled matrix, of shape (max sampled user index + 1,
        max sampled item index + 1).
    """
    start = datetime.now()
    if save_path is None:
        # Backward compatibility: existing callers set a global `path` first.
        save_path = path

    users, items, ratings = sparse.find(sparseMatrix)
    uniq_users = np.unique(users)
    uniq_items = np.unique(items)

    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, without reseeding the global generator.
    rng = np.random.RandomState(seed)
    # Cap the sample sizes: choice(..., replace=False) raises when asked for
    # more elements than are available.
    userS = rng.choice(uniq_users, min(n_users, len(uniq_users)), replace = False)
    itemS = rng.choice(uniq_items, min(n_items, len(uniq_items)), replace = False)

    mask = np.logical_and(np.isin(users, userS), np.isin(items, itemS))
    sparse_sample = sparse.csr_matrix((ratings[mask], (users[mask], items[mask])),
                                      shape = (userS.max() + 1, itemS.max() + 1))
    print("Sparse Matrix creation done. Saving it for later use.")
    sparse.save_npz(save_path, sparse_sample)
    print("Done")
    print("Shape of Sparse Sampled Matrix = "+str(sparse_sample.shape))

    print(datetime.now() - start)
    return sparse_sample

In [6]:
# Load the sampled training matrix from disk, or create it when it is missing.
# `path` stays a notebook-level variable because get_sample_sparse_matrix reads it.
path = "../data/clean/TrainUISparseData_Sample.npz"
if os.path.isfile(path):
    print("File is already present in the disk. Loading the file...")
    train_sample_sparse = sparse.load_npz(path)
    print("File loading done.")
    print("Shape of Train Sample Sparse Matrix = "+str(train_sample_sparse.shape))
else:
    print("Sample sparse matrix is not present in the disk. We are creating it...")
    train_sample_sparse = get_sample_sparse_matrix(train_sparse_UI, 100000, 10000)

File is already present in the disk. Loading the file...
File loading done.
Shape of Train Sample Sparse Matrix = (1025570, 226768)


In [7]:
# Row indices, column indices and values of the non-zero entries of the sampled matrix.
sample_train_users, sample_train_items, sample_train_ratings = sparse.find(train_sample_sparse)

In [8]:
# Number of non-zero ratings in the training sample.
len(sample_train_ratings)

7463

In [9]:
def getAverageRatings(sparseMatrix, if_user):
    """
    Compute the mean of the non-zero entries for every row or every column.

    Parameters
    ----------
    sparseMatrix: scipy sparse matrix
        The rating matrix; zero entries are treated as "no rating"
    if_user: bool
        True to average along each row, False to average along each column

    Return
    ------
    dict
        Maps the row (or column) index to its average non-zero value. Rows or
        columns without any non-zero entry are left out.
    """
    # Summing over axis 1 collapses the columns and gives one value per row;
    # axis 0 gives one value per column.
    axis = 1 if if_user else 0
    totals = sparseMatrix.sum(axis = axis).A1
    counts = (sparseMatrix != 0).sum(axis = axis).A1
    n_rows, n_cols = sparseMatrix.shape
    size = n_rows if if_user else n_cols

    averages = {}
    for idx in range(size):
        if counts[idx] == 0:
            # No ratings: leave the index out rather than dividing by zero.
            continue
        averages[idx] = totals[idx] / counts[idx]
    return averages

In [10]:
# Per-row (user) and per-column (item) average ratings over the full training matrix.
average_user_rating = getAverageRatings(train_sparse_UI, True)
average_item_rating = getAverageRatings(train_sparse_UI, False)

In [11]:
# Build the regression feature file: one CSV row per (user, item, rating) of the
# training sample. Columns, in order: user index, item index, global average,
# 5 ratings of the item by similar users, 5 ratings by the user of similar
# items, user average, item average, actual rating.
if os.path.isfile("../data/clean/Train_Regression.csv"):
    print("File is already present in your disk. You do not have to prepare it again.")
else:
    startTime = datetime.now()
    print("Preparing Train csv file for {} rows".format(len(sample_train_ratings)))
    train_sample_sparse_T = train_sample_sparse.T
    # The global average is the same for every row, so compute it once
    # instead of once per rating.
    global_average_rating = train_sample_sparse.sum()/train_sample_sparse.count_nonzero()
    # Write to a temporary file and rename it at the end, so an interrupted run
    # does not leave a partial CSV that the check above would treat as complete.
    with open("../data/clean/Train_Regression.csv.tmp", mode = "w") as data:
        count = 0
        for user, item, rating in zip(sample_train_users, sample_train_items, sample_train_ratings):
            row = list()
            row.append(user)  #appending user index
            row.append(item) #appending item index
            row.append(global_average_rating) #appending global average rating

            #----------------------------------Ratings given to "item" by top 5 similar users with "user"--------------------#
            similar_users = cosine_similarity(train_sample_sparse[user], train_sample_sparse).ravel()
            similar_users_indices = np.argsort(-similar_users)
            similar_users_indices = similar_users_indices[similar_users_indices != user]
            similar_users_ratings = train_sample_sparse[similar_users_indices, item].toarray().ravel()
            top_similar_user_ratings = list(similar_users_ratings[similar_users_ratings != 0][:5])
            top_similar_user_ratings.extend([average_item_rating[item]]*(5-len(top_similar_user_ratings)))
            #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "item" average
            #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "item" average rating.
            row.extend(top_similar_user_ratings)

            #----------------------------------Ratings given by "user" to top 5 similar items with "items"------------------#
            similar_items = cosine_similarity(train_sample_sparse_T[item], train_sample_sparse_T).ravel()
            similar_items_indices = np.argsort(-similar_items)
            similar_items_indices = similar_items_indices[similar_items_indices != item]
            similar_items_ratings = train_sample_sparse[user, similar_items_indices].toarray().ravel()
            top_similar_item_ratings = list(similar_items_ratings[similar_items_ratings != 0][:5])
            top_similar_item_ratings.extend([average_user_rating[user]]*(5-len(top_similar_item_ratings)))
            #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "user" average
            #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "user" average rating.
            row.extend(top_similar_item_ratings)

            #----------------------------------Appending "user" average, "item" average & rating of "user""item"-----------#
            row.append(average_user_rating[user])
            row.append(average_item_rating[item])
            row.append(rating)

            #-----------------------------------Converting rows and appending them as comma separated values to csv file------#
            data.write(",".join(map(str, row)))
            data.write("\n")

            count += 1
            if count % 2000 == 0:
                print("Done for {}. Time elapsed: {}".format(count, (datetime.now() - startTime)))

    # Publish the finished file under its final name.
    os.replace("../data/clean/Train_Regression.csv.tmp", "../data/clean/Train_Regression.csv")
    print("Total Time for {} rows = {}".format(len(sample_train_ratings), (datetime.now() - startTime)))

File is already present in your disk. You do not have to prepare it again.


In [12]:
# The CSV has no header row, so column names are supplied here.
# SUR1..SUR5: ratings of the item by similar users; SIR1..SIR5: ratings by the user of similar items.
Train_Reg = pd.read_csv("../data/clean/Train_Regression.csv", names = ["User_ind", "Item_ind", "Global_Average", "SUR1", "SUR2", "SUR3", "SUR4", "SUR5", "SIR1", "SIR2", "SIR3", "SIR4", "SIR5", "User_Average", "Item_Average", "Rating"])
Train_Reg.head()

Unnamed: 0,User_ind,Item_ind,Global_Average,SUR1,SUR2,SUR3,SUR4,SUR5,SIR1,SIR2,SIR3,SIR4,SIR5,User_Average,Item_Average,Rating
0,500635,17,4.144044,1.0,3.428571,3.428571,3.428571,3.428571,5.0,4.666667,4.666667,4.666667,4.666667,4.666667,3.428571,5.0
1,993665,17,4.144044,5.0,3.428571,3.428571,3.428571,3.428571,1.0,1.0,1.0,1.0,1.0,1.0,3.428571,1.0
2,83620,83,4.144044,3.8,3.8,3.8,3.8,3.8,4.0,4.0,4.0,4.0,4.0,4.0,3.8,3.0
3,648587,177,4.144044,5.0,5.0,4.565217,4.565217,4.565217,2.0,2.0,2.0,2.0,2.0,2.0,4.565217,2.0
4,715917,177,4.144044,2.0,5.0,4.565217,4.565217,4.565217,4.666667,4.666667,4.666667,4.666667,4.666667,4.666667,4.565217,5.0


In [13]:
# Total number of missing values across all columns of the feature table.
print("Number of nan Values: "+str(Train_Reg.isnull().sum().sum()))

Number of nan Values: 0


In [14]:
# (rows, columns) of the feature table.
print("Shape of Train DataFrame: {}".format(Train_Reg.shape))

Shape of Train DataFrame: (7463, 16)


In [15]:
# For the test set: build the matrix with create_X, transpose it and cache it
# on disk, or load the cached copy when it already exists.
# NOTE(review): the mappers are built from df_test alone, so test indices are
# presumably not aligned with the training indices -- confirm before mixing them.
if not os.path.isfile("../data/clean/test_sparse_UI.npz"):
    num_users_test = len(set(df_test['UserId']))
    num_products_test = len(set(df_test['ProductId']))
    (test_sparse_IU, user_mapper_test, item_mapper_test, user_inverse_mapper_test,
     item_inverse_mapper_test, user_ind_test, item_ind_test) = create_X(
        df_test, n=num_products_test, d=num_users_test,
        user_key='UserId', item_key='ProductId')
    test_sparse_UI = test_sparse_IU.T
    sparse.save_npz("../data/clean/test_sparse_UI.npz", test_sparse_UI)
else:
    test_sparse_UI = sparse.load_npz("../data/clean/test_sparse_UI.npz")

In [16]:
# Load the sampled test matrix from disk, or create it when it is missing.
# `path` is rebound here because get_sample_sparse_matrix reads this
# notebook-level variable when saving.
path = "../data/clean/TestUISparseData_Sample.npz"
if os.path.isfile(path):
    print("File is already present in the disk. Loading the file...")
    test_sample_sparse = sparse.load_npz(path)
    print("File loading done.")
    print("Shape of Test Sample Sparse Matrix = "+str(test_sample_sparse.shape))
else:
    print("Sample sparse matrix is not present in the disk. We are creating it...")
    test_sample_sparse = get_sample_sparse_matrix(test_sparse_UI, 50000, 5000)

File is already present in the disk. Loading the file...
File loading done.
Shape of Test Sample Sparse Matrix = (331226, 113835)


In [17]:
# Row indices, column indices and values of the non-zero entries of the sampled test matrix.
sample_test_users, sample_test_items, sample_test_ratings = sparse.find(test_sample_sparse)

In [18]:
# Number of non-zero ratings in the test sample.
len(sample_test_ratings)

2677