In [1]:
import pandas as pd
import numpy as np
import json

import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
with open("data.json") as file:
    df = json.load(file)

In [3]:
# define all the name of each columns
reviewerID = []
productID = []
liked_and_seen = []
reviewText = []
rating = []
summary = []
unixTime = []
date = []


In [4]:
# assign names to each columns
for i in range(len(df)):
    productID.append(df[i]['asin'])
    reviewerID.append(df[i]['reviewerID'])
    reviewText.append(df[i]['reviewText'])
    rating.append(df[i]['overall'])
    summary.append(df[i]['summary'])
    unixTime.append(df[i]['unixReviewTime'])
    liked_and_seen.append(df[i]['helpful'])
    date.append(df[i]['reviewTime'])

In [5]:
df = pd.DataFrame({"user_id": reviewerID,
                    "product_id": productID,
                    "liked_and_seen": liked_and_seen,
                    "review_text": reviewText,
                    "summary": summary,
                    "timestamp": unixTime,
                    "date": date,
                    "ratings": rating})


In [6]:
df.shape

(33620, 8)

In [7]:
df.head(10)

Unnamed: 0,user_id,product_id,liked_and_seen,review_text,summary,timestamp,date,ratings
0,A30TL5EWN6DFXT,120401325X,"[0, 0]",They look good and stick good! I just don't li...,Looks Good,1400630400,"05 21, 2014",4.0
1,ASY55RVNIL0UD,120401325X,"[0, 0]",These stickers work like the review says they ...,Really great product.,1389657600,"01 14, 2014",5.0
2,A2TMXE2AFO7ONB,120401325X,"[0, 0]",These are awesome and make my phone look so st...,LOVE LOVE LOVE,1403740800,"06 26, 2014",5.0
3,AWJ0WZQYMYFQ4,120401325X,"[4, 4]",Item arrived in great time and was in perfect ...,Cute!,1382313600,"10 21, 2013",4.0
4,ATX7CZYFXI1KW,120401325X,"[2, 3]","awesome! stays on, and looks great. can be use...",leopard home button sticker for iphone 4s,1359849600,"02 3, 2013",5.0
5,APX47D16JOP7H,120401325X,"[1, 2]",These make using the home button easy. My daug...,Cute,1381536000,"10 12, 2013",3.0
6,A1JVVYYO7G56DS,120401325X,"[0, 0]",Came just as described.. It doesn't come unstu...,best thing ever..,1377129600,"08 22, 2013",5.0
7,A6FGO4TBZ3QFZ,3998899561,"[1, 2]",it worked for the first week then it only char...,not a good Idea,1384992000,"11 21, 2013",1.0
8,A2JWEDW5FSVB0F,3998899561,"[2, 3]","Good case, solid build. Protects phone all aro...",Solid Case,1380067200,"09 25, 2013",5.0
9,A8AJS1DW7L3JJ,3998899561,"[1, 1]",This is a fantastic case. Very stylish and pro...,Perfect Case,1396483200,"04 3, 2014",5.0


In [8]:
df.columns

Index(['user_id', 'product_id', 'liked_and_seen', 'review_text', 'summary',
       'timestamp', 'date', 'ratings'],
      dtype='object')

In [9]:
# Handling Missing values
print('Number of missing values across columns: \n', df.isnull().sum())

Number of missing values across columns: 
 user_id           0
product_id        0
liked_and_seen    0
review_text       0
summary           0
timestamp         0
date              0
ratings           0
dtype: int64


In [10]:
amazon_df = df.drop(columns = ["liked_and_seen", "timestamp", "summary", "date"])

In [14]:
amazon_df.head(10)

Unnamed: 0,user_id,product_id,review_text,ratings
0,A30TL5EWN6DFXT,120401325X,They look good and stick good! I just don't li...,4.0
1,ASY55RVNIL0UD,120401325X,These stickers work like the review says they ...,5.0
2,A2TMXE2AFO7ONB,120401325X,These are awesome and make my phone look so st...,5.0
3,AWJ0WZQYMYFQ4,120401325X,Item arrived in great time and was in perfect ...,4.0
4,ATX7CZYFXI1KW,120401325X,"awesome! stays on, and looks great. can be use...",5.0
5,APX47D16JOP7H,120401325X,These make using the home button easy. My daug...,3.0
6,A1JVVYYO7G56DS,120401325X,Came just as described.. It doesn't come unstu...,5.0
7,A6FGO4TBZ3QFZ,3998899561,it worked for the first week then it only char...,1.0
8,A2JWEDW5FSVB0F,3998899561,"Good case, solid build. Protects phone all aro...",5.0
9,A8AJS1DW7L3JJ,3998899561,This is a fantastic case. Very stylish and pro...,5.0


In [15]:
counts = amazon_df.user_id.value_counts()
amazon_df_final = amazon_df[amazon_df.user_id.isin(counts[counts >= 1].index)]

In [16]:
amazon_df_final.drop_duplicates()

Unnamed: 0,user_id,product_id,review_text,ratings
0,A30TL5EWN6DFXT,120401325X,They look good and stick good! I just don't li...,4.0
1,ASY55RVNIL0UD,120401325X,These stickers work like the review says they ...,5.0
2,A2TMXE2AFO7ONB,120401325X,These are awesome and make my phone look so st...,5.0
3,AWJ0WZQYMYFQ4,120401325X,Item arrived in great time and was in perfect ...,4.0
4,ATX7CZYFXI1KW,120401325X,"awesome! stays on, and looks great. can be use...",5.0
...,...,...,...,...
33615,A3B1360JOYS0GJ,B004E9TLVM,"Ok, let me mention the only flaw of this phone...",5.0
33616,A32XQF33A9310C,B004E9TLVM,I occasionally travel internationally for work...,3.0
33617,A3OC93X866Z8TB,B004E9TLVM,"I like the features of this phone, and I like ...",4.0
33618,A122C77O6TYMLP,B004E9TLVM,Aside from the aspects of this device that I w...,3.0


In [17]:
#constructing the pivot table
final_ratings_matrix = amazon_df_final.reset_index().pivot_table(index = "user_id",
                                                                 columns = "product_id",
                                                                 values = "ratings",
                                                                 aggfunc = "mean").fillna(0)

In [18]:
final_ratings_matrix.head(10)

product_id,120401325X,3998899561,6073894996,7532385086,7887421268,8199406933,8288853439,8288855504,8288862993,8288878881,...,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9T01S,B004E9TLVM
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01335502AS0LP3ASV1Z7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01623621DS97QCLQANL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A01852072Z7B68UHLI5UG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A025531519P5C6VNP05FT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0260105268IXGY2H11N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A027168223K80PQU4IP35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05463141Q2NQ2L1TYQGE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05793033A673QTUT56XZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A059899119L7CHCKBYQRD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A06170512LLXAS5ND7UQB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Building the Popular Recommendation System

In [22]:
#Calucating the density of the ratings matrix
num_of_ratings = np.count_nonzero(final_ratings_matrix)
print("num_of_ratings = ", num_of_ratings)
possible_num_of_ratings = final_ratings_matrix.shape[0] * final_ratings_matrix.shape[1]
print("possible_num_of_ratings = ", possible_num_of_ratings)
density = (num_of_ratings/possible_num_of_ratings)
density *= 100
print ("density: {:4.2f}%".format(density))

num_of_ratings =  33620
possible_num_of_ratings =  30932712
density: 0.11%


In [23]:
final_ratings_matrix_T = final_ratings_matrix.transpose()

In [24]:
final_ratings_matrix_T

user_id,A01335502AS0LP3ASV1Z7,A01623621DS97QCLQANL3,A01852072Z7B68UHLI5UG,A025531519P5C6VNP05FT,A0260105268IXGY2H11N2,A027168223K80PQU4IP35,A05463141Q2NQ2L1TYQGE,A05793033A673QTUT56XZ,A059899119L7CHCKBYQRD,A06170512LLXAS5ND7UQB,...,AZV9TQDZU7306,AZVEABJEV1TQN,AZVQ7TVYS5ZVU,AZVVD2TRCFHRL,AZW6WE7UXAMU0,AZWQEM8GKXQ5Y,AZWVLJM3958OD,AZX2RDN9YXZAE,AZXQ9HFT8OQZ9,AZZYW4YOE1B6E
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
120401325X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998899561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6073894996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7532385086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7887421268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B004E75R3K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B004E7EKV0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B004E9SZP0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B004E9T01S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Count of user_id for each unique product as recommendation score 
amazon_df_final_grouped = amazon_df_final.groupby("product_id").agg({"user_id": "count"}).reset_index()
amazon_df_final_grouped.rename(columns = {"user_id": "score"}, inplace=True)
amazon_df_final_grouped.head(10)

Unnamed: 0,product_id,score
0,120401325X,7
1,3998899561,10
2,6073894996,37
3,7532385086,9
4,7887421268,13
5,8199406933,5
6,8288853439,10
7,8288855504,6
8,8288862993,25
9,8288878881,24


In [27]:
#Sort the products on recommendation score 
amazon_df_final_grouped_sort = amazon_df_final_grouped.sort_values(['score', 'product_id'], ascending = [0,1]) 
      
#Generate a recommendation rank based upon score 
amazon_df_final_grouped_sort['rank'] = amazon_df_final_grouped_sort['score'].rank(ascending=0, method='first') 
          
#Get the top 5 recommendations 
popularity_recommendations = amazon_df_final_grouped_sort.head(5) 
popularity_recommendations

Unnamed: 0,product_id,score,rank
1576,B0042FV2SI,694,1.0
249,B000S5Q9CA,628,2.0
342,B0015RB39O,466,3.0
1674,B0044WTQVE,360,4.0
1443,B003ZBZ64Q,339,5.0


In [29]:
def recommend(id):     
    recommend_products = popularity_recommendations 
    recommend_products["user_id"] = id 
    column = recommend_products.columns.tolist() 
    column = column[-1:] + column[:-1] 
    recommend_products = recommend_products[column] 
    return recommend_products 

print(recommend(9))

      user_id  product_id  score  rank
1576        9  B0042FV2SI    694   1.0
249         9  B000S5Q9CA    628   2.0
342         9  B0015RB39O    466   3.0
1674        9  B0044WTQVE    360   4.0
1443        9  B003ZBZ64Q    339   5.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommend_products["user_id"] = id


#### User-based collaborative filtering

We will build the recommender systems using matrix factorization and singular value decomposition.

In [26]:
# shape of the matrix
final_ratings_matrix.shape

(16136, 1917)

In [27]:
# define user index and display the first elements
final_ratings_matrix['user_id_index'] = np.arange(0, final_ratings_matrix.shape[0], 1)
final_ratings_matrix.head()

product_id,120401325X,3998899561,6073894996,7532385086,7887421268,8199406933,8288853439,8288855504,8288862993,8288878881,...,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9T01S,B004E9TLVM,user_id_index
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01335502AS0LP3ASV1Z7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A01623621DS97QCLQANL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
A01852072Z7B68UHLI5UG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
A025531519P5C6VNP05FT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
A0260105268IXGY2H11N2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [28]:
# actual ratings given by user
final_ratings_matrix.set_index(['user_id_index'], inplace = True)
final_ratings_matrix.head()

product_id,120401325X,3998899561,6073894996,7532385086,7887421268,8199406933,8288853439,8288855504,8288862993,8288878881,...,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9T01S,B004E9TLVM
user_id_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We observe that the user item matrix is a sparse matrix due to the zero values. To solve this issue, we use the singular value decomposition method (SVD) which leads to small but non-zero singular values in a rank deficient matrix.

In [30]:
# singular value decomposition
U, sigma, Vt = svds(final_ratings_matrix, k = 9)
print('Left singular matrix: \n', U)

Left singular matrix: 
 [[ 1.53164301e-04 -7.62914240e-04  7.13931387e-04 ...  2.57795604e-03
   4.38134181e-02 -6.97342977e-03]
 [ 3.38611787e-03  1.57749625e-03 -3.28176795e-06 ... -9.27559322e-05
   4.94021579e-04 -4.88141709e-04]
 [ 7.33219323e-06 -2.76876808e-07  7.44097930e-07 ...  5.20990707e-07
   1.06505970e-05 -1.80167890e-06]
 ...
 [ 1.87152456e-04  6.69135945e-05  1.98348815e-05 ... -1.78063101e-05
   2.03767491e-05 -2.93950971e-05]
 [ 1.18838085e-03 -5.65266588e-04 -5.51494193e-03 ... -4.92438436e-03
   2.85881436e-05 -3.39822352e-04]
 [ 7.91197238e-04  7.53859691e-04  3.00690643e-04 ... -2.66072127e-04
   7.88334267e-06 -1.14710939e-04]]


In [31]:
print('Sigma: \n', sigma)

Sigma: 
 [ 59.80661748  71.82088318  75.66793508  76.87595699  82.70238809
  86.60127294  93.76851921 112.39311469 116.39669733]


In [32]:
# convert sigma into a diagonal matrix
sigma = np.diag(sigma)
print('Diagonal matrix: \n', sigma)

Diagonal matrix: 
 [[ 59.80661748   0.           0.           0.           0.
    0.           0.           0.           0.        ]
 [  0.          71.82088318   0.           0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.          75.66793508   0.           0.
    0.           0.           0.           0.        ]
 [  0.           0.           0.          76.87595699   0.
    0.           0.           0.           0.        ]
 [  0.           0.           0.           0.          82.70238809
    0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
   86.60127294   0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.          93.76851921   0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.         112.39311469   0.        ]
 [  0.           0.           0.           0.           0.
  

In [33]:
print('Right sigular matrix: \n', Vt)

Right sigular matrix: 
 [[ 3.41217032e-04  1.34484762e-03  1.82580625e-03 ...  2.42309343e-04
   3.18654955e-04  4.19295639e-03]
 [-7.88316933e-05  1.08091775e-05 -1.04353668e-03 ... -7.11921582e-05
   2.04753523e-05  5.78694426e-05]
 [-3.63243463e-05  9.87766357e-06 -8.52779075e-04 ...  2.66564837e-03
  -1.83969160e-05  1.05568564e-03]
 ...
 [ 9.37748354e-05 -9.91351651e-06 -7.39612267e-04 ... -2.33926228e-03
  -1.29780933e-05 -9.62952330e-04]
 [ 2.10804867e-03  1.55021163e-07  3.50203935e-03 ... -2.41625810e-05
   4.26081079e-07  1.06196511e-05]
 [-3.36712104e-04 -9.29908937e-06 -8.34601385e-04 ... -4.25486841e-04
  -1.17981034e-05 -7.27983748e-05]]


In [34]:
# predict new ratings using sigma
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [35]:
# convert predicted scores to a dataframe
preds_ratings_df = pd.DataFrame(predicted_ratings, columns = final_ratings_matrix.columns)
preds_ratings_df.head()

product_id,120401325X,3998899561,6073894996,7532385086,7887421268,8199406933,8288853439,8288855504,8288862993,8288878881,...,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9T01S,B004E9TLVM
0,0.010234,7.153324e-06,0.006363975,-0.000118919,0.0001195782,-0.0009413541,-0.0001548411,1.475609e-05,0.000239537,-1.440216e-05,...,-0.001213611,6.70111e-05,0.011817,-0.0002261622,0.016319,1.571355e-05,3.948043e-06,-1.770209e-05,-9.519966e-06,2.4e-05
1,0.000163,0.0002757252,0.001066438,8.858701e-05,0.000243069,0.0002553284,6.115802e-05,9.846929e-06,0.0002314424,5.198771e-05,...,0.0001408981,7.119358e-05,0.000927,0.0002802827,0.000301,4.795643e-07,7.064295e-06,6.094455e-05,7.053752e-05,0.000862
2,2e-06,5.886975e-07,4.618255e-07,2.483072e-08,4.71999e-07,-3.348118e-07,-1.236901e-07,1.74551e-07,2.035789e-07,4.527441e-08,...,-8.925298e-07,9.646589e-08,4e-06,1.359804e-07,4e-06,4.191654e-09,1.484235e-08,2.482385e-07,1.36243e-07,2e-06
3,0.000125,5.872502e-05,0.000400738,1.321932e-05,4.87364e-05,2.947917e-05,-1.92783e-05,-1.560155e-06,3.712868e-06,5.266079e-06,...,7.99472e-05,1.516111e-05,0.000254,3.811636e-05,0.000167,2.685815e-07,1.461306e-06,6.782813e-06,1.450334e-05,0.000182
4,1e-06,8.48808e-06,6.38551e-05,6.112019e-07,8.748543e-06,6.144474e-06,3.132168e-05,-1.557847e-07,6.326763e-05,6.50448e-06,...,1.645869e-06,2.284703e-06,2.5e-05,4.875476e-05,4e-06,9.356674e-09,2.472654e-07,3.838313e-06,2.154529e-06,2.6e-05


In [36]:
# recommend the item with the highest predicted ratings
def recommend_item (user_id, final_ratings_matrix, preds_ratings_df, num_recommendations):
    # index starts at 0
    useridx = user_id-1
    # get and sort the user's ratings
    sort_user_rating = final_ratings_matrix.iloc[useridx].sort_values(ascending = False)
    # sorted user rating
    sort_user_prediction = preds_ratings_df.iloc[useridx].sort_values(ascending = False)
     #sort reviewer_predictions
    temp = pd.concat([sort_user_rating, sort_user_prediction], axis = 1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_prediction']
    temp = temp.loc[temp.user_ratings == 0]
    temp = temp.sort_values('user_prediction', ascending = False)
    print('\nBelow are the recommended items for user(user_id = {}):\n'. format(user_id))
    print(temp.head(num_recommendations))

In [37]:
user_id = 9
num_recommendations = 10
recommend_item(user_id, final_ratings_matrix, preds_ratings_df, num_recommendations)


Below are the recommended items for user(user_id = 9):

                   user_ratings  user_prediction
Recommended Items                               
B0042FV2SI                  0.0         0.106485
B0015RB39O                  0.0         0.046739
B002BH3I9U                  0.0         0.031308
B0044WTQVE                  0.0         0.016547
B001XXUOQI                  0.0         0.015981
B003UC93WQ                  0.0         0.015172
B0035R2QS4                  0.0         0.011967
B003X6LPRK                  0.0         0.010180
B003TW77KC                  0.0         0.009756
B003TM5AJM                  0.0         0.007362


In [38]:
user_id = 56
num_recommendations = 10
recommend_item(user_id, final_ratings_matrix, preds_ratings_df, num_recommendations)


Below are the recommended items for user(user_id = 56):

                   user_ratings  user_prediction
Recommended Items                               
B0041ST5L2                  0.0         0.019099
B002BH3I9U                  0.0         0.010340
B000S5Q9CA                  0.0         0.009445
B001XXUOQI                  0.0         0.008899
B003UC93WQ                  0.0         0.003735
B0035R2QS4                  0.0         0.003366
B0009B0IX4                  0.0         0.002018
B00404UR3C                  0.0         0.001793
B003X6LPRK                  0.0         0.001732
B003ELOOZO                  0.0         0.001181


#### Evaluation of our recommendation collaborative filtering engine

In [39]:
# average actual rating per item
final_ratings_matrix.mean().head(10)

product_id
120401325X    0.001921
3998899561    0.002851
6073894996    0.008924
7532385086    0.002479
7887421268    0.003223
8199406933    0.001487
8288853439    0.002665
8288855504    0.001735
8288862993    0.006011
8288878881    0.004896
dtype: float64

In [40]:
# predicted ratings per item
preds_ratings_df.mean().head(10)

product_id
120401325X    0.000501
3998899561    0.000134
6073894996    0.002551
7532385086    0.000098
7887421268    0.000170
8199406933    0.000583
8288853439    0.000579
8288855504    0.000318
8288862993    0.000821
8288878881    0.000112
dtype: float64

In [41]:
rmse_df = pd.concat([final_ratings_matrix.mean(), preds_ratings_df.mean()], axis = 1)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
rmse_df['product_id_index'] = np.arange(0, rmse_df.shape[0], 1)
rmse_df.head(10)

Unnamed: 0_level_0,Avg_actual_ratings,Avg_predicted_ratings,product_id_index
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
120401325X,0.001921,0.000501,0
3998899561,0.002851,0.000134,1
6073894996,0.008924,0.002551,2
7532385086,0.002479,9.8e-05,3
7887421268,0.003223,0.00017,4
8199406933,0.001487,0.000583,5
8288853439,0.002665,0.000579,6
8288855504,0.001735,0.000318,7
8288862993,0.006011,0.000821,8
8288878881,0.004896,0.000112,9


In [42]:
mse = mean_squared_error(rmse_df.Avg_actual_ratings, rmse_df.Avg_predicted_ratings)
mse

1.9371321521136956e-05

In [43]:
RMSE = sqrt(mse)
print('RMSE: %f' % RMSE)

RMSE: 0.004401


In [None]:
# Source Code:
# https://thecleverprogrammer.com/2021/03/23/amazon-recommendation-system-using-python/
# https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html
# https://towardsdatascience.com/using-cosine-similarity-to-build-a-movie-recommendation-system-ae7f20842599
# https://github.com/VaibhavAbhimanyooHiwase/Sentimental_Analysis_using_Opinion_Target_and_Opinion_Words/blob/master/sentiment%20analysis%20based%20on%20opinion%20target%20and%20opinion%20word%20mining.py
# https://github.com/Kavitha-Kothandaraman/Product-Recommendation-Systems/blob/master/Product_Recommendation_Systems.ipynb
# https://github.com/LaxmiChaudhary/Amzon-Product-Recommendation/blob/master/Recommendation%20System.ipynb