# Item Based Collaborative Filtering

In [1]:
%reset -f
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# load data - WINDOWS
# amz_data = pd.read_csv(r'c:\Users\e1002902\Downloads\amz_data.csv')
# display(amz_data.head())

# load data - MAC OS
amz_data = pd.read_csv('/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/set2_data_modelling.csv')
display(amz_data.head(3))

# print details
print('Number of Rows: ', amz_data.shape[0])
print('Number of Columns: ', amz_data.shape[1])
print('Number of Unique Users: ', len(amz_data['reviewerID'].unique()))
print('Number of Unique Products: ', len(amz_data['asin'].unique()))
print('Fewest reviews by a reviewer:', amz_data.groupby('reviewerID')['asin'].count().min())
print('Most reviews by a reviewer:', amz_data.groupby('reviewerID')['asin'].count().max())
print("Fewest reviews per product:", amz_data.groupby('asin')['reviewerID'].count().min())
print("Most reviews per product:", amz_data.groupby('asin')['reviewerID'].count().max())


# Creating User Item Matrix =====================================================
# create user-item matrix
x = amz_data.pivot_table(index='reviewerID', columns='asin', values='overall')
x = x.fillna(0)
print("\n\n\nUser-Item Matrix")
display(x.head())

Unnamed: 0,reviewerID,reviewTime,asin,overall,reviewText,stemmed_words_revText,lemmatised_reviewText,filtered_tokens_revText,sentiments_afinn,sentiments_bing,sentiments_vader
0,A14638TGYH7GD9,2010-10-28,321719816,5.0,even though i use dreamweaver a great deal and...,even though use dreamweav great deal sever boo...,even though use dreamweaver great deal several...,even though use dreamweaver great deal several...,20,11,0.99
1,A2JMJVNTBL7K7E,2011-04-07,321719816,5.0,i spent several hours on the lesson and i love...,spent sever hour lesson love detail clear inst...,spent several hour lesson love detailed clear ...,spent several hours lesson love detailed clear...,19,8,0.9766
2,A2BVNVJOFXGZUB,2010-09-26,321719816,5.0,the video is wellpaced and delivered in an und...,video wellpac deliv understand manner allow wo...,video wellpaced delivered understandable manne...,video wellpaced delivered understandable manne...,3,3,0.4939


Number of Rows:  294255
Number of Columns:  11
Number of Unique Users:  13436
Number of Unique Products:  11751
Fewest reviews by a reviewer: 12
Most reviews by a reviewer: 368
Fewest reviews per product: 12
Most reviews per product: 374



User-Item Matrix


asin,0321719816,0439499887,0763855553,076780192X,0767824571,0767827759,0767834739,0768881714,0782010792,0783239408,...,B01HD8OXO0,B01HD8OYSK,B01HDW58I6,B01HE0W2WC,B01HGBAFNC,B01HGD8OYM,B01HGSJPMW,B01HHVVLGQ,B01HHVWWMI,B01HIZF7XE
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0380485C177Q6QQNJIX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0685888WB02Q69S553P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0743345UFTOA4V1Z7W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004703RC79J9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100JCBNALJFAW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# create a copy of the original matrix to store hidden ratings
x_hidden = x.copy()
indices_tracker = []

# number of products to hide for each user
N = 2

# identifies rated items and randomly selects N products to hide ratings for each user
np.random.seed(10)  # You can use any integer value as the seed
for user_id in range(x_hidden.shape[0]):
    rated_products = np.where(x_hidden.iloc[user_id, :] > 0)[0]
    print("User:", user_id)
    print("Indices of Rated Products:", rated_products)
    hidden_indices = np.random.choice(rated_products, N, replace=False)
    indices_tracker.append(hidden_indices)
    print("Indices to Hide:", hidden_indices, "\n")
    x_hidden.iloc[user_id, hidden_indices] = 0


User: 0
Indices of Rated Products: [2004 4947 4952 4959 5010 5438 5442 5470 5567 6090 6094 6950]
Indices to Hide: [4952 5442] 

User: 1
Indices of Rated Products: [1304 2002 2004 2010 4156 4699 4847 4978 4985 5842 6310 6824 6851 9874
 9915]
Indices to Hide: [4699 2002] 

User: 2
Indices of Rated Products: [ 2828  5037  5429  6323  6324  6646  6900  6935  7078  7721  7769  8092
  8556  8648  8684  8687  9538 10134 10260 10263 10381]
Indices to Hide: [7769 8092] 

User: 3
Indices of Rated Products: [  82  379 1572 1621 1855 1950 2014 2016 2017 2032 2042 2409 2530 2698
 2800 2978 3198 3572]
Indices to Hide: [2698 1621] 

User: 4
Indices of Rated Products: [ 3847  4515  4579  5898  6317  6582  7019  7038  7090  7111  7127  7131
  7138  7155  7157  7188  7272  7430  7432  7567  7776  7863  7888  8071
  8098  8110  8128  8167  8168  8169  8257  8259  8293  8319  8367  8368
  8369  8430  8465  8653  8771  8977  9004  9353  9414  9442  9524  9597
 10080]
Indices to Hide: [7776 7272] 

User: 5


In [8]:
# check tracker - all hidden ratings 
indices_tracker = pd.DataFrame(indices_tracker).to_numpy()
print("Indices of Ratings per user \n", indices_tracker)

# flattened
indices_tracker_flat = indices_tracker.flatten()
print("Indices of Ratings per User joined", indices_tracker_flat)


Indices of Ratings per user 
 [[ 2431  8023]
 [11218  4122]
 [ 5937  3319]
 [ 5556  7998]
 [  387  5643]
 [ 5364  8622]
 [ 3890  9387]
 [ 2056 10915]
 [10329  3570]
 [ 7672  9486]
 [ 6071  8733]
 [ 6738  3727]
 [ 6194  6376]
 [ 7916  9708]
 [ 2169  2136]
 [  864  3882]
 [ 1423  7518]
 [ 3994  7554]
 [ 5086  4258]
 [ 4095  4547]
 [ 1389  5634]
 [ 8052  3632]
 [ 6659  2966]
 [ 5076  4819]
 [ 2483  2566]
 [11474  6721]
 [ 9639  6316]
 [10894  3823]
 [ 5598  5737]
 [ 7722  4258]
 [ 1813  6811]
 [ 9657  7175]
 [ 7244 10567]
 [10243  1785]
 [ 5185  6562]
 [ 6716  1040]
 [ 4268  4216]
 [ 6513  9376]
 [ 3965  9451]
 [ 4668  5134]
 [ 8775  9194]
 [ 6895 10488]
 [ 5582  3300]
 [ 8796 10152]
 [ 3533  2502]
 [ 8035  9250]
 [ 4908  6801]
 [ 1491  2433]
 [  181  1279]
 [ 3240  3740]
 [ 1283  6518]
 [ 3245   402]
 [ 3821 11327]
 [ 2465  5504]
 [ 6836  9827]
 [11150  4998]
 [ 2779  2402]
 [ 7182 10136]
 [ 8826 10901]
 [ 1671  1302]
 [ 3214  1222]
 [10074 10383]
 [ 3069 11253]
 [  974  3572]
 [ 6120  5

In [9]:
# see updated matrix with hidden ratings
print("Updated Matrix with Hidden Ratings")
display(x_hidden)

# see original matrix
print("Original Matrix")
display(x)

Updated Matrix with Hidden Ratings


asin,0740782282,0767802799,0767805712,0767809254,0767819462,0767826728,0767827759,0780626699,0782010040,0782010792,...,B01HG1LA6S,B01HG36N0Y,B01HH79XRE,B01HHGAIHE,B01HHVWDG8,B01HHVZRRA,B01HHW0LSY,B01HI776Y0,B01HIPMSAY,B01HIWLIBM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10ZBR6O8S8OCY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A119Q9NFGVOEJZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11OTLEDSW8ZXD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A129YBX5BVNW2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AXUJFOFQZNTN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AY1EF0GOH80EK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZSN1TO0JI87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZV26LP92E6WU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Original Matrix


asin,0740782282,0767802799,0767805712,0767809254,0767819462,0767826728,0767827759,0780626699,0782010040,0782010792,...,B01HG1LA6S,B01HG36N0Y,B01HH79XRE,B01HHGAIHE,B01HHVWDG8,B01HHVZRRA,B01HHW0LSY,B01HI776Y0,B01HIPMSAY,B01HIWLIBM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10ZBR6O8S8OCY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A119Q9NFGVOEJZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11OTLEDSW8ZXD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A129YBX5BVNW2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AXUJFOFQZNTN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AY1EF0GOH80EK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZSN1TO0JI87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZV26LP92E6WU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# get cosine sim matrix and change to pd dataframe and save to csv
sim_mat_cos = cosine_similarity(x_hidden.T).round(5)
print("Cosine Similarity Matrix") 
sim_mat_cos

Cosine Similarity Matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [18]:
# get a predictions matrix
predic_matrix = x_hidden.copy()

# get predicted ratings for unread books for user 1 using cosine similarity
user_ratings = predic_matrix.iloc[0, :].values.reshape(1, -1)
unrated_products_indices = np.where(user_ratings == 0)[1]
rated_products_indices = np.where(user_ratings > 0)[1]

for product_id in unrated_products_indices:
    similarity_i_j = sim_mat_cos[product_id, rated_products_indices]
    ratings = user_ratings[0, rated_products_indices]
    predicted_rating = np.sum(ratings * similarity_i_j) / np.sum(np.abs(similarity_i_j))
    predic_matrix.iloc[0, product_id] = predicted_rating.round(2)

# see updated matrix with predicted ratings
print("Predicted Ratings for User 1")
display(predic_matrix)

# save to csv
predic_matrix.to_csv(r"C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\temp_data_predic_matrix_cosine.csv")


  predicted_rating = np.sum(ratings * similarity_i_j) / np.sum(np.abs(similarity_i_j))


Predicted Ratings for User 1


asin,0740782282,0767802799,0767805712,0767809254,0767819462,0767826728,0767827759,0780626699,0782010040,0782010792,...,B01HG1LA6S,B01HG36N0Y,B01HH79XRE,B01HHGAIHE,B01HHVWDG8,B01HHVZRRA,B01HHW0LSY,B01HI776Y0,B01HIPMSAY,B01HIWLIBM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100WO06OQR8BQ,,,,,,,,,,,...,5.0,,,5.0,,,,,,
A10ZBR6O8S8OCY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A119Q9NFGVOEJZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11OTLEDSW8ZXD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A129YBX5BVNW2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AXUJFOFQZNTN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AY1EF0GOH80EK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZSN1TO0JI87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZV26LP92E6WU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# get a predictions matrix
predic_matrix = x_hidden.copy()

# now get predicted ratings for all users
for user_id in range(predic_matrix.shape[0]):
    user_ratings = predic_matrix.iloc[user_id, :].values.reshape(1, -1)
    unrated_products_indices = np.where(user_ratings == 0)[1]
    rated_products_indices = np.where(user_ratings > 0)[1]
    for product_id in unrated_products_indices:
        similarity_i_j = sim_mat_cos[product_id, rated_products_indices]
        ratings = user_ratings[0, rated_products_indices]
        
        if np.any(similarity_i_j):
            predicted_rating = np.sum(ratings * similarity_i_j) / np.sum(np.abs(similarity_i_j))
        else:
            # make predicted rating mean of user's ratings
            predicted_rating = np.mean(ratings)
        
        predic_matrix.iloc[user_id, product_id] = predicted_rating.round(2)

# see updated matrix with predicted ratings
print("Predicted Ratings for All Users")
display(predic_matrix)

Predicted Ratings for All Users


asin,0740782282,0767802799,0767805712,0767809254,0767819462,0767826728,0767827759,0780626699,0782010040,0782010792,...,B01HG1LA6S,B01HG36N0Y,B01HH79XRE,B01HHGAIHE,B01HHVWDG8,B01HHVZRRA,B01HHW0LSY,B01HI776Y0,B01HIPMSAY,B01HIWLIBM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100WO06OQR8BQ,4.03,4.03,4.03,4.03,4.03,4.03,4.03,4.03,4.03,4.03,...,5.00,4.03,4.03,5.00,4.03,4.03,4.03,4.03,4.03,4.03
A10ZBR6O8S8OCY,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,...,4.00,4.40,4.40,5.00,4.40,4.40,4.40,4.40,4.40,4.40
A119Q9NFGVOEJZ,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,...,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00,5.00
A11OTLEDSW8ZXD,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35,...,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35,4.35
A129YBX5BVNW2,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59,...,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59,4.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AXUJFOFQZNTN,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,...,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40
AY1EF0GOH80EK,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09,...,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09,4.09
AZSN1TO0JI87B,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,...,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40,4.40
AZV26LP92E6WU,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55,...,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55,4.55


In [13]:
# now evaluate how good the predictions are vs the hidden ratings
# step 1: identify the hidden ratings indices
# step 2: extract hidden ratings indices and corresponding predicted ratings indices
# step 3: calculate MAE, MSE and RMSE (take the hidden ratings as the true values and the predicted ratings as the predicted values)
# step 4:  binarise to get classification metrics

# step 1: identify the hidden ratings indices = indices_tracker and get the hidden ratings ==========================================================================
hidden_ratings_ind = indices_tracker.copy()

# Loop through users to append hidden ratings
hidden_ratings_arrays = []

# Loop through users to append hidden ratings arrays
for user in range(x.shape[0]):
    user_hidden_ratings = x.iloc[user, hidden_ratings_ind[user, :]].reset_index(drop=True).values
    hidden_ratings_arrays.append(user_hidden_ratings)


hidden_ratings_array = pd.DataFrame(hidden_ratings_arrays).to_numpy().flatten()
print("Hidden Ratings:", hidden_ratings_array)


Hidden Ratings: [3. 5. 4. 4. 5. 5. 5. 5. 4. 5. 5. 5. 5. 5. 4. 5. 5. 4. 4. 3. 4. 5. 3. 3.
 4. 4. 3. 5. 5. 5. 3. 5. 4. 4. 5. 5. 5. 5. 5. 5. 5. 4. 5. 4. 5. 4. 5. 5.
 5. 5. 4. 3. 4. 4. 5. 4. 4. 4. 5. 5. 4. 3. 5. 3. 5. 5. 5. 4. 5. 5. 5. 5.
 5. 4. 1. 3. 5. 3. 3. 5. 2. 4. 5. 4. 5. 5. 5. 5. 5. 5. 4. 5. 4. 4. 5. 5.
 4. 5. 5. 4. 4. 3. 5. 4. 2. 4. 5. 5. 5. 5. 5. 5. 5. 5. 5. 4. 2. 4. 5. 5.
 5. 5. 5. 5. 5. 5. 5. 4. 4. 4. 5. 5. 5. 5. 4. 4. 5. 5. 3. 4. 3. 5. 5. 5.
 4. 5. 3. 3. 4. 5. 5. 5. 4. 4. 4. 5. 4. 5. 4. 5. 5. 1. 5. 5. 5. 2. 3. 4.
 4. 4. 4. 4. 5. 4. 4. 5. 4. 4. 4. 5. 3. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.
 4. 5. 5. 5. 4. 1. 5. 5. 3. 5. 5. 5. 5. 3. 5. 5. 5. 5. 4. 3. 3. 2. 5. 4.
 2. 1. 3. 5. 4. 5. 5. 5. 5. 5. 4. 5. 2. 5. 5. 4. 5. 5. 5. 5. 5. 5. 5. 4.
 5. 4. 5. 4. 5. 5. 5. 4. 4. 4. 1. 5. 5. 4. 5. 5. 4. 5. 3. 2. 5. 5. 5. 5.
 4. 5. 2. 5. 5. 4. 3. 3. 4. 4. 3. 5. 5. 5. 5. 3. 5. 5. 3. 5. 3. 5. 4. 4.
 4. 4. 4. 4. 4. 5. 5. 4. 4. 4. 5. 5. 3. 3. 5. 5. 3. 4. 4. 5. 3. 5. 3. 5.
 4. 5. 5. 5. 5. 4. 4. 4. 5. 5. 5. 5

In [14]:
# step 2: extract corresponding predicted ratings indices ==========================================================================

# Create an empty list to store predicted ratings arrays
predicted_ratings_arrays = []

# Loop through users to append predicted ratings arrays
for user in range(predic_matrix.shape[0]):
    user_predicted_ratings = predic_matrix.iloc[user, hidden_ratings_ind[user, :]].reset_index(drop=True).values
    predicted_ratings_arrays.append(user_predicted_ratings)

predicted_ratings_array = pd.DataFrame(predicted_ratings_arrays).to_numpy().flatten()
print("Corresponding Predicted Ratings:", predicted_ratings_array)


Corresponding Predicted Ratings: [5.   4.03 4.4  4.4  5.   5.   4.35 4.35 4.59 4.59 5.   5.   4.83 4.83
 4.5  5.   3.79 3.79 3.9  3.9  4.89 4.89 4.26 4.26 4.03 4.03 4.37 4.37
 4.26 4.26 4.95 4.95 4.52 4.52 5.   5.   4.45 4.45 4.9  4.9  4.91 4.91
 4.53 4.53 4.72 4.72 4.3  4.3  4.85 4.85 3.89 3.89 4.17 4.17 4.15 4.15
 4.   4.   4.42 4.42 2.94 2.94 3.7  3.7  4.5  4.5  4.21 4.21 4.76 4.76
 4.28 4.28 4.07 4.07 4.05 4.05 4.59 4.59 3.95 3.95 4.66 4.66 4.17 4.17
 4.   4.   4.68 4.68 4.61 4.61 4.52 4.52 3.72 3.72 4.81 4.81 4.28 4.28
 4.24 4.24 4.11 4.11 4.07 4.07 4.71 4.71 4.55 4.55 4.8  4.8  4.87 4.87
 4.89 4.89 4.5  4.5  4.4  4.4  4.16 4.16 4.72 4.72 4.38 4.38 4.67 4.67
 4.37 4.37 4.   3.85 5.   5.   4.37 4.37 4.18 4.18 4.42 4.42 3.72 3.72
 4.59 4.59 4.92 4.92 4.15 4.15 4.83 4.83 4.63 4.63 4.39 4.39 4.06 4.06
 4.62 4.62 4.41 4.41 4.52 4.52 4.56 4.56 4.83 4.83 4.12 4.12 3.8  3.8
 3.89 3.89 4.12 4.12 4.44 4.44 4.39 4.39 4.22 4.22 4.21 4.21 4.46 4.46
 4.6  4.6  4.57 4.57 4.81 5.   4.59 4.   4.7 

In [15]:
# step 3: calculate MAE, MSE and RMSE (take the hidden ratings as the true values and the predicted ratings as the predicted values) ==========================================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error

# calculate MAE, MSE and RMSE
print("Using sklearn")
mae = mean_absolute_error(hidden_ratings_array, predicted_ratings_array)
mse = mean_squared_error(hidden_ratings_array, predicted_ratings_array)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


# Manually
print("\n\nManually")
mae = np.mean(np.abs(hidden_ratings_array - predicted_ratings_array)) # Calculate Mean Absolute Error (MAE)
mse = np.mean((hidden_ratings_array - predicted_ratings_array) ** 2) # Calculate Mean Squared Error (MSE)
rmse = np.sqrt(mse) # Calculate Root Mean Squared Error (RMSE)


print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Using sklearn
Mean Absolute Error (MAE): 0.6142643391521196
Mean Squared Error (MSE): 0.7080653366583542
Root Mean Squared Error (RMSE): 0.8414661827182089


Manually
Mean Absolute Error (MAE): 0.6142643391521196
Mean Squared Error (MSE): 0.7080653366583542
Root Mean Squared Error (RMSE): 0.8414661827182089


In [16]:
# step 4: calculate Classification Metrics (take the hidden ratings and the predicted ratings and binarise them) ==========================================================================

# Binarise the hidden ratings and predicted ratings
threshold = 3.5
binary_prediction_ratings = (predicted_ratings_array >= threshold).astype(int) 
print(f"If predicted rating is greater than or equal to {threshold}, then 1, else 0\n")
print("Predicted Ratings:", predicted_ratings_array)
print("Binary Predictions:", binary_prediction_ratings)
binary_hidden_ratings = (hidden_ratings_array >= threshold).astype(int)
print("\n")

print("Hidden Ratings:", hidden_ratings_array)
print("Binary Hidden Ratings:", binary_hidden_ratings)

If predicted rating is greater than or equal to 3.5, then 1, else 0

Predicted Ratings: [5.   4.03 4.4  4.4  5.   5.   4.35 4.35 4.59 4.59 5.   5.   4.83 4.83
 4.5  5.   3.79 3.79 3.9  3.9  4.89 4.89 4.26 4.26 4.03 4.03 4.37 4.37
 4.26 4.26 4.95 4.95 4.52 4.52 5.   5.   4.45 4.45 4.9  4.9  4.91 4.91
 4.53 4.53 4.72 4.72 4.3  4.3  4.85 4.85 3.89 3.89 4.17 4.17 4.15 4.15
 4.   4.   4.42 4.42 2.94 2.94 3.7  3.7  4.5  4.5  4.21 4.21 4.76 4.76
 4.28 4.28 4.07 4.07 4.05 4.05 4.59 4.59 3.95 3.95 4.66 4.66 4.17 4.17
 4.   4.   4.68 4.68 4.61 4.61 4.52 4.52 3.72 3.72 4.81 4.81 4.28 4.28
 4.24 4.24 4.11 4.11 4.07 4.07 4.71 4.71 4.55 4.55 4.8  4.8  4.87 4.87
 4.89 4.89 4.5  4.5  4.4  4.4  4.16 4.16 4.72 4.72 4.38 4.38 4.67 4.67
 4.37 4.37 4.   3.85 5.   5.   4.37 4.37 4.18 4.18 4.42 4.42 3.72 3.72
 4.59 4.59 4.92 4.92 4.15 4.15 4.83 4.83 4.63 4.63 4.39 4.39 4.06 4.06
 4.62 4.62 4.41 4.41 4.52 4.52 4.56 4.56 4.83 4.83 4.12 4.12 3.8  3.8
 3.89 3.89 4.12 4.12 4.44 4.44 4.39 4.39 4.22 4.22 4.21 4.21 

In [17]:
# calculate accuracy using sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# calculate accuracy using sklearn
print("Using sklearn")
accuracy = accuracy_score(binary_hidden_ratings, binary_prediction_ratings)
precision = precision_score(binary_hidden_ratings, binary_prediction_ratings)
recall = recall_score(binary_hidden_ratings, binary_prediction_ratings)
f1 = f1_score(binary_hidden_ratings, binary_prediction_ratings)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# calculate accuracy manually
print("\n\nManually")
true_positives = np.sum((binary_hidden_ratings == 1) & (binary_prediction_ratings == 1))
true_negatives = np.sum((binary_hidden_ratings == 0) & (binary_prediction_ratings == 0))
false_positives = np.sum((binary_hidden_ratings == 0) & (binary_prediction_ratings == 1))
false_negatives = np.sum((binary_hidden_ratings == 1) & (binary_prediction_ratings == 0))

accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1 = 2 * precision * recall / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Using sklearn
Accuracy: 0.8603491271820449
Precision: 0.8727034120734908
Recall: 0.9779411764705882
F1 Score: 0.9223300970873787


Manually
Accuracy: 0.8603491271820449
Precision: 0.8727034120734908
Recall: 0.9779411764705882
F1 Score: 0.9223300970873787


***
# Sandbox

Here we will test out the workings of item based collaborative filtering. The steps are as follows:

1. Have User Item matrix
2. Hide some ratings to simulate a test set
3. Calculate similarity (cosine similarity)
4. Calculate weighted average of ratings
5. Fill in missing values with predicted ratings
6. Take the predicted ratings and compare them to the hidden ratings
7. Calculate MAE, RMSE, MSE
8. Binarise the ratings 
9. Calculate classification metrics


In [2]:
%reset -f

# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
x = pd.read_csv(r"C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\temp_data.csv", index_col=0)
x

Unnamed: 0,book1,book2,book3,book4,book5,book6,book7,book8,book9,book10
user1,0,0,2,5,4,3,4,4,4,4
user2,4,0,3,5,0,0,0,0,0,4
user3,0,3,4,4,0,2,0,0,0,0
user4,0,0,3,5,4,0,0,0,0,0
user5,3,4,0,4,4,0,5,5,5,5
user6,4,5,0,0,0,0,4,2,2,0
user7,2,2,0,0,0,0,5,3,3,3
user8,0,5,4,0,4,3,0,0,0,0
user9,0,5,4,0,5,2,0,2,2,0
user10,0,0,0,0,5,0,4,4,4,4


In [6]:
# create a copy of the original matrix to store hidden ratings
x_hidden = x.copy()
indices_tracker = []

# identifies rated books and randomly selects 2 books to hide ratings for each user
np.random.seed(10)  # You can use any integer value as the seed
for user_id in range(x_hidden.shape[0]):
    rated_books = np.where(x_hidden.iloc[user_id, :] > 0)[0]
    print("User:", user_id)
    print("Indices of Rated Books:", rated_books)
    hidden_indices = np.random.choice(rated_books, min(2, len(rated_books)), replace=False)
    indices_tracker.append(hidden_indices)
    print("Indices to Hide:", hidden_indices, "\n")
    x_hidden.iloc[user_id, hidden_indices] = 0


User: 0
Indices of Rated Books: [2 3 4 5 6 7 8 9]
Indices to Hide: [4 5] 

User: 1
Indices of Rated Books: [0 2 3 9]
Indices to Hide: [3 9] 

User: 2
Indices of Rated Books: [1 2 3 5]
Indices to Hide: [5 1] 

User: 3
Indices of Rated Books: [2 3 4]
Indices to Hide: [4 2] 

User: 4
Indices of Rated Books: [0 1 3 4 6 7 8 9]
Indices to Hide: [9 1] 

User: 5
Indices of Rated Books: [0 1 6 7 8]
Indices to Hide: [8 1] 

User: 6
Indices of Rated Books: [0 1 6 7 8 9]
Indices to Hide: [8 9] 

User: 7
Indices of Rated Books: [1 2 4 5]
Indices to Hide: [1 5] 

User: 8
Indices of Rated Books: [1 2 4 5 7 8]
Indices to Hide: [1 7] 

User: 9
Indices of Rated Books: [4 6 7 8 9]
Indices to Hide: [6 4] 

User: 10
Indices of Rated Books: [0 1 2 4 6 7 8]
Indices to Hide: [2 0] 

User: 11
Indices of Rated Books: [0 1 2 4 5 6 7 8]
Indices to Hide: [1 7] 



In [9]:
# check tracker - all hidden ratings 
indices_tracker = pd.DataFrame(indices_tracker).to_numpy()
print("Indices of Ratings per user \n", indices_tracker)

# flattened
indices_tracker_flat = indices_tracker.flatten()
print("Indices of Ratings per User joined", indices_tracker_flat)


Indices of Ratings per user 
 [[4 5]
 [3 9]
 [5 1]
 [4 2]
 [9 1]
 [8 1]
 [8 9]
 [1 5]
 [1 7]
 [6 4]
 [2 0]
 [1 7]]
Indices of Ratings per User joined [4 5 3 9 5 1 4 2 9 1 8 1 8 9 1 5 1 7 6 4 2 0 1 7]


In [10]:
# see updated matrix with hidden ratings
print("Updated Matrix with Hidden Ratings")
display(x_hidden)

# see original matrix
print("Original Matrix")
display(x)

Updated Matrix with Hidden Ratings


Unnamed: 0,book1,book2,book3,book4,book5,book6,book7,book8,book9,book10
user1,0,0,2,5,0,0,4,4,4,4
user2,4,0,3,0,0,0,0,0,0,0
user3,0,0,4,4,0,0,0,0,0,0
user4,0,0,0,5,0,0,0,0,0,0
user5,3,0,0,4,4,0,5,5,5,0
user6,4,0,0,0,0,0,4,2,0,0
user7,2,2,0,0,0,0,5,3,0,0
user8,0,0,4,0,4,0,0,0,0,0
user9,0,0,4,0,5,2,0,0,2,0
user10,0,0,0,0,0,0,0,4,4,4


Original Matrix


Unnamed: 0,book1,book2,book3,book4,book5,book6,book7,book8,book9,book10
user1,0,0,2,5,4,3,4,4,4,4
user2,4,0,3,5,0,0,0,0,0,4
user3,0,3,4,4,0,2,0,0,0,0
user4,0,0,3,5,4,0,0,0,0,0
user5,3,4,0,4,4,0,5,5,5,5
user6,4,5,0,0,0,0,4,2,2,0
user7,2,2,0,0,0,0,5,3,3,3
user8,0,5,4,0,4,3,0,0,0,0
user9,0,5,4,0,5,2,0,2,2,0
user10,0,0,0,0,5,0,4,4,4,4


In [11]:
# get cosine sim matrix and change to pd dataframe and save to csv
pd.DataFrame(cosine_similarity(x_hidden.T).round(2), index=x.columns, columns=x.columns).to_csv(r"C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\temp_data_sim_mat_cosine.csv")
sim_mat_cos = cosine_similarity(x_hidden.T).round(2)
print("Cosine Similarity Matrix") 
sim_mat_cos

Cosine Similarity Matrix


array([[1.  , 0.17, 0.39, 0.16, 0.28, 0.5 , 0.62, 0.36, 0.37, 0.  ],
       [0.17, 1.  , 0.  , 0.  , 0.38, 0.  , 0.66, 0.58, 0.36, 0.  ],
       [0.39, 0.  , 1.  , 0.34, 0.54, 0.56, 0.19, 0.1 , 0.31, 0.17],
       [0.16, 0.  , 0.34, 1.  , 0.19, 0.  , 0.41, 0.45, 0.45, 0.39],
       [0.28, 0.38, 0.54, 0.19, 1.  , 0.48, 0.51, 0.5 , 0.67, 0.  ],
       [0.5 , 0.  , 0.56, 0.  , 0.48, 1.  , 0.23, 0.  , 0.37, 0.  ],
       [0.62, 0.66, 0.19, 0.41, 0.51, 0.23, 1.  , 0.85, 0.71, 0.26],
       [0.36, 0.58, 0.1 , 0.45, 0.5 , 0.  , 0.85, 1.  , 0.86, 0.58],
       [0.37, 0.36, 0.31, 0.45, 0.67, 0.37, 0.71, 0.86, 1.  , 0.58],
       [0.  , 0.  , 0.17, 0.39, 0.  , 0.  , 0.26, 0.58, 0.58, 1.  ]])

In [13]:
# get a predictions matrix
predic_matrix = x_hidden.copy()

# get predicted ratings for unread books for user 1 using cosine similarity
user_ratings = predic_matrix.iloc[0, :].values.reshape(1, -1)
unread_books_indices = np.where(user_ratings == 0)[1]
rated_books_indices = np.where(user_ratings > 0)[1]

for book_id in unread_books_indices:
    similarity_i_j = sim_mat_cos[book_id, rated_books_indices]
    ratings = user_ratings[0, rated_books_indices]
    predicted_rating = np.sum(ratings * similarity_i_j) / np.sum(np.abs(similarity_i_j))
    predic_matrix.iloc[0, book_id] = predicted_rating.round(2)

# see updated matrix with predicted ratings
print("Predicted Ratings for User 1")
display(predic_matrix)

# save to csv
predic_matrix.to_csv(r"C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\temp_data_predic_matrix_cosine.csv")


Predicted Ratings for User 1


Unnamed: 0,book1,book2,book3,book4,book5,book6,book7,book8,book9,book10
user1,3.67,4,2,5,3.63,3.03,4,4,4,4
user2,4.0,0,3,0,0.0,0.0,0,0,0,0
user3,0.0,0,4,4,0.0,0.0,0,0,0,0
user4,0.0,0,0,5,0.0,0.0,0,0,0,0
user5,3.0,0,0,4,4.0,0.0,5,5,5,0
user6,4.0,0,0,0,0.0,0.0,4,2,0,0
user7,2.0,2,0,0,0.0,0.0,5,3,0,0
user8,0.0,0,4,0,4.0,0.0,0,0,0,0
user9,0.0,0,4,0,5.0,2.0,0,0,2,0
user10,0.0,0,0,0,0.0,0.0,0,4,4,4


In [14]:
# now get predicted ratings for all users
for user_id in range(predic_matrix.shape[0]):
    user_ratings = predic_matrix.iloc[user_id, :].values.reshape(1, -1)
    unread_books_indices = np.where(user_ratings == 0)[1]
    rated_books_indices = np.where(user_ratings > 0)[1]
    for book_id in unread_books_indices:
        similarity_i_j = sim_mat_cos[book_id, rated_books_indices]
        ratings = user_ratings[0, rated_books_indices]
        
        if np.any(similarity_i_j):
            predicted_rating = np.sum(ratings * similarity_i_j) / np.sum(np.abs(similarity_i_j))
        else:
            # make predicted rating mean of user's ratings
            predicted_rating = np.mean(ratings)
        
        predic_matrix.iloc[user_id, book_id] = predicted_rating.round(2)

# see updated matrix with predicted ratings
print("Predicted Ratings for All Users")
display(predic_matrix)

Predicted Ratings for All Users


Unnamed: 0,book1,book2,book3,book4,book5,book6,book7,book8,book9,book10
user1,3.67,4.0,2.0,5.0,3.63,3.03,4.0,4.0,4.0,4.0
user2,4.0,4.0,3.0,3.32,3.34,3.47,3.77,3.78,3.54,3.0
user3,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
user4,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
user5,3.0,4.67,4.11,4.0,4.0,4.06,5.0,5.0,5.0,4.78
user6,4.0,3.18,3.71,3.12,3.22,4.0,4.0,2.0,3.11,2.62
user7,2.0,2.0,2.99,3.65,3.22,2.95,5.0,3.0,3.3,3.62
user8,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
user9,3.05,3.54,4.0,3.28,5.0,2.0,3.16,3.16,2.0,2.45
user10,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [17]:
# now evaluate how good the predictions are vs the hidden ratings
# step 1: identify the hidden ratings indices
# step 2: extract hidden ratings indices and corresponding predicted ratings indices
# step 3: calculate MAE, MSE and RMSE (take the hidden ratings as the true values and the predicted ratings as the predicted values)
# step 4:  binarise to get classification metrics

# step 1: identify the hidden ratings indices = indices_tracker and get the hidden ratings ==========================================================================
hidden_ratings_ind = indices_tracker.copy()

# Loop through users to append hidden ratings
hidden_ratings_arrays = []

# Loop through users to append hidden ratings arrays
for user in range(x.shape[0]):
    user_hidden_ratings = x.iloc[user, hidden_ratings_ind[user, :]].reset_index(drop=True).values
    hidden_ratings_arrays.append(user_hidden_ratings)


hidden_ratings_array = pd.DataFrame(hidden_ratings_arrays).to_numpy().flatten()
print("Hidden Ratings:", hidden_ratings_array)


Hidden Ratings: [4 3 5 4 2 3 4 3 5 4 2 5 3 3 5 3 5 2 4 5 2 4 5 3]


In [18]:
# step 2: extract corresponding predicted ratings indices ==========================================================================

# Create an empty list to store predicted ratings arrays
predicted_ratings_arrays = []

# Loop through users to append predicted ratings arrays
for user in range(predic_matrix.shape[0]):
    user_predicted_ratings = predic_matrix.iloc[user, hidden_ratings_ind[user, :]].reset_index(drop=True).values
    predicted_ratings_arrays.append(user_predicted_ratings)

predicted_ratings_array = pd.DataFrame(predicted_ratings_arrays).to_numpy().flatten()
print("Corresponding Predicted Ratings:", predicted_ratings_array)


Corresponding Predicted Ratings: [3.63 3.03 3.32 3.   4.   4.   5.   5.   4.78 4.67 3.11 3.18 3.3  3.62
 4.   4.   3.54 3.16 4.   4.   5.   4.72 2.97 3.08]


In [22]:
# step 3: calculate MAE, MSE and RMSE (take the hidden ratings as the true values and the predicted ratings as the predicted values) ==========================================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error

# calculate MAE, MSE and RMSE
print("Using sklearn")
mae = mean_absolute_error(hidden_ratings_array, predicted_ratings_array)
mse = mean_squared_error(hidden_ratings_array, predicted_ratings_array)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


# Manually
print("\n\nManually")
mae = np.mean(np.abs(hidden_ratings_array - predicted_ratings_array)) # Calculate Mean Absolute Error (MAE)
mse = np.mean((hidden_ratings_array - predicted_ratings_array) ** 2) # Calculate Mean Squared Error (MSE)
rmse = np.sqrt(mse) # Calculate Root Mean Squared Error (RMSE)


print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Using sklearn
Mean Absolute Error (MAE): 1.0529166666666667
Mean Squared Error (MSE): 1.6499708333333334
Root Mean Squared Error (RMSE): 1.2845119047067386


Manually
Mean Absolute Error (MAE): 1.0529166666666667
Mean Squared Error (MSE): 1.6499708333333334
Root Mean Squared Error (RMSE): 1.2845119047067386


In [30]:
# step 4: calculate Classification Metrics (take the hidden ratings and the predicted ratings and binarise them) ==========================================================================

# Binarise the hidden ratings and predicted ratings
threshold = 3.5
binary_prediction_ratings = (predicted_ratings_array >= threshold).astype(int) 
print(f"If predicted rating is greater than or equal to {threshold}, then 1, else 0\n")
print("Predicted Ratings:", predicted_ratings_array)
print("Binary Predictions:", binary_prediction_ratings)
binary_hidden_ratings = (hidden_ratings_array >= threshold).astype(int)
print("\n")

print("Hidden Ratings:", hidden_ratings_array)
print("Binary Hidden Ratings:", binary_hidden_ratings)

If predicted rating is greater than or equal to 3.5, then 1, else 0

Predicted Ratings: [3.63 3.03 3.32 3.   4.   4.   5.   5.   4.78 4.67 3.11 3.18 3.3  3.62
 4.   4.   3.54 3.16 4.   4.   5.   4.72 2.97 3.08]
Binary Predictions: [1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0]


Hidden Ratings: [4 3 5 4 2 3 4 3 5 4 2 5 3 3 5 3 5 2 4 5 2 4 5 3]
Binary Hidden Ratings: [1 0 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1 1 0]


In [31]:
# calculate accuracy using sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# calculate accuracy using sklearn
print("Using sklearn")
accuracy = accuracy_score(binary_hidden_ratings, binary_prediction_ratings)
precision = precision_score(binary_hidden_ratings, binary_prediction_ratings)
recall = recall_score(binary_hidden_ratings, binary_prediction_ratings)
f1 = f1_score(binary_hidden_ratings, binary_prediction_ratings)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# calculate accuracy manually
print("\n\nManually")
true_positives = np.sum((binary_hidden_ratings == 1) & (binary_prediction_ratings == 1))
true_negatives = np.sum((binary_hidden_ratings == 0) & (binary_prediction_ratings == 0))
false_positives = np.sum((binary_hidden_ratings == 0) & (binary_prediction_ratings == 1))
false_negatives = np.sum((binary_hidden_ratings == 1) & (binary_prediction_ratings == 0))

accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1 = 2 * precision * recall / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Using sklearn
Accuracy: 0.5833333333333334
Precision: 0.6
Recall: 0.6923076923076923
F1 Score: 0.6428571428571429


Manually
Accuracy: 0.5833333333333334
Precision: 0.6
Recall: 0.6923076923076923
F1 Score: 0.6428571428571429
