# User Based Collaborative Filtering
## Algorithm Summary

User-based collaborative filtering is a memory-based (neighbourhood) algorithm for making recommendations. It is based on the similarity between users, calculated from the ratings those users have given to items. It is also known as user-user collaborative filtering.

1. **Load the data**
- data is provided in a dataframe where each row is a review

2. **Create a user-item matrix**
- convert dataframe into user-item matrix where each row is a user and each column is an item

3. **Create test, train and validation set**
- hide $N$ ratings for each user from the training set and use them to test the performance of the model. Hide a further set of each user's ratings and use those as the validation set.

4. **Calculate user similarity**
- using training set, calculate the similarity between users using cosine similarity

5. **Make predictions**
- for each user, for each item in the test set, calculate the similarity-weighted average of the ratings given to that item by the users most similar to the user in question

6. **Evaluate the model**
- calculate the predictive accuracy of the model using RMSE, MSE and MAE
- calculate the Top-N metrics of the model using NDCG and Hit Rate

## (1) Manual / From Fundamentals

In [None]:
# %reset -f
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the modelling dataset: one review per row.
# Windows location, kept for reference:
# amz_data = pd.read_csv(r'C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\Data\set2_data_modelling.csv')
# NOTE(review): this cell reads set3_data_modelling.csv while the later
# sections read set2_data_modelling.csv - confirm that is intended.
data_path = '/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/set3_data_modelling.csv'
amz_data = pd.read_csv(data_path)
display(amz_data.head(3))

# Summary of the raw review table; the per-user and per-product review
# counts are computed once and reused for the min/max lines.
reviews_per_user = amz_data.groupby('reviewerID')['asin'].count()
reviews_per_product = amz_data.groupby('asin')['reviewerID'].count()
n_rows, n_cols = amz_data.shape

print('Number of Rows: ', n_rows)
print('Number of Columns: ', n_cols)
print('Number of Unique Users: ', len(amz_data['reviewerID'].unique()))
print('Number of Unique Products: ', len(amz_data['asin'].unique()))
print('Fewest reviews by a reviewer:', reviews_per_user.min())
print('Most reviews by a reviewer:', reviews_per_user.max())
print("Fewest reviews per product:", reviews_per_product.min())
print("Most reviews per product:", reviews_per_product.max())


# User-item matrix ==============================================================
# Rows are reviewers, columns are products, cells hold the 'overall' rating;
# user/product pairs without a review are filled with 0.
x = amz_data.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0)
print("Shape: ", x.shape)

### Train and Test Split

In [None]:
# Work on a copy so that the full matrix `x` stays available as ground truth.
x_hidden = x.copy()
indices_tracker = []

# how many rated products are withheld per user
N = 3

# Fixed seed: the same ratings are withheld on every run.
np.random.seed(2207)
for row_pos in range(len(x_hidden)):
    # column positions of the products this user has rated (rating > 0)
    rated_cols = np.where(x_hidden.iloc[row_pos, :] > 0)[0]
    # draw N distinct rated positions for this user
    held_out = np.random.choice(rated_cols, N, replace=False)
    indices_tracker.append(held_out)
    # a zero marks "no rating" in the user-item matrix
    x_hidden.iloc[row_pos, held_out] = 0

In [None]:
# Withhold a further 2 ratings per user, recorded separately as the
# validation positions.
N = 2
indices_tracker_val = []

# The generator is re-seeded with the same value as for the test split; the
# draw is made from the ratings that are still visible in x_hidden.
np.random.seed(2207)
for row_pos in range(len(x_hidden)):
    # column positions of the products still rated (rating > 0) for this user
    rated_cols = np.where(x_hidden.iloc[row_pos, :] > 0)[0]
    held_out = np.random.choice(rated_cols, N, replace=False)
    indices_tracker_val.append(held_out)
    # a zero marks "no rating" in the user-item matrix
    x_hidden.iloc[row_pos, held_out] = 0

In [None]:
# Stack the per-user index arrays into one 2-D array (one row per user).
indices_tracker = pd.DataFrame(indices_tracker).to_numpy()
print("Indices of Ratings per user \n", indices_tracker)

# The same positions as a single flat vector.
indices_tracker_flat = indices_tracker.flatten()
print("Indices of Ratings per User joined", indices_tracker_flat)

# Show the matrix with withheld ratings next to the untouched original.
for caption, frame in (
    ("Updated Matrix with Hidden Ratings", x_hidden),
    ("Original Matrix", x),
):
    print(caption)
    display(frame)

### Similarity Matrix

In [None]:
# Pairwise cosine similarity between the rows of the training matrix,
# rounded to 5 decimals. The result stays a NumPy array (it is neither
# converted to a DataFrame nor written to disk here).
sim_mat_cos = np.round(cosine_similarity(x_hidden), 5)
print("Cosine Similarity Matrix")
sim_mat_cos

### Grid Search

In [None]:
# Grid search over the neighbourhood size k.
#
# For every candidate k the validation ratings (the positions recorded in
# indices_tracker_val) are predicted from the training matrix x_hidden and
# compared with their true values in x. The RMSE is therefore measured only on
# held-out ratings, not on every cell of the (mostly zero) matrix.
k_values = [5, 10, 20, 40]

train = x_hidden.to_numpy(dtype=float)
truth = x.to_numpy(dtype=float)
val_idx = np.asarray(indices_tracker_val)

# Rank neighbours by similarity. The diagonal is pushed to -inf so a user can
# never be selected as their own neighbour, even when similarities tie.
sims = np.array(sim_mat_cos, dtype=float)
np.fill_diagonal(sims, -np.inf)
neighbour_order = np.argsort(-sims, axis=1)

# one RMSE per entry of k_values, in the same order
rmse_list = []
for k in k_values:
    print("K = ", k)

    # the k most similar users of every user
    top_k_users = neighbour_order[:, :k]
    print("Top K Similar Users")
    print(top_k_users)

    squared_errors = []
    for user, items in enumerate(val_idx):
        neighbours = top_k_users[user]
        weights = sims[user, neighbours]

        # fallback when no neighbour has rated the item: the user's own mean
        user_rated = train[user] > 0
        user_mean = train[user, user_rated].mean() if user_rated.any() else 0.0

        for item in items:
            neighbour_ratings = train[neighbours, item]
            # only neighbours that rated the item and have positive similarity
            usable = (neighbour_ratings > 0) & (weights > 0)
            if usable.any():
                prediction = np.dot(neighbour_ratings[usable], weights[usable]) / weights[usable].sum()
            else:
                prediction = user_mean
            squared_errors.append((truth[user, item] - prediction) ** 2)

    # RMSE over the validation ratings only
    rmse = np.sqrt(np.mean(squared_errors))
    print("RMSE: ", rmse)

    rmse_list.append(rmse)
    print("\n")

# report the k with the lowest validation RMSE
best_k = k_values[int(np.argmin(rmse_list))]
print("Best K: ", best_k, " RMSE: ", min(rmse_list))

### Prediction Matrix

In [None]:
# Fill every unrated cell of the training matrix with a user-based kNN
# prediction: the similarity-weighted average of the ratings given to that
# product by the k users most similar to the target user.
#
# The user-user similarity matrix is indexed with *user* positions (the users
# who rated the product), and each product gets its own prediction.

# neighbourhood size
k = 40

train = x_hidden.to_numpy(dtype=float)
n_users, n_items = train.shape
user_sims = np.asarray(sim_mat_cos, dtype=float)

# for each product, the row positions of the users who rated it (computed once)
raters_by_item = [np.flatnonzero(train[:, item] > 0) for item in range(n_items)]

# mean of all observed training ratings; last-resort fallback
observed = train > 0
global_mean = train[observed].mean() if observed.any() else 0.0

predicted = train.copy()
for user_id in range(n_users):
    user_row = train[user_id]
    has_rating = user_row > 0
    # used when no rater of the product has any similarity to this user
    fallback = user_row[has_rating].mean() if has_rating.any() else global_mean

    for product_id in np.flatnonzero(user_row == 0):
        raters = raters_by_item[product_id]
        # similarity between this user and the users who rated this product
        similarity_i_j = user_sims[user_id, raters]

        # keep the k most similar raters
        nearest = np.argsort(similarity_i_j)[::-1][:k]
        similarity_i_j = similarity_i_j[nearest]
        ratings = train[raters[nearest], product_id]

        weight_total = np.sum(np.abs(similarity_i_j))
        if weight_total > 0:
            predicted[user_id, product_id] = np.sum(ratings * similarity_i_j) / weight_total
        else:
            predicted[user_id, product_id] = fallback

# same labels as the training matrix; observed ratings are kept unchanged
predic_matrix = pd.DataFrame(predicted, index=x_hidden.index, columns=x_hidden.columns)

# see updated matrix with predicted ratings
print("Predicted Ratings for All Users")
display(predic_matrix)

### Evaluation (Predictive Accuracy)

Now evaluate how good the predictions are vs the hidden ratings
- ***step 1***: identify the hidden ratings indices
- ***step 2***: extract hidden ratings indices and corresponding predicted ratings indices
- ***step 3***: calculate MAE, MSE and RMSE (take the hidden ratings as the true values and the predicted ratings as the predicted values)

In [None]:
def _ratings_at(matrix, positions):
    """Collect, user by user, the cells of `matrix` at the given column positions.

    Returns the list of per-user arrays and the same values as one flat array.
    """
    per_user = [
        matrix.iloc[row_pos, positions[row_pos, :]].to_numpy()
        for row_pos in range(matrix.shape[0])
    ]
    return per_user, pd.DataFrame(per_user).to_numpy().flatten()


# Step 1 - true values of the withheld ratings =================================
# indices_tracker holds, per user, the column positions that were hidden.
hidden_ratings_ind = indices_tracker.copy()
hidden_ratings_arrays, hidden_ratings_array = _ratings_at(x, hidden_ratings_ind)
print("Hidden Ratings:", hidden_ratings_array)

# Step 2 - predictions at exactly the same positions ===========================
predicted_ratings_arrays, predicted_ratings_array = _ratings_at(predic_matrix, hidden_ratings_ind)
print("Corresponding Predicted Ratings:", predicted_ratings_array)

# Step 3 - error metrics (hidden = truth, predicted = estimate) ================
from sklearn.metrics import mean_absolute_error, mean_squared_error

metric_labels = (
    "Mean Absolute Error (MAE)",
    "Mean Squared Error (MSE)",
    "Root Mean Squared Error (RMSE)",
)

# scikit-learn implementation
print("Using sklearn")
mae = mean_absolute_error(hidden_ratings_array, predicted_ratings_array)
mse = mean_squared_error(hidden_ratings_array, predicted_ratings_array)
rmse = np.sqrt(mse)
for label, value in zip(metric_labels, (mae, mse, rmse)):
    print(f"{label}: {value}")

# the same three metrics computed by hand from the residuals
print("\n\nManually")
residuals = hidden_ratings_array - predicted_ratings_array
mae = np.mean(np.abs(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
for label, value in zip(metric_labels, (mae, mse, rmse)):
    print(f"{label}: {value}")

In [None]:
# Round the three error metrics to 2 decimals, in place.
mae, mse, rmse = (round(metric, 2) for metric in (mae, mse, rmse))

# Write them as a one-row CSV (path is relative to the working directory).
results = pd.DataFrame({
    name: [value.round(3)]
    for name, value in (('MAE', mae), ('MSE', mse), ('RMSE', rmse))
})
results.to_csv("Data/Results/UBCF_results_1.csv", index=False)

### Evaluation (Top-N Metrics)

In [None]:
# Predicted ratings as one long vector, ordered by product and then by user.
preds_long = predic_matrix.stack().reset_index().rename(columns={0: 'rating'})
preds_long = preds_long.sort_values(by=['asin', 'reviewerID'])
preds_series = preds_long['rating'].reset_index(drop=True)


# Observed interactions in long format. No fillna here, so user/product pairs
# without a review keep a NaN rating.
data = amz_data.pivot_table(index='reviewerID', columns='asin', values='overall')
data_mat = data.reset_index()
data_mat = data_mat.melt(id_vars=data_mat.columns[0], var_name='product', value_name='rating')
data_mat.columns = ['user', 'product', 'rating']
for id_col in ('user', 'product'):
    data_mat[id_col] = data_mat[id_col].astype('category')

# data_mat['user'] = data_mat['user'].cat.codes
# data_mat['product'] = data_mat['product'].cat.codes
display(data_mat.head(3))

# "completed" = observed ratings, with every missing rating replaced by the
# prediction at the same row position.
# NOTE(review): this relies on melt() emitting rows in the same product-then-
# user order as the sort applied to preds_series above - confirm.
completed = data_mat.copy()
nan_rows = completed[completed['rating'].isnull()]
completed.loc[nan_rows.index, 'rating'] = preds_series[nan_rows.index]

# observed interactions
print("User Item Interactions with Ratings")
display(data_mat.head(3))

# observed + predicted interactions
print("\nUser Item Interactions with Predicted Ratings")
display(completed.head(3))

# size of the completed table
n_completed_rows, n_completed_cols = completed.shape
print('\n\nNumber of Rows: ', n_completed_rows)
print('Number of Columns: ', n_completed_cols)
print('Number of Unique Users: ', len(completed['user'].unique()))
print('Number of Unique Products: ', len(completed['product'].unique()))

#### Execute for One User

In [None]:
# Rebuild the training matrix from the full matrix, withholding only the
# N test ratings per user (same seed as before).
x_hidden = x.copy()
indices_tracker = []

# how many rated products are withheld per user
N = 3

np.random.seed(2207)
for row_pos in range(len(x_hidden)):
    # column positions of the products this user has rated (rating > 0)
    rated_cols = np.where(x_hidden.iloc[row_pos, :] > 0)[0]
    held_out = np.random.choice(rated_cols, N, replace=False)
    indices_tracker.append(held_out)
    # a zero marks "no rating" in the user-item matrix
    x_hidden.iloc[row_pos, held_out] = 0

In [None]:
# Long-format table of the training interactions: one row per
# (reviewer, product) cell of x_hidden, ordered by reviewer then product.
train_data = (
    x_hidden.copy()
    .stack()
    .reset_index()
    .rename(columns={0: 'rating'})
    .sort_values(by=['reviewerID', 'asin'])
)

# Blank out every cell equal to 0 (hidden or never-rated entries), then drop
# the rows that contain a blank.
train_data = train_data.where(train_data != 0).dropna()

# Use the column names of the `completed` table and store ids as categoricals.
train_data = train_data.rename(columns={'reviewerID': 'user', 'asin': 'product'})
for id_col in ('user', 'product'):
    train_data[id_col] = train_data[id_col].astype('category')

# train_data['reviewerID'] = train_data['reviewerID'].cat.codes
# train_data['asin'] = train_data['asin'].cat.codes
train_data


In [None]:
# Top-N evaluation for a single user.
#
# The user's row in the matrices is looked up by id, and the label columns are
# added to an explicit copy of the user's rows rather than to a slice of
# `completed`.

# number of recommendations to keep
N = 10000

# the user inspected in this cell and their row position in x / predic_matrix
user_1_id = 'A100RH4M1W1DF0'
user_1_pos = x.index.get_loc(user_1_id)

# interactions of this user that were available for training
train_x_user_1 = train_data[train_data['user'] == user_1_id]

# all interactions of this user (observed + predicted ratings)
user_1 = completed[completed['user'] == user_1_id].copy()
print("Number of Interactions for User 1: ", user_1.shape[0])

# items the user is considered to like
# NOTE(review): this uses 3.5 while `threshold` below is 3.0 - confirm both
# cut-offs are intended.
liked_items = user_1[user_1['rating'] > 3.5]
print("Number of Liked Items for User 1: ", liked_items.shape[0])

# products whose ratings were hidden for this user
hidden_cols = indices_tracker[user_1_pos]
product_ids_hidden = x.columns[hidden_cols].tolist()
print("Number of Hidden Items for User 1: ", len(product_ids_hidden))

# true vs predicted rating of the hidden products
hidden_ratings = x.iloc[user_1_pos, hidden_cols].values
predicted_ratings = predic_matrix.iloc[user_1_pos, hidden_cols].values
hidden_ratings_df = pd.DataFrame({'product': product_ids_hidden, 'hidden_rating': hidden_ratings, 'predicted_rating': predicted_ratings})

# rating above which an item counts as liked
threshold = 3.
# 1 = liked, 0 = not liked (by the true hidden rating)
hidden_ratings_df['label'] = (hidden_ratings_df['hidden_rating'] > threshold).astype(int)

# used_ind = 1 for interactions that are part of the training data
user_1['used_ind'] = user_1['product'].isin(train_x_user_1['product'].tolist()).astype(int)

print("Number of Interactions in Train Set for User 1: ", train_x_user_1.shape[0])
print("Number of Interactions in Completed User 1: ", user_1[user_1['used_ind'] == 1].shape[0])

# liked = 1 when the rating currently stored in the table exceeds the threshold
user_1['liked'] = (user_1['rating'] > threshold).astype(int)

# test_ind = 1 for the hidden (test) products
user_1['test_ind'] = user_1['product'].isin(product_ids_hidden).astype(int)

# rank the test products by their predicted rating, not by the true one
for product, predicted in zip(product_ids_hidden, predicted_ratings):
    user_1.loc[user_1['product'] == product, 'rating'] = predicted

# top N recommendations: everything not used for training, best rating first
user_1_top_n = user_1[user_1['used_ind'] == 0]
user_1_top_n = user_1_top_n.sort_values(by='rating', ascending=False)
user_1_top_n = user_1_top_n.head(N)

# number of test products that made it into the top N
print("Number of Items in Top N for User 1 that Were Used and Liked: ", user_1_top_n[user_1_top_n['test_ind'] == 1].shape[0])

print("\n\nTop N Recommendations for User 1")
display(user_1_top_n)

# precision@N, recall@N and F1@N; empty denominators yield 0 instead of inf/nan
hits = user_1_top_n['test_ind'].sum()
precision_at_N = hits / N
recall_at_N = hits / liked_items.shape[0] if liked_items.shape[0] else 0.0
if precision_at_N + recall_at_N == 0:
    f1_at_N = 0
else:
    f1_at_N = 2 * (precision_at_N * recall_at_N) / (precision_at_N + recall_at_N)

print(f"Precision@{N}: {precision_at_N:.4f}")
print(f"Recall@{N}: {recall_at_N:.4f}")
print(f"F1@{N}: {f1_at_N:.4f}")

# one-row summary of the three metrics
results = pd.DataFrame({'Precision@N': [precision_at_N], 'Recall@N': [recall_at_N], 'F1@N': [f1_at_N]})
results


In [None]:
# One row per user: the user id and the list of column positions whose
# ratings were hidden for that user. Built directly from the index array, so
# it works for any number of hidden ratings per user (not only 3).
hid = pd.DataFrame({
    'user': x.index,
    'products': np.asarray(hidden_ratings_ind).tolist(),
})
hid

In [None]:
def evaluate_topN_user(user_id, threshold, N):
    """Compute Precision@N, Recall@N and F1@N for one user.

    Reads the notebook-level tables `train_data`, `completed`, `x`, `hid` and
    `predic_matrix`.

    Parameters
    ----------
    user_id : label of the user in `x.index` / the 'user' columns.
    threshold : a rating strictly above this value counts as "liked".
    N : number of top-ranked items to keep as recommendations.

    Returns
    -------
    pandas.DataFrame with one row and the columns 'Precision@N', 'Recall@N'
    and 'F1@N'. Recall is 0.0 when the user has no liked items, and F1 is 0
    when precision and recall are both 0.
    """
    print(f"Evaluating User {user_id}")

    train_x_user = train_data[train_data['user'] == user_id]
    # explicit copy: the label columns below must not be written onto a slice
    # of the shared `completed` table
    user_data = completed[completed['user'] == user_id].copy()

    liked_items = user_data[user_data['rating'] > threshold]

    # products hidden for THIS user (column positions -> product ids)
    hidden_positions = hid[hid['user'] == user_id]['products'].values[0]
    product_names = x.columns[hidden_positions]
    product_ids_hidden = product_names.tolist()

    hidden_ratings = x.loc[user_id, product_names].values
    predicted_ratings = predic_matrix.loc[user_id, product_names].values

    hidden_ratings_df = pd.DataFrame({
        'product': product_ids_hidden,
        'hidden_rating': hidden_ratings,
        'predicted_rating': predicted_ratings
    })
    hidden_ratings_df['label'] = (hidden_ratings_df['hidden_rating'] > threshold).astype(int)

    # used_ind = 1 for interactions present in the training data, so they are
    # excluded from the recommendation list below
    user_data['used_ind'] = user_data['product'].isin(train_x_user['product'].tolist()).astype(int)
    user_data['liked'] = (user_data['rating'] > threshold).astype(int)
    user_data['test_ind'] = user_data['product'].isin(product_ids_hidden).astype(int)

    # rank the hidden products by their predicted rating, not the true one
    for product, predicted in zip(product_ids_hidden, predicted_ratings):
        user_data.loc[user_data['product'] == product, 'rating'] = predicted

    user_top_n = user_data[user_data['used_ind'] == 0].sort_values(by='rating', ascending=False).head(N)
    display(user_top_n)

    hits = user_top_n['test_ind'].sum()
    precision_at_N = hits / N
    # no liked items -> recall is reported as 0.0 instead of inf/nan
    n_liked = liked_items.shape[0]
    recall_at_N = hits / n_liked if n_liked else 0.0

    if precision_at_N + recall_at_N == 0:
        f1_at_N = 0
    else:
        f1_at_N = 2 * (precision_at_N * recall_at_N) / (precision_at_N + recall_at_N)

    results = pd.DataFrame({'Precision@N': [precision_at_N], 'Recall@N': [recall_at_N], 'F1@N': [f1_at_N]})
    return results


In [None]:
# Evaluate a single user with evaluate_topN_user; user_id, threshold and N
# can be changed freely.
single_user_args = {'user_id': 'A214JN9AJNSHCJ', 'threshold': 3.5, 'N': 1000}
results = evaluate_topN_user(**single_user_args)
results

#### Execute for All Users

In [None]:
# Evaluate every user and stack the one-row results into a single table.
user_ids = completed['user'].unique()
user_count = len(user_ids)
counter = 0

# Collect the per-user frames and concatenate once at the end: concatenating
# inside the loop re-copies the growing table on every iteration.
per_user_results = []
for counter, user in enumerate(user_ids, start=1):
    print(f"User {counter} of {user_count}")
    user_results = evaluate_topN_user(user_id=user, threshold=3, N=10000)
    print(user_results)
    per_user_results.append(user_results)

# pd.concat rejects an empty list, so fall back to an empty frame
results = pd.concat(per_user_results) if per_user_results else pd.DataFrame()

results

In [None]:
# Column-wise mean of the per-user metrics (one value per metric).
average_results = results.mean(axis=0)
average_results

In [None]:
# Recompute recall from the averaged precision and F1 only.
# F1 = 2 * P * R / (P + R)  =>  R = P * F1 / (2 * P - F1)
# (The harmonic mean of precision and F1 is not the recall.)
precision_avg = average_results['Precision@N']
f1_avg = average_results['F1@N']
recall_denominator = 2 * precision_avg - f1_avg
# a non-positive (or NaN) denominator has no valid solution; report 0.0 then
if recall_denominator > 0:
    average_results['Recall@N'] = (precision_avg * f1_avg) / recall_denominator
else:
    average_results['Recall@N'] = 0.0
average_results

In [None]:
# Pull the three averaged metrics out of average_results as separate values.
precision_at_N, recall_at_N, f1_at_N = (
    average_results[metric] for metric in ('Precision@N', 'Recall@N', 'F1@N')
)

In [None]:
# Persist the averaged top-N metrics as a one-row CSV.
topn_results_path = '/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/Results/UBCF_results_1_top10000.csv'
average_results = pd.DataFrame({
    'Precision@N': [precision_at_N],
    'Recall@N': [recall_at_N],
    'F1@N': [f1_at_N],
})
average_results.to_csv(topn_results_path, index=False)

## (2) Using Packages

In [None]:
## Using Packages for UBCF
import surprise
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split


In [None]:
# Reload the reviews (the first CSV column is used as the index) and rebuild
# the user-item matrix that is later converted to user / item / rating rows.
# Windows location, kept for reference:
# amz_data = pd.read_csv(r'C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\Data\set2_data_modelling.csv')
data_path = '/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/set2_data_modelling.csv'
amz_data = pd.read_csv(data_path, index_col=0)
display(amz_data.head())

# reviewers x products, 0 where there is no review
x = amz_data.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0)
print("\n\n\nUser-Item Matrix")
display(x.head())
print('Shape: ', x.shape)

In [None]:
# Import necessary libraries
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split

# Turn the wide user-item matrix `x` back into long (user, item, rating) rows.
ratings = x.stack().reset_index()
ratings.columns = ['user', 'item', 'rating']

# A 0 in the matrix means "no rating", so those rows are dropped.
has_rating = ratings['rating'].ne(0)
ratings = ratings.loc[has_rating]

# Reader declaring the 1-5 rating scale of the dataframe.
reader = Reader(rating_scale=(1, 5))

# Surprise dataset built from the ratings dataframe.
data = Dataset.load_from_df(ratings, reader)

# Hold out 25% of the ratings as the test set (fixed seed).
trainset, testset = train_test_split(data, random_state=2207, test_size=.25)

In [None]:
# User-based kNN with cosine similarity, using Surprise's KNNBasic.

# neighbourhood size
k = 40

# 'user_based': True -> similarities are computed between users
sim_options = dict(name='cosine', user_based=True)

# NOTE(review): confirm that KNNBasic actually uses the random_state keyword;
# it is passed through unchanged here.
algo = KNNBasic(k=k, sim_options=sim_options, verbose=True, random_state=2207)

# fit on the training split, then predict the held-out ratings
algo.fit(trainset)
predictions = algo.test(testset)

# MAE, MSE and RMSE on the test split, each rounded to 2 decimals
print("\nUser-based Model Test Set Results:")
mae_pack = accuracy.mae(predictions).round(2)
mse_pack = accuracy.mse(predictions).round(2)
rmse_pack = accuracy.rmse(predictions).round(2)

for label, value in (
    ("Mean Absolute Error (MAE)", mae_pack),
    ("Mean Squared Error (MSE)", mse_pack),
    ("Root Mean Squared Error (RMSE)", rmse_pack),
):
    print(f"{label}: {value}")

In [None]:
# Write the package-based metrics as a one-row CSV (relative path).
pack_metrics = {'MAE': mae_pack, 'MSE': mse_pack, 'RMSE': rmse_pack}
results = pd.DataFrame({name: [value.round(3)] for name, value in pack_metrics.items()})
results.to_csv("Data/Results/UBCF_results_2.csv", index=False)

***
## (3) Manual Process with Same Data Splits

In [None]:
%reset -f

# load libraries
import surprise
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the modelling dataset: one review per row.
# Windows location, kept for reference:
# amz_data = pd.read_csv(r'C:\Users\e1002902\Documents\GitHub Repository\Masters-Dissertation\Code\Data\set2_data_modelling.csv')
data_path = '/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/set2_data_modelling.csv'
amz_data = pd.read_csv(data_path)
display(amz_data.head(3))

# Summary of the raw review table; the per-user and per-product review
# counts are computed once and reused for the min/max lines.
reviews_per_user = amz_data.groupby('reviewerID')['asin'].count()
reviews_per_product = amz_data.groupby('asin')['reviewerID'].count()
n_rows, n_cols = amz_data.shape

print('Number of Rows: ', n_rows)
print('Number of Columns: ', n_cols)
print('Number of Unique Users: ', len(amz_data['reviewerID'].unique()))
print('Number of Unique Products: ', len(amz_data['asin'].unique()))
print('Fewest reviews by a reviewer:', reviews_per_user.min())
print('Most reviews by a reviewer:', reviews_per_user.max())
print("Fewest reviews per product:", reviews_per_product.min())
print("Most reviews per product:", reviews_per_product.max())


# User-item matrix ==============================================================
# Rows are reviewers, columns are products, cells hold the 'overall' rating;
# user/product pairs without a review are filled with 0.
x = amz_data.pivot_table(index='reviewerID', columns='asin', values='overall').fillna(0)
print("Shape: ", x.shape)

In [None]:
# Recreate the train/test split of the packages section (same test_size and
# random_state) so the manual model is scored on the same held-out ratings.
ratings = x.stack().reset_index()
ratings.columns = ['user', 'item', 'rating']
ratings = ratings[ratings['rating'] != 0]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings, reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=2207)

# test set as a table: column 0 = user, 1 = item, 2 = rating
testset_df = pd.DataFrame(testset)

# (user, item) label pairs of the test ratings
testset_tuples = [tuple(pair) for pair in testset_df[[0, 1]].to_numpy()]

# the same pairs as (row position, column position) in the matrix x
testset_indices = [
    (x.index.get_loc(user_label), x.columns.get_loc(item_label))
    for user_label, item_label in testset_tuples
]

# the full list is kept; only the first five pairs are shown
print("Testset Indices: ")
testset_indices[0:5]

In [None]:
# # create a copy of the original matrix to store hidden ratings
# x_hidden = x.copy()
# indices_tracker = []

# # loop through the testset indices to hide the rating (make 0) - update x_hidden
# for user_id in range(x_hidden.shape[0]):
#     for item_id in range(x_hidden.shape[1]):
#         if (user_id, item_id) in testset_indices:
#             x_hidden.iloc[user_id, item_id] = 0

# # save x_hidden to csv
# x_hidden.to_csv('/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/suprise_hidden_ratings_matrix.csv')

In [None]:
# Read the previously saved matrix with the test ratings hidden; the first
# CSV column becomes the row index.
hidden_matrix_path = '/Users/pavansingh/Library/CloudStorage/GoogleDrive-pavansingho23@gmail.com/My Drive/Portfolio/Masters-Dissertation/Code/Data/suprise_hidden_ratings_matrix.csv'
x_hidden = pd.read_csv(hidden_matrix_path, index_col=0)

In [None]:
# Pairwise cosine similarity between the rows of the hidden-ratings matrix
# (unrounded here); the result is a NumPy array.
sim_mat_cos = cosine_similarity(X=x_hidden)
print("Cosine Similarity Matrix")
sim_mat_cos

In [None]:
# Fill every unrated cell of the training matrix with a user-based kNN
# prediction: the similarity-weighted average of the ratings given to that
# product by the k users most similar to the target user.
#
# The user-user similarity matrix is indexed with *user* positions (the users
# who rated the product), and each product gets its own prediction.

# neighbourhood size
k = 40

train = x_hidden.to_numpy(dtype=float)
n_users, n_items = train.shape
user_sims = np.asarray(sim_mat_cos, dtype=float)

# for each product, the row positions of the users who rated it (computed once)
raters_by_item = [np.flatnonzero(train[:, item] > 0) for item in range(n_items)]

# mean of all observed training ratings; last-resort fallback
observed = train > 0
global_mean = train[observed].mean() if observed.any() else 0.0

predicted = train.copy()
for user_id in range(n_users):
    user_row = train[user_id]
    has_rating = user_row > 0
    # used when no rater of the product has any similarity to this user
    fallback = user_row[has_rating].mean() if has_rating.any() else global_mean

    for product_id in np.flatnonzero(user_row == 0):
        raters = raters_by_item[product_id]
        # similarity between this user and the users who rated this product
        similarity_i_j = user_sims[user_id, raters]

        # keep the k most similar raters
        nearest = np.argsort(similarity_i_j)[::-1][:k]
        similarity_i_j = similarity_i_j[nearest]
        ratings = train[raters[nearest], product_id]

        weight_total = np.sum(np.abs(similarity_i_j))
        if weight_total > 0:
            predicted[user_id, product_id] = np.sum(ratings * similarity_i_j) / weight_total
        else:
            predicted[user_id, product_id] = fallback

# same labels as the training matrix; observed ratings are kept unchanged
predic_matrix = pd.DataFrame(predicted, index=x_hidden.index, columns=x_hidden.columns)

# see updated matrix with predicted ratings
print("Predicted Ratings for All Users")
display(predic_matrix)

In [None]:
# Predicted rating at every (row, column) position of the test set.
predicted_ratings = [
    predic_matrix.iloc[user_pos, item_pos]
    for user_pos, item_pos in testset_indices
]

print("Predicted Ratings:")
print(predicted_ratings)


# True ratings of the same test interactions (column 2 of the test table).
print("\nActual Ratings:")
actual_ratings = testset_df[2].tolist()
print(actual_ratings)

In [None]:
# MAE, MSE and RMSE of the manual predictions on the Surprise test split,
# once with scikit-learn and once by hand.
from sklearn.metrics import mean_absolute_error, mean_squared_error

metric_labels = (
    "Mean Absolute Error (MAE)",
    "Mean Squared Error (MSE)",
    "Root Mean Squared Error (RMSE)",
)

print("Using sklearn")
mae = mean_absolute_error(actual_ratings, predicted_ratings)
mse = mean_squared_error(actual_ratings, predicted_ratings)
rmse = np.sqrt(mse)

# The built-in round() works for both plain Python floats and NumPy scalars,
# whereas the .round() method exists only on NumPy scalars.
for label, value in zip(metric_labels, (mae, mse, rmse)):
    print(f"{label}: {round(value, 2)}")


# Manually
print("\n\nManually")

# same metrics from the residuals between actual and predicted ratings
residuals = np.array(actual_ratings) - np.array(predicted_ratings)
mae = np.mean(np.abs(residuals))  # Mean Absolute Error (MAE)
mse = np.mean(residuals ** 2)  # Mean Squared Error (MSE)
rmse = np.sqrt(mse)  # Root Mean Squared Error (RMSE)

for label, value in zip(metric_labels, (mae, mse, rmse)):
    print(f"{label}: {round(value, 2)}")

In [None]:
# Write the manual-model metrics (rounded to 3 decimals) as a one-row CSV
# (path is relative to the working directory).
manual_metrics = {'MAE': mae, 'MSE': mse, 'RMSE': rmse}
results = pd.DataFrame({name: [value.round(3)] for name, value in manual_metrics.items()})
results.to_csv("Data/Results/UBCF_results_3.csv", index=False)
