In [5]:
# creating test and train data

In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import warnings

data_df = pd.read_csv('./ratings.dat', sep='::', names=["UserID", "MovieID", "Rating", "Timestamp"], engine='python')

# First, generate dictionaries for mapping old id to new id for users and movies
unique_MovieID = data_df['MovieID'].unique()
unique_UserID = data_df['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    j += 1
j = 0
movie_old2new_id_dict = dict()
for i in unique_MovieID:
    movie_old2new_id_dict[i] = j
    j += 1

# Then, use the generated dictionaries to reindex UserID and MovieID in the data_df
user_list = data_df['UserID'].values
movie_list = data_df['MovieID'].values
for j in range(len(data_df)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    movie_list[j] = movie_old2new_id_dict[movie_list[j]]
data_df['UserID'] = user_list
data_df['movieID'] = movie_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(data_df)) <= 0.7
train_df = data_df[train_index]
test_df = data_df[~train_index]

# generate train_mat and test_mat
num_user = len(data_df['UserID'].unique())
num_movie = len(data_df['MovieID'].unique())

train_mat = coo_matrix((train_df['Rating'].values, (train_df['UserID'].values, train_df['MovieID'].values)), shape=(num_user, num_movie)).astype(float).toarray()
test_mat = coo_matrix((test_df['Rating'].values, (test_df['UserID'].values, test_df['MovieID'].values)), shape=(num_user, num_movie)).astype(float).toarray()

In [7]:
# User-user Collaborative Filtering with Pearson Similarity

In [8]:
# import
import numpy as np
from scipy.stats import pearsonr

In [9]:
# mean rating for each user
#mean_ratings = np.nanmean(train_mat, axis=1)
mean_all_users = np.mean(train_mat[train_mat!=0])

# pearson correlation coefficients for user similarities
pearson_coefficient_similarity = np.corrcoef(train_mat)

prediction_mat = np.zeros_like(train_mat)
all_users = train_mat.shape[0]

mean_final = []

for user in range(all_users):
    if len(train_mat[i, train_mat[i] != 0]) > 0:
        mean_final.append(np.mean(train_mat[user, train_mat[user] != 0]))
    else:
        mean_final.append(mean_all_users)
        
for index in range(all_users):
    numerator = 0
    denominator = 0 
    similar_users = np.argsort(pearson_coefficient_similarity[index])[::-1]
    five_similar_users = similar_users[1:6]
    
    for i in range(train_mat.shape[1]):
        if train_mat[index, i] == 0: # if item for user is 0, then calculate
            for v in five_similar_users:
                if train_mat[v, i] != 0:
                    # calculate numerator and denominator
                    numerator += pearson_coefficient_similarity[index, v] * (train_mat[v, i] - mean_final[v])
                    denominator += np.abs(pearson_coefficient_similarity[index, v])
            if denominator != 0:
                prediction_mat[index, i] = mean_final[index] + (numerator / denominator)
            else:
                prediction_mat[index, i] = mean_final[index]



In [10]:
print(pearson_coefficient_similarity)

[[ 1.          0.06844549  0.12182549 ... -0.00522354  0.1164999
   0.11472079]
 [ 0.06844549  1.          0.10488302 ...  0.03471845  0.02423509
   0.10772135]
 [ 0.12182549  0.10488302  1.         ...  0.05403576  0.0595124
   0.06406599]
 ...
 [-0.00522354  0.03471845  0.05403576 ...  1.          0.08081921
   0.0324779 ]
 [ 0.1164999   0.02423509  0.0595124  ...  0.08081921  1.
   0.11056206]
 [ 0.11472079  0.10772135  0.06406599 ...  0.0324779   0.11056206
   1.        ]]


In [11]:
mean_squared_diff = np.mean((prediction_mat[test_mat != 0] - test_mat[test_mat != 0]) ** 2)

# calculate RMSE
rmse = np.sqrt(mean_squared_diff)

print("RMSE:", rmse)

RMSE: 1.0284831618203052
