In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
def rmse(y_hat, y_actual):
    return np.sqrt(mean_squared_error(y_actual, y_hat))

In [3]:
def mae(y_hat, y_actual):
    return mean_absolute_error(y_actual, y_hat)

In [4]:
ratings = pd.read_csv('./dataset/train.txt', names=['movieId', 'userId', 'rating'])
mean = ratings.groupby(by="userId",as_index=False)['rating'].mean()
rating_avg = pd.merge(ratings,mean,on='userId')

In [5]:
rating_avg.rename(columns={'rating_x': 'ratings', 'rating_y': 'mean'}, inplace=True)
rating_avg['normalized'] = rating_avg['ratings'] - rating_avg['mean']
# print(rating_avg.head())
user_movie = pd.pivot_table(rating_avg, values='normalized', index='userId', columns='movieId')
user_movie.head()

movieId,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,1.096154,0.096154,,,,,,,,,...,,,,,,,,,,
79,,,,,,,,,,,...,,,,,,,,,,
199,,,,,,,,,,0.056338,...,,,,,,,,,,
481,,,,,,,,,,0.648649,...,,,,,,,,,,
769,,,,,,,,,,,...,,,,,,,,,,


In [6]:
user_movie.fillna(0, inplace = True)

In [None]:
weights = np.dot(user_movie, user_movie.T) / (np.linalg.norm(user_movie) * np.linalg.norm(user_movie.T))
np.fill_diagonal(weights, 1)
weight_matrix = pd.DataFrame(weights, index = user_movie.index, columns = user_movie.index)

In [8]:
def get_nearest_neighbours(k, userId, movieId):
    if k == 0:
      return weight_matrix.loc[userId].to_numpy(), user_movie[movieId].to_numpy()
    else:
      sorted_users = weight_matrix.loc[userId].sort_values(ascending=False).iloc[1:k+1]
      k_users = sorted_users.index.to_numpy()
      k_ratings = np.array([user_movie[movieId].loc[user] for user in k_users])
      return k_users, k_ratings

In [3]:
testing_data = pd.read_csv('./dataset/test.txt', names=['movieId', 'userId', 'rating'])
y_actual = np.array(testing_data['rating'])   

In [22]:
def init_testing(neighbors = 0): 
    y_predicted = []
    for _, r in testing_data.iterrows():
      test_user_id = int(r['userId'])
      test_movie_id = int(r['movieId'])
      v_a_bar = rating_avg.loc[rating_avg['userId'] == test_user_id]['mean'].unique()[0]
      active_row, all_ratings = get_nearest_neighbours(neighbors, test_user_id, test_movie_id)
      weight_summation = np.sum(active_row)
      if int(weight_summation) == 0:
        y_predicted.append(v_a_bar)
        continue
      kapa = 1 / weight_summation
      predicted_rating = v_a_bar + (kapa * (np.dot(active_row, all_ratings.T)))
      y_predicted.append(predicted_rating)
    return y_predicted

In [None]:
for i in [0, 10, 50, 150, 500]:
  num = 'all' if i == 0 else i
  y_predicted = init_testing(i) 
  print(f'\nBased on {num} nearest users: ')
  print('RMSE', rmse(y_predicted, y_actual))
  print('MAE', mae(y_predicted, y_actual))


Based on 500 nearest users: 
RMSE 0.9040858861520865
 MAE 0.7082903817830285
