Dataset: https://grouplens.org/datasets/movielens/1m/ 

In [8]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import warnings

# Load the MovieLens ratings file: one rating per row, fields separated by '::',
# no header line (column names are supplied here).
data_df = pd.read_csv('./ratings.dat', sep='::', names=["UserID", "MovieID", "Rating", "Timestamp"], engine='python')

# First, generate dictionaries for mapping old id to new id for users and movies.
# New ids are contiguous (0..n-1) and follow the order of first appearance in data_df.
unique_MovieID = data_df['MovieID'].unique()
unique_UserID = data_df['UserID'].unique()
user_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(unique_UserID)}
movie_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(unique_MovieID)}

# Then, use the generated dictionaries to reindex UserID and MovieID in data_df.
# Series.map builds fresh columns instead of writing element-by-element into the
# arrays returned by `.values` (which may be copies or read-only depending on the
# pandas version). The remapped movie ids are stored under 'MovieID'; a misspelled
# 'movieID' key would add a new column and leave the real one untouched.
data_df['UserID'] = data_df['UserID'].map(user_old2new_id_dict)
data_df['MovieID'] = data_df['MovieID'].map(movie_old2new_id_dict)

# Remapped id arrays, kept under their original names for any later cell that reads them.
user_list = data_df['UserID'].values
movie_list = data_df['MovieID'].values

# Split the ratings into two disjoint sets: roughly 70% for training and 30% for testing.
# Every row is assigned independently, so the sizes are approximate; the boolean mask
# and its negation guarantee that no row lands in both sets.
# A fixed seed makes the split (and every number derived from it) repeat on re-run.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7
split_rng = np.random.default_rng(SPLIT_SEED)

train_index = split_rng.random(len(data_df)) <= TRAIN_FRACTION
train_df = data_df[train_index]
test_df = data_df[~train_index]

# Build dense (user x movie) rating matrices for the train and test splits.
# Dimensions are taken from the full dataset so both matrices share one shape;
# positions without a rating in a given split hold 0.
num_user = data_df['UserID'].unique().size
num_movie = data_df['MovieID'].unique().size
rating_shape = (num_user, num_movie)

train_coords = (train_df['UserID'].values, train_df['MovieID'].values)
train_sparse = coo_matrix((train_df['Rating'].values, train_coords), shape=rating_shape)
train_mat = train_sparse.astype(float).toarray()

test_coords = (test_df['UserID'].values, test_df['MovieID'].values)
test_sparse = coo_matrix((test_df['Rating'].values, test_coords), shape=rating_shape)
test_mat = test_sparse.astype(float).toarray()

In [9]:
# Baseline Estimation Model



In [10]:
# Calculate prediction_mat with the baseline estimation recommendation algorithm:
#     prediction[u, i] = overall mean + user deviation[u] + item deviation[i]
# A 0 in train_mat means "no rating" and is excluded from every mean.
rated_mask = train_mat != 0

# overall mean rating over all observed training ratings
overall_mean_rating = np.mean(train_mat[rated_mask])

# Row/column sums and rating counts. Unrated cells are 0, so they add nothing
# to the sums; the counts come from the mask.
user_sums = train_mat.sum(axis=1)
item_sums = train_mat.sum(axis=0)
user_counts = rated_mask.sum(axis=1)
item_counts = rated_mask.sum(axis=0)

# Deviation of each user's mean rating from the overall mean.
# Users with no training ratings keep a deviation of 0.
user_means = np.zeros(num_user)
users_with_ratings = user_counts > 0
user_means[users_with_ratings] = (
    user_sums[users_with_ratings] / user_counts[users_with_ratings] - overall_mean_rating
)

# Deviation of each movie's mean rating from the overall mean.
# Movies with no training ratings keep a deviation of 0.
item_means = np.zeros(num_movie)
items_with_ratings = item_counts > 0
item_means[items_with_ratings] = (
    item_sums[items_with_ratings] / item_counts[items_with_ratings] - overall_mean_rating
)

# Baseline estimate for every user-movie pair via broadcasting:
# a (num_user, 1) column plus a (1, num_movie) row yields (num_user, num_movie).
prediction_mat = overall_mean_rating + user_means[:, np.newaxis] + item_means[np.newaxis, :]


In [11]:
#import 
from sklearn.metrics import mean_squared_error
# Calculate and print the RMSE between prediction_mat and test_mat.
# Only positions that hold a rating in the test set are scored (0 means "no rating").
non_zero_mask = test_mat != 0

# Boolean indexing flattens both matrices to the rated positions, in matching order
prediction_flat = prediction_mat[non_zero_mask]
test_flat = test_mat[non_zero_mask]

# RMSE as the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is deprecated and absent from newer scikit-learn releases,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(test_flat, prediction_flat))

# Print RMSE
print(f"Root Mean Square Error (RMSE) for Baseline Estimate Model: {rmse}")


Root Mean Square Error (RMSE) for Baseline Estimate Model: 0.935792164386952


