In [1]:
import pandas as pd
from scipy.sparse import csr_matrix

# Load the dataset
ratings = pd.read_csv('/home1/chenpaul/EE557/MovieLens/rating.csv')


In [2]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Create the training set user-item matrix
train_data_matrix = csr_matrix((train_data['rating'], (train_data['userId'], train_data['movieId'])))

# Create the test set user-item matrix
test_data_matrix = csr_matrix((test_data['rating'], (test_data['userId'], test_data['movieId'])))

# Get the unique users in the train set
unique_users_train = train_data['userId'].unique()
print(f"Unique users in the train set: {len(unique_users_train)}")

# Get the unique users in the test set
unique_users_test = test_data['userId'].unique()
print(f"Unique users in the test set: {len(unique_users_test)}")

# Print the actual user IDs
print("User IDs in the train set:")
print(unique_users_train)
print("\nUser IDs in the test set:")
print(unique_users_test)

Unique users in the train set: 138493
Unique users in the test set: 138339
User IDs in the train set:
[50069 97443  3743 ... 13245 44533 40869]

User IDs in the test set:
[122270  49018  89527 ...   6149  75769 138204]


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
# Function to calculate RMSE
def calculate_rmse(predictions, user_index, test_matrix, test_movies_indices):
    # predictions = np.array(predictions)[test_movies_indices]
    # Ground truth ratings for the user in the test set
    ground_truth_ratings = test_matrix[user_index, test_movies_indices].toarray()[0]
    # Calculate RMSE
    rmse_value = sqrt(mean_squared_error(ground_truth_ratings, predictions))

    return rmse_value

user-based collaborative filtering

In [31]:
def predict_single_user_ratings(user_index, train_matrix, test_matrix):
    # Identify the indices of movies that the user has rated in the test set
    test_movies_indices = test_matrix[user_index].nonzero()[1]
    
    # Initialize predictions array with zeros
    predictions = np.zeros(test_matrix.shape[1])
    
    # Initialize a list to store similarities for later use
    user_similarities = []
    
    # For each movie that the user has rated in the test set
    for movie_idx in test_movies_indices:
        # Find all users who have rated this movie
        users_who_rated_movie = train_matrix[:, movie_idx].nonzero()[0]
        # If no other users have rated this movie, we skip it
        if users_who_rated_movie.size == 0:
            continue
        # Compute cosine similarity only between the user and the users who have rated this movie
        user_similarity = cosine_similarity(train_matrix[user_index, :],
                                            train_matrix[users_who_rated_movie, :]).flatten()
        user_similarities.append(user_similarity)
        
        # Get the ratings given by all users to this movie
        movie_ratings = train_matrix[users_who_rated_movie, movie_idx].toarray().flatten()
        
        # Compute the predicted rating as a weighted average where weights are the cosine similarities
        numerator = np.dot(movie_ratings, user_similarity)
        denominator = np.sum(np.abs(user_similarity)) + 1e-10  # Added a small number to avoid division by zero
        
        # Store the predicted rating
        predictions[movie_idx] = numerator / denominator
    
    # In this context, we return only the predictions for the movies the user has rated in the test set
    return predictions[test_movies_indices], test_movies_indices, user_similarities

In [32]:
for i in range(5):
    user_index = unique_users_test[i]
    predictions, test_movies_indices, user_similarities = predict_single_user_ratings(user_index, train_data_matrix, test_data_matrix)
    user_rmse = calculate_rmse(predictions, user_index, test_data_matrix, test_movies_indices)

    print(f'RMSE for user {user_index}: {user_rmse}')

RMSE for user 122270: 0.722712485880963
RMSE for user 49018: 1.6313820086221362
RMSE for user 89527: 0.8427505829969371
RMSE for user 106704: 0.8258437295243516
RMSE for user 47791: 1.7818391114192897


Item-based collaborative filtering

In [29]:
def predict_item_ratings_for_user(user_index, train_matrix, test_matrix, default_rating=0):
    test_items_indices = test_matrix[:,user_index].nonzero()[0]
    predictions = np.zeros(len(test_items_indices))
    # predictions = np.full(len(test_items_indices), default_rating)  # Fill with default_rating
    # print (test_items_indices)
    for idx, item_index in enumerate(test_items_indices):
        users_who_rated_item = train_matrix[item_index, :].nonzero()[1]
        # print("item index", item_index)
        # print("users_who_rated_item", users_who_rated_item)

        if len(users_who_rated_item) == 0:
            continue

        item_similarity = cosine_similarity(train_matrix[item_index, :].reshape(1, -1), train_matrix).flatten()
        # print (item_similarity)
        numerator = np.dot(item_similarity, train_matrix[:, item_index].toarray().flatten())
        denominator = np.sum(np.abs(item_similarity))
        # print (numerator, denominator)
        # if denominator != 0:
        predictions[idx] = numerator / denominator
        # print(idx, predictions[idx])

    return predictions, test_items_indices


In [30]:
for i in range(5):
    user_index = unique_users_test[i]
    predictions, test_movies_indices = predict_item_ratings_for_user(user_index, train_data_matrix.T, test_data_matrix.T)
    # print(predictions)
    user_rmse = calculate_rmse(predictions, user_index, train_data_matrix, test_movies_indices)

    print(f'RMSE for user {user_index}: {user_rmse}')

RMSE for user 122270: 0.14352068663892514
RMSE for user 49018: 0.13947555259321676
RMSE for user 89527: 0.15071550077281856
RMSE for user 106704: 0.1346243672303745
RMSE for user 47791: 0.1433481878133586
