In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [4]:
# Read the four CSV inputs in order; the sequence of paths below must line up
# with the sequence of names on the left-hand side of the assignment.
movies, train_ratings, test_ratings, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/movies/movies.csv',
        '../data/movies/train.csv',
        '../data/movies/test.csv',
        '../data/movies/users.csv',
    )
)

In [5]:
# Give both rating frames the same three-column schema so later cells can
# address the fields by name regardless of the headers in the CSV files.
for ratings_frame in (train_ratings, test_ratings):
    ratings_frame.columns = ['uID', 'mID', 'rating']

# Size the user/movie axes from the largest ID seen in either split, so that
# every ID occurring in train or test has a slot.
n_users = max(train_ratings['uID'].max(), test_ratings['uID'].max())
n_movies = max(train_ratings['mID'].max(), test_ratings['mID'].max())

In [6]:
# Create a user-item interaction matrix (rows: users, columns: movies).
# uID / mID are shifted by -1 to turn them into array positions.
# NOTE: every (user, movie) cell without a training rating stays 0, and the
# NMF fit below treats those zeros as observed values rather than as missing
# data, so reconstructed ratings for unseen pairs are pulled towards 0.
# Keep only the last rating per (user, movie) pair so the vectorised
# assignment reproduces the "last write wins" result of a row-by-row fill.
train_unique = train_ratings.drop_duplicates(subset=['uID', 'mID'], keep='last')
train_rows = train_unique['uID'].to_numpy() - 1
train_cols = train_unique['mID'].to_numpy() - 1

user_item_matrix = np.zeros((n_users, n_movies))
# Single fancy-index assignment instead of a Python loop over itertuples().
user_item_matrix[train_rows, train_cols] = train_unique['rating'].to_numpy()

user_item_matrix = csr_matrix(user_item_matrix)

# Apply NMF with increased max_iter to improve convergence
n_components = 20
nmf_model = NMF(n_components=n_components, init='random', random_state=42, max_iter=1000)
W = nmf_model.fit_transform(user_item_matrix)  # one row of factors per user
H = nmf_model.components_                      # one column of factors per movie

# Predict the missing ratings: dense reconstruction of the full matrix
predicted_ratings = np.dot(W, H)

# Extract the predicted ratings for the test set with one vectorised lookup
# (replaces DataFrame.apply(axis=1), which calls a Python lambda per row).
# astype(int) mirrors the int() cast the per-row lookup applied to each ID.
test_rows = test_ratings['uID'].to_numpy().astype(int) - 1
test_cols = test_ratings['mID'].to_numpy().astype(int) - 1
test_ratings['predicted_rating'] = predicted_ratings[test_rows, test_cols]

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_ratings.rating, test_ratings.predicted_rating))
print(f'RMSE: {rmse}')

RMSE: 2.850333668998515
