In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

In [2]:
# Read the four CSV inputs: the user and movie lookup tables, then the rating splits.
MV_users, MV_movies = pd.read_csv('data/users.csv'), pd.read_csv('data/movies.csv')
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

In [4]:
# Map each raw ID to its row position in the corresponding lookup table; these
# positions become the matrix indices (users -> rows, movies -> columns).
users = list(MV_users['uID'])
movies = list(MV_movies['mID'])

mv_idx_dict = dict(zip(MV_movies.mID, range(len(MV_movies))))
usr_idx_dict = dict(zip(MV_users.uID, range(len(MV_users))))

# Translate every training rating into (row, column) coordinates.
# A KeyError here means train references an ID that is absent from the lookup tables.
mv_idxs = [mv_idx_dict[x] for x in train.mID]
usr_idxs = [usr_idx_dict[x] for x in train.uID]

# Build the dense user-by-movie rating matrix; cells with no training rating stay 0.
# toarray() already returns a new ndarray, so no extra np.array() copy is needed.
rating_train = list(train.rating)
Mr = coo_matrix((rating_train, (usr_idxs, mv_idxs)), shape=(len(users), len(movies))).toarray()

In [5]:
# Display the training matrix dimensions as (number of users, number of movies).
Mr.shape    

(6040, 3883)

In [6]:
# Fraction of cells in Mr that hold a nonzero value. (The printed label says
# "Sparsity", but the number is the share of filled cells, not of empty ones.)
filled_cells = np.count_nonzero(Mr)
sparsity = filled_cells / float(Mr.size)
print("Sparsity: %.6f" % sparsity)

Sparsity: 0.029853


In [7]:
# Factorize the user-by-movie rating matrix into W (users x 20) and H (20 x movies).
# NOTE(review): Mr stores 0 for every (user, movie) pair without a training rating,
# and the whole dense matrix is passed to the fit, so those zeros are fitted as if
# they were real ratings.
nmf_settings = {'n_components': 20, 'init': 'random', 'random_state': 0}
model = NMF(**nmf_settings)
W = model.fit_transform(Mr)  # one row of 20 component weights per user
H = model.components_        # one column of 20 component weights per movie

In [8]:
# Show the dimensions of the user factors (W) followed by the movie factors (H).
for factor in (W, H):
    print(factor.shape)

(6040, 20)
(20, 3883)


In [9]:
# Place the held-out test ratings into a dense matrix laid out like the training one
# (same shape, same ID-to-index maps), so both can be indexed with the same coordinates.
ind_movie_test = [mv_idx_dict[x] for x in test.mID]
ind_user_test = [usr_idx_dict[x] for x in test.uID]
rating_test = list(test.rating)
# toarray() already yields a new ndarray; wrapping it in np.array() only added a copy.
Mr_test = coo_matrix((rating_test, (ind_user_test, ind_movie_test)), shape=(len(users), len(movies))).toarray()

In [10]:
# Display the test matrix dimensions; they should match the training matrix.
Mr_test.shape

(6040, 3883)

In [11]:
# Report the filled fraction of the test matrix, then rebuild the full matrix from
# the learned factors to obtain a predicted score for every (user, movie) pair.
n_test_ratings = np.count_nonzero(Mr_test)
sparsity_test = n_test_ratings / float(Mr_test.size)
print("Sparsity: %.6f" % sparsity_test)
Mr_pred = W.dot(H)
print(Mr_pred.shape)

Sparsity: 0.012794
(6040, 3883)


In [12]:
# Score only the cells that hold a test rating (nonzero in Mr_test); the remaining
# cells are unrated placeholders and must not count toward the error.
# The index tuple is computed once and reused; indexing with it already gives 1-D
# arrays, so no flatten() is required.
test_cells = Mr_test.nonzero()
rmse = np.sqrt(mean_squared_error(Mr_test[test_cells], Mr_pred[test_cells]))
print(rmse)

2.861970909347181


Discuss the results and why they did not work well compared to simple baseline or similarity-based methods we’ve done in Module 3. Can you suggest a way(s) to fix it?

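*   Matrix factorization is difficult on this dataset because the data is very sparse: only about 3% of the training matrix holds a rating. The unrated entries are stored as 0 and NMF fits them as if they were real ratings, which pulls the reconstructed values toward 0.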
*   The RMSE turned out to be around 2.86, which is abysmal compared to the recommender system from Module 3 
*   Changing n_components or using a different loss function might produce better results
*   KL (Kullback-Leibler) divergence would be a solid choice of loss function because the matrix contains many zeros, although it is also sensitive to how sparse the matrix is.
