In [1]:
import numpy as np
import pandas as pd
import matrix_factorization_utilities

In [2]:
# Load user ratings
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

In [3]:
# Convert the running list of user ratings into a matrix
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='movie_id', aggfunc=np.max)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='movie_id', aggfunc=np.max)

In [4]:
ratings_testing_df.head()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,,,...,,4.0,,,,,,,,
2,,5.0,,,,,,,,,...,,,,,,,3.0,,,
3,,4.0,,,,,,,,,...,,,,,,,,,,
4,5.0,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,3.0,,,


In [5]:
ratings_training_df.head()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,4.0,,...,,,,4.0,,,,,,
2,5.0,,,,,,,,,,...,,,,,,,,,,4.0
3,4.0,,5.0,,,,,,,,...,,,,,,,,,,
4,,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,,2.0,5.0,5.0


In [6]:
# Apply matrix factorization to find the latent features
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_training_df.values,
                                                                    num_features=11,
                                                                    regularization_amount=1.1)

Optimization terminated successfully.
         Current function value: 315.538580
         Iterations: 925
         Function evaluations: 1384
         Gradient evaluations: 1384


In [7]:
# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

In [8]:
predicted_ratings

array([[3.91681891, 3.96473429, 4.42144255, ..., 3.00133447, 3.95892947,
        4.33805727],
       [4.4892842 , 4.07366296, 4.55142011, ..., 3.5080002 , 4.89237554,
        4.02297983],
       [4.04970203, 3.83420587, 4.54811881, ..., 2.52469697, 4.15501385,
        4.12065729],
       ...,
       [3.59429996, 3.5501221 , 3.97673753, ..., 4.97994534, 4.25190545,
        4.556711  ],
       [1.81938674, 1.85115072, 2.01821412, ..., 4.65601928, 2.99133474,
        2.99448084],
       [1.98890161, 1.51207129, 1.84546903, ..., 3.01078199, 2.9003933 ,
        1.93889992]])

In [9]:
# Measure RMSE
rmse_training = matrix_factorization_utilities.RMSE(ratings_training_df.values,
                                                    predicted_ratings)
rmse_testing = matrix_factorization_utilities.RMSE(ratings_testing_df.values,
                                                   predicted_ratings)

In [10]:
print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))

Training RMSE: 0.24952551170485054
Testing RMSE: 1.2096503644962215
