In [1]:
import numpy as np
import pandas as pd
import pickle
import matrix_factorization_utilities

In [2]:
# Load user ratings (a running list of individual ratings)
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix with one row per
# user_id and one column per movie_id. When the same user/movie pair appears
# more than once, the highest rating is kept.
# The aggregation is given by name ('max') instead of as the np.max callable:
# recent pandas versions warn when handed NumPy reduction callables and map
# them to the same built-in aggregation anyway, so the string is the
# forward-compatible way to request identical behavior.
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

In [3]:
# Preview the first five users of the user x movie ratings matrix
ratings_df.head(n=5)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0


In [4]:
# Center the ratings around their mean. The helper hands back both the
# centered matrix and the means it used, which are needed later to put
# predictions back on the original rating scale.
(normalized_ratings, means) = matrix_factorization_utilities.normalize_ratings(
    ratings_df.values
)
normalized_ratings

array([[-0.61111111,         nan,         nan, ...,         nan,
                nan,         nan],
       [ 0.38888889,  0.38888889,         nan, ...,         nan,
                nan, -0.11111111],
       [-0.61111111, -0.61111111,  0.18181818, ...,         nan,
                nan,         nan],
       ...,
       [        nan,         nan,         nan, ...,  0.95348837,
         0.24390244,  0.88888889],
       [        nan,         nan,         nan, ...,  0.95348837,
        -0.75609756, -1.11111111],
       [        nan,         nan,         nan, ..., -1.04651163,
        -0.75609756,  0.88888889]])

In [5]:
# Mean ratings returned by normalize_ratings; these are added back to the
# predictions later to undo the normalization.
means

array([4.61111111, 4.61111111, 4.81818182, 4.42857143, 4.78571429,
       4.9       , 4.        , 4.6       , 4.7       , 4.88235294,
       4.42857143, 4.8       , 4.64285714, 4.52173913, 4.3       ,
       4.71428571, 2.83333333, 3.16666667, 3.94117647, 3.81081081,
       4.07317073, 4.31578947, 3.76190476, 4.68421053, 4.58333333,
       4.625     , 4.30769231, 4.53333333, 3.65517241, 3.36      ,
       3.175     , 4.04651163, 3.75609756, 4.11111111])

In [6]:
# Factor the normalized ratings into two matrices of latent features:
# U and M, whose product approximates the ratings.
# num_features and regularization_amount are the tunable knobs of this step.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    normalized_ratings,
    num_features=11,
    regularization_amount=1.1,
)

Optimization terminated successfully.
         Current function value: 105.620378
         Iterations: 520
         Function evaluations: 776
         Gradient evaluations: 776


In [7]:
# Multiply the two factor matrices to get a predicted (still normalized)
# rating for every user/movie pair
predicted_ratings = U @ M
predicted_ratings

array([[-3.42720903e-01, -3.14565723e-01, -1.61245491e-01, ...,
         3.28455134e-01, -3.12700287e-01, -2.46638697e-01],
       [ 2.50301570e-01,  2.25572272e-01,  1.70867519e-01, ...,
         2.32666971e-01, -3.72564492e-04, -4.20943498e-02],
       [-4.23853197e-01, -4.11414251e-01,  1.04962864e-01, ...,
        -4.82965244e-01, -1.01628739e+00,  4.92188998e-01],
       ...,
       [-2.99688966e-02,  8.01801250e-02, -2.13967755e-01, ...,
         8.48127965e-01,  2.72114826e-01,  7.71856308e-01],
       [ 2.03218282e-01, -1.34835910e-01,  1.63999470e-01, ...,
         7.70452638e-01, -6.72092467e-01, -9.17883856e-01],
       [ 8.84836294e-02, -7.04439090e-01,  3.10212463e-01, ...,
        -1.04366343e+00, -7.86083327e-01,  8.42776823e-01]])

In [8]:
# Add back in the mean ratings for each product to de-normalize the predicted results.
# The sum is rebuilt from U and M rather than from the current value of
# `predicted_ratings`, so re-running this cell cannot add the means a second
# time; the result is the same no matter how often the cell is executed.
predicted_ratings = np.matmul(U, M) + means
predicted_ratings

array([[4.26839021, 4.29654539, 4.65693633, ..., 4.37496676, 3.44339727,
        3.86447241],
       [4.86141268, 4.83668338, 4.98904934, ..., 4.2791786 , 3.755725  ,
        4.06901676],
       [4.18725791, 4.19969686, 4.92314468, ..., 3.56354638, 2.73981017,
        4.60330011],
       ...,
       [4.58114221, 4.69129124, 4.60421406, ..., 4.89463959, 4.02821239,
        4.88296742],
       [4.81432939, 4.4762752 , 4.98218129, ..., 4.81696427, 3.08400509,
        3.19322725],
       [4.69959474, 3.90667202, 5.12839428, ..., 3.0028482 , 2.97001423,
        4.95388793]])

In [9]:
# Save features and predicted ratings to files for later use.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes, instead of being left open for the life of the
# kernel.
# NOTE: these are pickle files; only load them back from a trusted location.
for output_path, payload in (
    ("user_features_first.dat", U),
    ("product_features_first.dat", M),
    ("predicted_ratings_first.dat", predicted_ratings),
    ("means.dat", means),
):
    with open(output_path, "wb") as output_file:
        pickle.dump(payload, output_file)