In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import load_data
import clean_data
import custom_matrix_factorization

In [2]:
# read small (100k) ratings into a pandas data frame
ratings = load_data.load_small_ratings()

In [3]:
print(ratings.head())
print(ratings.shape)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
(100836, 4)


In [4]:
n_users = ratings.userId.unique().shape[0]
print(n_users)

610


In [5]:
n_movies = ratings.movieId.unique().shape[0]
print(n_movies)

9724


In [6]:
# create the complete matrix (as Numpy array) of user x movie ratings
complete_ratings_matrix = clean_data.complete_ratings_matrix(ratings)
print(type(complete_ratings_matrix))
print(complete_ratings_matrix)
print(complete_ratings_matrix.shape)

<class 'numpy.ndarray'>
[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]
(610, 9724)


In [11]:
# calculate user similarity
user_similarity = pairwise_distances(complete_ratings_matrix, metric='cosine')
item_similarity = pairwise_distances(complete_ratings_matrix.T, metric='cosine')
print(user_similarity)
print(item_similarity)

[[0.         0.97271713 0.94027974 ... 0.70890263 0.90642807 0.85467919]
 [0.97271713 0.         1.         ... 0.95378905 0.9724346  0.89757325]
 [0.94027974 1.         0.         ... 0.97887154 1.         0.96788125]
 ...
 [0.70890263 0.95378905 0.97887154 ... 0.         0.87800729 0.67794514]
 [0.90642807 0.9724346  1.         ... 0.87800729 0.         0.94677454]
 [0.85467919 0.89757325 0.96788125 ... 0.67794514 0.94677454 0.        ]]
[[0.         0.58943794 0.7030831  ... 1.         1.         1.        ]
 [0.58943794 0.         0.71756201 ... 1.         1.         1.        ]
 [0.7030831  0.71756201 0.         ... 1.         1.         1.        ]
 ...
 [1.         1.         1.         ... 0.         0.         1.        ]
 [1.         1.         1.         ... 0.         0.         1.        ]
 [1.         1.         1.         ... 1.         1.         0.        ]]


In [14]:
# use the complete matrix to create a custom_matrix_factorization object
K = 20
alpha = 0.001
beta = 0.01
iterations = 100
custom_mf = custom_matrix_factorization.custom_matrix_factorization(complete_ratings_matrix, K=K, alpha=alpha, beta=beta, iterations=iterations)
print(custom_mf.R)

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


In [15]:
custom_mf.R[np.where(custom_mf.R != 0)]

array([4., 4., 4., ..., 5., 5., 3.])