In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
# MovieLens ratings export; dataset source:
# https://grouplens.org/datasets/movielens/latest/
ratings_df = pd.read_csv('./ratings.csv', sep=',')

In [None]:
# Distinct ids in order of first appearance in the ratings file.
unique_movie_ids = ratings_df['movieId'].unique()
unique_user_ids = ratings_df['userId'].unique()

# Dense 0-based positions for each id; these become the row (movie) and
# column (user) coordinates of the ratings matrices built later.
movie_id_to_index = dict(zip(unique_movie_ids, range(len(unique_movie_ids))))
user_id_to_index = dict(zip(unique_user_ids, range(len(unique_user_ids))))

num_movies = len(movie_id_to_index)
num_users = len(user_id_to_index)

In [None]:
# Compute ratings matrices Y and R
# Y[i, j] contains the rating user j gave movie i
# R[i, j] contains 1 if user j rated the movie i
#
# Translate every rating row to its (movie row, user column) coordinate in one
# vectorized pass instead of iterating the DataFrame row by row in Python.
movie_rows = ratings_df['movieId'].map(movie_id_to_index).to_numpy()
user_cols = ratings_df['userId'].map(user_id_to_index).to_numpy()

Y = np.zeros((num_movies, num_users))
R = np.zeros((num_movies, num_users))
# Fancy-index assignment writes all ratings at once; if a (movie, user) pair
# were repeated, the last occurrence wins, matching the original loop.
Y[movie_rows, user_cols] = ratings_df['rating'].to_numpy()
R[movie_rows, user_cols] = 1

In [None]:
def normalizeY(Y, R):
    """
    Mean-normalize the ratings matrix row by row.

    For every row of Y (one movie), the mean is taken over the entries
    flagged in R only (R[i, j] == 1) and subtracted from those same entries.
    Entries with R[i, j] == 0 are left untouched.

    Args:
      Y (ndarray (num_movies, num_users)): ratings matrix
      R (ndarray (num_movies, num_users)): indicator matrix, 1 where a rating exists

    Returns:
      (Ynorm, Ymean): Ynorm has the same shape as Y; Ymean is a column vector
      of shape (num_movies, 1) holding the per-row mean of the rated entries.
    """
    rated_sum = (Y * R).sum(axis=1)
    rated_count = R.sum(axis=1)
    # The tiny epsilon keeps rows with no ratings from dividing by zero;
    # their mean comes out as 0.
    Ymean = (rated_sum / (rated_count + 1e-12)).reshape(-1, 1)
    Ynorm = Y - Ymean * R
    return Ynorm, Ymean

In [None]:
def cost(X, W, b, Y, R, lambda_):
    """
    Regularized squared-error cost of the rating model X @ W^T + b.

    Only entries with R == 1 contribute to the error term. Written with
    TensorFlow ops so it can be differentiated inside a tf.GradientTape.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user biases
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    predicted = tf.linalg.matmul(X, tf.transpose(W)) + b
    # Multiplying by R zeroes the error wherever no rating exists.
    masked_error = (predicted - Y) * R
    squared_error_term = 0.5 * tf.reduce_sum(masked_error**2)
    # X and W are penalized; the bias b is not.
    regularization_term = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return squared_error_term + regularization_term

In [None]:
# Ratings supplied by the new user, keyed by movieId.
my_movie_ratings = {
    83132: 5,
    5989: 5,
    60487: 5,
    26133: 5,
    26555: 5,
    65261: 5,
    59315: 1,
    4954: 1,
    89745: 1,
}

# Dense vector over all movies; 0 means "not rated".
my_ratings = np.zeros(num_movies)
for movie_id, rating in my_movie_ratings.items():
    my_ratings[movie_id_to_index[movie_id]] = rating

# Row indices of the movies the new user has rated.
my_rated = [idx for idx, value in enumerate(my_ratings) if value > 0]

In [None]:
# Movie metadata; read below via its 'movieId' and 'title' columns.
movies_df = pd.read_csv('./movies.csv', sep=',', quotechar='"')

In [None]:
# Reverse lookup: ratings-matrix row index -> movieId.
index_to_movie_id = dict(zip(movie_id_to_index.values(), movie_id_to_index.keys()))

# movieId -> title lookup built from the movies table.
titles_by_movie_id = movies_df.set_index('movieId')['title']
movie_titles = titles_by_movie_id.to_dict()

In [None]:
def getTitle(index):
    """
    Return the title of the movie at ratings-matrix row `index`.

    The row index is first translated to a movieId via index_to_movie_id
    (raises KeyError for an unknown index); a movieId with no entry in
    movie_titles yields "Unknown Movie".
    """
    return movie_titles.get(index_to_movie_id[index], "Unknown Movie")

In [None]:
# Echo the new user's ratings with readable titles.
print('\nNew user ratings:\n')
for i, rating in enumerate(my_ratings):
    if rating > 0:
        print(f'Rated {rating} for {getTitle(i)}')

In [None]:
# Prepend the new user's ratings as column 0 of Y and R, then mean-normalize.
# NOTE: Y and R are rebound from themselves, so re-running this cell
# prepends the column again.
Y = np.column_stack((my_ratings, Y))
R = np.column_stack(((my_ratings != 0).astype(int), R))
Ynorm, Ymean = normalizeY(Y, R)

In [None]:
# Initialize model parameters
# Fix TensorFlow's global seed so the random initial values (and therefore the
# trained model and recommendations) are reproducible across kernel restarts.
tf.random.set_seed(1234)

num_movies, num_users = Y.shape
num_features = 100  # number of latent features learned per movie / per user

# W: per-user parameters, X: per-movie features, b: per-user bias.
# float64 matches the dtype of the NumPy ratings matrices used in the cost.
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

In [None]:
# Train the model
optimizer = keras.optimizers.Adam(learning_rate=0.1)
lambda_ = 1
iterations = 200
convergence_tol = 1  # stop once the cost changes by less than this between steps
previous_cost = float('inf')

# The loop variable is named `step` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the kernel session.
for step in range(iterations):
    # Use TensorFlow's Gradient Tape
    # to record the steps used to compute the cost
    with tf.GradientTape() as tape:
        # Compute the cost (forward pass is included in cost)
        cost_value = cost(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient(cost_value, [X, W, b])

    # Run one optimizer step by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Convert once to a plain Python float for logging and the stop test.
    # Note this is the cost measured before the update just applied.
    current_cost = float(cost_value)

    # Log periodically.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {current_cost:0.1f}")
    if abs(current_cost - previous_cost) < convergence_tol:
        break
    previous_cost = current_cost
print(f"Finished. Training loss at iteration {step}: {current_cost:0.1f}")

In [None]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Restore the per-movie mean that normalizeY subtracted before training
pm = p + Ymean

# Column 0 is the new user, whose ratings were prepended to Y and R
my_predictions = pm[:,0]

# Sort predictions, highest first
ix = tf.argsort(my_predictions, direction='DESCENDING')

print('\n\nTop recommendations:\n')
# Look at the 17 highest predictions (fewer if there are fewer movies) and
# skip those the user already rated.
for i in range(min(17, len(my_predictions))):
    j = int(ix[i])
    if j not in my_rated:
        # j is a row index of the ratings matrix, not a row label of movies_df,
        # so the title must be resolved through getTitle (index -> movieId -> title).
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {getTitle(j)}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {getTitle(i)}')

In [None]:
# I thought the max rating was supposed to be 5, but this shows the model isn't constrained like the inputs

In [None]:
# Spot-check the predicted rating for a few specific movieIds.
for movie_id in (7361, 2394, 44199, 48394):
    i = movie_id_to_index[movie_id]
    print(f'{my_predictions[i]:.2f} for {getTitle(i)}')