In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from recsys_utils import load_Book_List_pd

In [7]:
bookList, bookList_df = load_Book_List_pd()

my_ratings = np.zeros(11128)          #  Initialize my ratings

# For example, Harry Potter and the Half-Blood Prince (Harry Potter  #6) has ID 1, so to rate it "5", you can set
my_ratings[1] = 5 

#Or suppose you did not enjoy Persuasion (2007), you can set
my_ratings[2] = 2;

# We have selected a few movies we liked / did not like and the ratings we
# gave are as follows:
my_ratings[4]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[5]  = 5   # Shrek (2001)
my_ratings[8] = 3   # Inception
my_ratings[9] = 5   # Incredibles, The (2004)
my_ratings[10]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[12]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[13]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[14]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[16] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[18] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[21]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {bookList_df.loc[i,"title"]}');


New user ratings:

Rated 5.0 for  Harry Potter and the Half-Blood Prince (Harry Potter  #6)
Rated 2.0 for  Harry Potter and the Order of the Phoenix (Harry Potter  #5)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (Harry Potter  #2)
Rated 5.0 for  Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)
Rated 3.0 for  Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5)
Rated 5.0 for  Unauthorized Harry Potter Book Seven News: "Half-Blood Prince" Analysis and Speculation
Rated 2.0 for  Harry Potter Collection (Harry Potter  #1-6)
Rated 5.0 for  The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)
Rated 5.0 for  The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)
Rated 3.0 for  The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)
Rated 1.0 for  The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)
Rated 1.0 for  The Ultimate Hitchhiker's 

In [12]:
def create_user_ratings_matrices(books_df, n_synthetic_users=100):
    n_books = len(books_df)
    
    # Create matrices
    Y = np.zeros((n_books, n_synthetic_users))
    R = np.zeros((n_books, n_synthetic_users))
    
    for book_idx in range(n_books):
        avg_rating = books_df.loc[book_idx, 'average_rating']
        n_ratings = books_df.loc[book_idx, 'ratings_count']
        
        if pd.notna(avg_rating) and n_ratings > 0:
            # Generate synthetic ratings around the average rating
            # for a random subset of users
            n_users_who_rated = min(int(n_ratings/100), n_synthetic_users)  # Scale down the number
            rating_users = np.random.choice(n_synthetic_users, n_users_who_rated, replace=False)
            
            # Generate ratings with some noise around the average
            synthetic_ratings = np.random.normal(avg_rating, 0.5, n_users_who_rated)
            # Clip ratings to be between 1 and 5
            synthetic_ratings = np.clip(synthetic_ratings, 1, 5)
            
            # Assign the ratings
            Y[book_idx, rating_users] = synthetic_ratings
            R[book_idx, rating_users] = 1
    
    return Y, R



In [13]:
# Usage:
import pandas as pd
books_df = pd.read_csv('books.csv')
Y, R = create_user_ratings_matrices(books_df)

# Save the matrices for future use
np.savetxt('books_Y.csv', Y, delimiter=',')
np.savetxt('books_R.csv', R, delimiter=',')

In [23]:
def get_books_users_features_count(Y, num_features=100):
    """
    Computes the number of books, users, and features from the ratings matrix
    
    Args:
        Y (ndarray): Ratings matrix where rows are books and columns are users
        num_features (int): Number of latent features (default=100)
        
    Returns:
        tuple: (num_books, num_users, num_features)
    """
    num_books, num_users = Y.shape
    
    print(f"Number of books: {num_books}")
    print(f"Number of users: {num_users}")
    print(f"Number of features: {num_features}")
    print(f"Total number of ratings: {np.count_nonzero(Y)}")
    print(f"Sparsity: {(1 - np.count_nonzero(Y)/(num_books*num_users))*100:.2f}%")
    
    return num_books, num_users, num_features

# Usage:
num_books, num_users, num_features = get_books_users_features_count(Y)

# You can also specify a different number of features:
# num_books, num_users, num_features = get_books_users_features_count(Y, num_features=50)


Number of books: 11128
Number of users: 105
Number of features: 100
Total number of ratings: 321729
Sparsity: 72.47%


In [15]:
# Pad Y with zeros to match my_ratings size
padding_size = 11128 - Y.shape[0]
if padding_size > 0:
    Y = np.pad(Y, ((0, padding_size), (0, 0)), mode='constant')
    R = np.pad(R, ((0, padding_size), (0, 0)), mode='constant')

# Then concatenate
Y = np.c_[my_ratings, Y]
R = np.c_[(my_ratings != 0).astype(int), R]


In [18]:
from recsys_utils import *


In [20]:
def normalizeRatings(Y, R):
    """
    Normalize Y by subtracting the mean of each movie's ratings
    Args:
      Y (ndarray (num_movies,num_users)): The utility matrix
      R (ndarray (num_movies,num_users)): Indicator matrix for Y
    Returns:
      Ynorm (ndarray (num_movies,num_users)): Normalized Y utility matrix
      Ymean (ndarray (num_movies,1)): Mean rating for each movie
    """
    Ymean = (np.sum(Y*R, axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R) 
    return(Ynorm, Ymean)

In [21]:
Y    = np.c_[my_ratings, Y]
R    = np.c_[(my_ratings != 0).astype(int), R]

# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

In [24]:
print(Y.shape)

(11128, 105)


In [25]:
#  Useful Values
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_books, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [28]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [29]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 17214531.6
Training loss at iteration 20: 448773.1
Training loss at iteration 40: 183847.4
Training loss at iteration 60: 113176.6
Training loss at iteration 80: 80312.5
Training loss at iteration 100: 61577.4
Training loss at iteration 120: 49255.2
Training loss at iteration 140: 40278.6
Training loss at iteration 160: 33278.0
Training loss at iteration 180: 27614.9


In [30]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm = p + Ymean

my_predictions = pm[:,0]

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(17):
    j = ix[i]
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for Book {bookList[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {bookList[i]}')

Predicting rating 5.62 for Book The Mysterious Affair at Styles (Hercule Poirot  #1)
Predicting rating 5.14 for Book Forgiven (Firstborn  #2)
Predicting rating 5.07 for Book The Monster at the End of this Book
Predicting rating 4.98 for Book Fullmetal Alchemist  Vol. 10
Predicting rating 4.94 for Book Why Are All The Black Kids Sitting Together in the Cafeteria?: A Psychologist Explains the Development of Racial Identity
Predicting rating 4.94 for Book Death Note  Vol. 1: Boredom (Death Note  #1)
Predicting rating 4.89 for Book Found (Firstborn  #3)
Predicting rating 4.89 for Book Nineteen Eighty-Four
Predicting rating 4.87 for Book Exodus
Predicting rating 4.86 for Book The Book Thief
Predicting rating 4.85 for Book The Shining


Original vs Predicted ratings:

Original 5.0, Predicted 4.93 for Harry Potter and the Order of the Phoenix (Harry Potter  #5)
Original 2.0, Predicted 2.10 for Harry Potter and the Chamber of Secrets (Harry Potter  #2)
Original 5.0, Predicted 4.91 for Harry Po