In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from recsys_utils import *
import pandas as pd

In [3]:
df_R = pd.read_pickle('R.pkl')
R=df_R.to_numpy()

In [4]:
df_Y = pd.read_pickle('Y.pkl')
Y=df_Y.to_numpy()

In [5]:
df_X = pd.read_pickle('X.pkl')
X=df_X.to_numpy()

In [6]:
df_W = pd.read_pickle('W.pkl')
W=df_W.to_numpy()

In [7]:
df_b = pd.read_pickle('b.pkl')
b=df_b.to_numpy()

In [8]:
num_movies, num_features = X.shape
num_users,_ = W.shape

In [9]:
#Load data
# X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
# Y, R = load_ratings_small()

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (6000, 7000) R (6000, 7000)
X (6000, 20)
W (7000, 20)
b (1, 7000)
num_features 20
num_movies 6000
num_users 7000


In [10]:
#  From the matrix, we can compute statistics like average rating.
tsmean =  np.mean(Y[0, R[0, :].astype(bool)])
print(f"Average rating for movie 1 : {tsmean:0.3f} / 5" )

Average rating for movie 1 : 3.951 / 5


In [11]:
# Reduce the data set size so that this runs faster
num_users_r = 4
num_movies_r = 5 
num_features_r = 3

X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r,  :num_features_r]
b_r = b[0, :num_users_r].reshape(1,-1)
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

In [12]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [13]:
# Evaluate cost function
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")

# Evaluate cost function with regularization 
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost: 0.00
Cost (with regularization): 6.42


In [16]:
df_m = pd.read_csv('final_movies.csv', header=0, index_col=0,  delimiter=',', quotechar='"')
df_r = pd.read_csv('final_ratings.csv', header=0, index_col=0)
mlist = df_m["title"].to_list()
df_m

Unnamed: 0,mean rating,number of ratings,title,movieId,War,Fantasy,Adventure,Horror,Documentary,Mystery,...,Comedy,Western,Animation,(no genres listed),Crime,Musical,Thriller,Sci-Fi,Action,Film-Noir
0,4.031707,4037,Persuasion (1995),28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.683938,193,Lamerica (1994),53,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.354105,1693,Fair Game (1995),71,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3.583685,711,Once Upon a Time... When We Were Colored (1995),83,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.069737,28242,Broken Arrow (1996),95,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,2.500000,1,Bottle (2011),208857,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5996,0.000000,0,The Magic Book (1900),208923,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5997,4.000000,2,The Prep School Negro (2012),209063,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5998,3.500000,2,Last Days of the Arctic (2011),209131,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# mlist, df_m, df_r = load_Movie_List_pd()

my_ratings = np.zeros(num_movies)          #  Initialize my ratings
df_qu=df_r.query('userId == 1')
print(df_qu)
for i in range(len(df_qu.index.to_list())):
    ori_id=df_qu.iloc[i]['movieId']
    df_qu2 = df_m.query(f'movieId == {ori_id}')
    id_list=df_qu2.index.to_list()
    movie=id_list[0]
    rating=df_qu.iloc[i]['rating']
    my_ratings[movie]= rating
# user=1
# df.query()

# # Check the file small_movie_list.csv for id of each movie in our dataset
# # For example, Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set
# my_ratings[2700] = 5 

# #Or suppose you did not enjoy Persuasion (2007), you can set
# my_ratings[2609] = 2;

# # We have selected a few movies we liked / did not like and the ratings we
# # gave are as follows:
# my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
# my_ratings[246]  = 5   # Shrek (2001)
# my_ratings[2716] = 3   # Inception
# my_ratings[1150] = 5   # Incredibles, The (2004)
# my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
# my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
# my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
# my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
# my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
# my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
# my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
# my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {df_m.loc[i,"title"]}');

    userId  movieId  rating   timestamp
2        1      307     5.0  1147868828
9        1     1250     4.0  1147868414
10       1     1260     3.5  1147877857
11       1     1653     4.0  1147868097
30       1     5147     4.0  1147877654
63       1     8873     3.0  1147869094
64       1     8973     4.0  1147869211
68       1    31956     3.5  1147877610

New user ratings:

Rated 5.0 for  Three Colors: Blue (Trois couleurs: Bleu) (1993)
Rated 4.0 for  Bridge on the River Kwai, The (1957)
Rated 3.5 for  M (1931)
Rated 4.0 for  Gattaca (1997)
Rated 4.0 for  Wild Strawberries (Smultronstället) (1957)
Rated 3.0 for  Motorcycle Diaries, The (Diarios de motocicleta) (2004)
Rated 4.0 for  Bad Education (La mala educación) (2004)
Rated 3.5 for  5x2 (2004)


In [26]:
# Reload ratings
# Y, R = load_ratings_small()

# Add new user ratings to Y
# Y = np.c_[my_ratings, Y]

# Add new user indicator matrix to R
# R = np.c_[(my_ratings != 0).astype(int), R]

# Normalize the Dataset
Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
Ymean.astype(np.float16)
Ynorm = Y - np.multiply(Ymean, R)
Ynorm.astype(np.float16)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [27]:
#  Useful Values
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [28]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

ResourceExhaustedError: {{function_node __wrapped__Pow_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[6000,65000] and type double on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:Pow]

In [28]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm = p + Ymean

my_predictions = pm[:,0]
movies_to_send= []

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(25):
    j = ix[i]
    if j not in my_rated:
        movies_to_send.append(movieList[j])
        print(movieList[j])

My Sassy Girl (Yeopgijeogin geunyeo) (2001)
Martin Lawrence Live: Runteldat (2002)
Memento (2000)
Delirium (2014)
Laggies (2014)
One I Love, The (2014)
Particle Fever (2013)
Eichmann (2007)
Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)
Into the Abyss (2011)
Son of the Bride (Hijo de la novia, El) (2001)
Rivers and Tides (2001)
George Carlin: It's Bad for Ya! (2008)
Loving Vincent (2017)
PK (2014)
Sisters (Syostry) (2001)
What Men Talk About (2010)
George Carlin: Life Is Worth Losing (2005)
