# Collaborative filtering 
## movie recommendation model

In [61]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from numpy import loadtxt

In [62]:
def get_movieFeatures_W_B_vectors():
    file = open('./data/small_movies_X.csv', 'rb')
    movie_features = loadtxt(file, delimiter = ",")
    
    file = open('./data/small_movies_W.csv', 'rb')
    W = loadtxt(file,delimiter = ",")
    
    file = open('./data/small_movies_b.csv', 'rb')
    b = loadtxt(file,delimiter = ",")
    b = b.reshape(1,-1)
    num_movies, num_features = movie_features.shape
    num_users,_ = W.shape
    return(movie_features, W, b, num_movies, num_features, num_users)
    

In [63]:
def load_ratings_small():
    file = open('./data/small_movies_Y.csv', 'rb')
    movie_ratings = loadtxt(file,delimiter = ",")

    file = open('./data/small_movies_R.csv', 'rb')
    isRated = loadtxt(file,delimiter = ",")
    return(movie_ratings,isRated)

In [64]:
def load_Movie_List_pd():
    """ returns df with and index of movies in the order they are in in the movie_features matrix """
    df = pd.read_csv('./data/small_movie_list.csv', header=0, index_col=0,  delimiter=',', quotechar='"')
    mlist = df["title"].to_list()
    return(mlist, df)

In [65]:
#Load data
X, W, b, num_movies, num_features, num_users = get_movieFeatures_W_B_vectors()
Y, R = load_ratings_small()

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


### Getting cost using NumPy

In [66]:
def cofi_cost_function(X, W, b, Y, R, lambda_):
    nm, nu = Y.shape
    J = 0
    for user in range(nu):
        w_j = W[user, :]
        b_j = b[0, user]
        for movie in range(nm):
            x = X[movie, :]
            y = Y[movie, user]
            r = R[movie, user]
            J += r * np.square(np.dot(w_j, x) + b_j - y)
    J = J/2
    J += (lambda_/2) * ( np.sum(np.square(W)) + np.sum(np.square(X)) )
    return J
    
            
    

In [67]:
# Reduce the data set size so that this runs faster
num_users_r = 4
num_movies_r = 5 
num_features_r = 3

X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r,  :num_features_r]
b_r = b[0, :num_users_r].reshape(1,-1)
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Evaluate cost function
J = cofi_cost_function(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")

Cost: 13.67


In [68]:
# Evaluate cost function with regularization 
J = cofi_cost_function(X_r, W_r, b_r, Y_r, R_r, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 28.09


### Getting cost using tensorFlow

In [69]:
def cofi_cost_func_with_tensorFlow(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [70]:
# Evaluate cost function
J = cofi_cost_func_with_tensorFlow(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")

# Evaluate cost function with regularization 
J = cofi_cost_func_with_tensorFlow(X_r, W_r, b_r, Y_r, R_r, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost: 13.67
Cost (with regularization): 28.09


In [71]:
movieList, movieList_df = load_Movie_List_pd()

### Creating a new user called Alice who enjoys romance drama movies

In [72]:
user1_alice = np.zeros(num_movies)      #  Initialize her ratings

### user1_alice = np.zeros(num_movies)      #  Initialize her ratings

In [73]:
user2_bob = np.zeros(num_movies)      #  Initialize his ratings

In [74]:
#Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set
user1_alice[2700]

0.0

In [75]:
user2_bob[2609]

0.0

### Movies Alice enjoys

In [76]:

user1_alice[2683] = 5   # Blue Valentine
user1_alice[2635] = 4   # Remember Me (2010)
user1_alice[2654] = 5   # Date Night (2010)
user1_alice[2884] = 4   # Just Go with It
user1_alice[2970] = 4   # Crazy, Stupid, Love.
user1_alice[2971] = 4   # One Day
user1_alice[2977] = 4   # Friends with Benefits (2011)
user1_alice[2933] = 5   # Midnight in Paris

user1_alice[929]  = 2.5  # Lord of the Rings: The Return of the King, The
user1_alice[3843]  = 3   # The Hobbit: The Battle of the Five Armies (2014)
user1_alice[4208]  = 1   # Iron Man & Hulk: Heroes United (2013)
user1_alice[4758]  = 1   # Mission: Impossible - Fallout (2018)
user1_alice[2716] = 3 # Inception
user1_alice[1150] = 0.5   # Incredibles, The (2004)
user1_alice[366]  = 1   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
user1_alice[622]  = 1   # Harry Potter and the Chamber of Secrets (2002)
user1_alice[793]  = 2   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)

### Movies Bob seems to enjoy

In [77]:

user2_bob[929]  = 3  # Lord of the Rings: The Return of the King, The
user2_bob[3843]  = 5   # The Hobbit: The Battle of the Five Armies (2014)
user2_bob[4208]  = 5   # Iron Man & Hulk: Heroes United (2013)
user2_bob[4758]  = 4   # Mission: Impossible - Fallout (2018)
user2_bob[2716] = 5  # Inception
user2_bob[1150] = 4   # Incredibles, The (2004)
user2_bob[366]  = 4   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
user2_bob[622]  = 4   # Harry Potter and the Chamber of Secrets (2002)
user2_bob[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)

user2_bob[2683] = 1   # Blue Valentine
user2_bob[2635] = 2   # Remember Me (2010)
user2_bob[2654] = 2  # Date Night (2010)
user2_bob[2884] = 2.4  # Just Go with It
user2_bob[2970] = 3   # Crazy, Stupid, Love.
user2_bob[2971] = 2   # One Day
user2_bob[2977] = 1   # Friends with Benefits (2011)
user2_bob[2933] = 1   # Midnight in Paris

In [78]:
alice_rated = [i for i in range(len(user1_alice)) if user1_alice[i] > 0]
bob_rated = [i for i in range(len(user2_bob)) if user2_bob[i] > 0]

In [79]:
print('\nNew user Alice ratings\n')
for i in range( len(user1_alice) ):
    if user1_alice[i] > 0 :
      print(f'Rated {user1_alice[i]} for  {movieList_df.loc[i,"title"]}');


New user Alice ratings

Rated 1.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 1.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 2.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 2.5 for  Lord of the Rings: The Return of the King, The (2003)
Rated 0.5 for  Incredibles, The (2004)
Rated 4.0 for  Remember Me (2010)
Rated 5.0 for  Date Night (2010)
Rated 5.0 for  Blue Valentine (2010)
Rated 3.0 for  Inception (2010)
Rated 4.0 for  Just Go with It (2011)
Rated 5.0 for  Midnight in Paris (2011)
Rated 4.0 for  Crazy, Stupid, Love. (2011)
Rated 4.0 for  One Day (2011)
Rated 4.0 for  Friends with Benefits (2011)
Rated 3.0 for  The Hobbit: The Battle of the Five Armies (2014)
Rated 1.0 for  Iron Man & Hulk: Heroes United (2013)
Rated 1.0 for  Mission: Impossible - Fallout (2018)


In [80]:
# Combine new users' ratings as columns
new_users = np.array([
    user1_alice, 
    user2_bob
]).T  # Transpose to make them columns

In [81]:
# Reload ratings
Y, R = load_ratings_small()

In [82]:
# Add new user ratings to Y 
Y = np.c_[new_users, Y]

In [83]:
# Add new user indicator matrix to R
new_R = (new_users != 0).astype(int)
R = np.c_[new_R, R]

### normalizing the dataset to reduce bias ratings

In [84]:
    def normalizeRatings(Y, R):
        Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
        Ynorm = Y - np.multiply(Ymean, R) 
        return(Ynorm, Ymean)

In [85]:
# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

In [86]:
num_movies, num_users = Y.shape
num_features = 100

In [87]:
# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

In [88]:
# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [89]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost 
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_with_tensorFlow(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 2295067.6
Training loss at iteration 20: 134543.7
Training loss at iteration 40: 51158.8
Training loss at iteration 60: 24227.4
Training loss at iteration 80: 13411.8
Training loss at iteration 100: 8358.4
Training loss at iteration 120: 5727.3
Training loss at iteration 140: 4257.6
Training loss at iteration 160: 3396.4
Training loss at iteration 180: 2872.8


In [90]:
# Make a prediction using trained weights and biases
p = tf.matmul(X, W, transpose_b=True) + b

In [91]:
#restore the mean
pm = p + Ymean

In [92]:
# Get predictions for any user by their index
def get_user_predictions(user_index):
    return pm[:, user_index]

In [93]:
alice_predictions = get_user_predictions(0)
bob_predictions = get_user_predictions(1)

In [94]:
# sort predictions
alice_ix = tf.argsort(alice_predictions, direction='DESCENDING')
bob_ix = tf.argsort(bob_predictions, direction='DESCENDING')

In [95]:
for i in range(17):
    j = alice_ix[i]
    if j not in alice_rated:
        print(f'Predicting rating {alice_predictions[j]:0.2f} for movie {movieList[j]}')

Predicting rating 4.92 for movie Radio Day (2008)
Predicting rating 4.88 for movie English Vinglish (2012)
Predicting rating 4.86 for movie Delirium (2014)
Predicting rating 4.85 for movie One I Love, The (2014)
Predicting rating 4.85 for movie Laggies (2014)
Predicting rating 4.85 for movie Into the Forest of Fireflies' Light (2011)
Predicting rating 4.84 for movie Son of the Bride (Hijo de la novia, El) (2001)
Predicting rating 4.84 for movie Eichmann (2007)
Predicting rating 4.84 for movie Battle Royale 2: Requiem (Batoru rowaiaru II: Chinkonka) (2003)
Predicting rating 4.84 for movie Into the Abyss (2011)
Predicting rating 4.84 for movie Louis Theroux: Law & Disorder (2008)
Predicting rating 4.82 for movie O Brother, Where Art Thou? (2000)
Predicting rating 4.81 for movie Max Manus (2008)
Predicting rating 4.80 for movie A Detective Story (2003)
Predicting rating 4.80 for movie Superman/Batman: Public Enemies (2009)
Predicting rating 4.80 for movie Faster (2010)


In [96]:
for i in range(17):
    j = bob_ix[i]
    if j not in bob_rated:
        print(f'Predicting rating {bob_predictions[j]:0.2f} for movie {movieList[j]}')

Predicting rating 4.66 for movie Odd Life of Timothy Green, The (2012)
Predicting rating 4.65 for movie Martin Lawrence Live: Runteldat (2002)
Predicting rating 4.63 for movie Tyler Perry's I Can Do Bad All by Myself (2009)
Predicting rating 4.63 for movie Nine Lives of Tomas Katz, The (2000)
Predicting rating 4.62 for movie Bossa Nova (2000)
Predicting rating 4.62 for movie Satin Rouge (2002)
Predicting rating 4.62 for movie Paper Birds (Pájaros de papel) (2010)
Predicting rating 4.62 for movie Eva (2011)
Predicting rating 4.61 for movie Enter the Void (2009)
Predicting rating 4.61 for movie Unicorn City (2012)
Predicting rating 4.61 for movie Scooby-Doo! and the Loch Ness Monster (2004)
Predicting rating 4.61 for movie Dragons: Gift of the Night Fury (2011)
Predicting rating 4.61 for movie My Sassy Girl (Yeopgijeogin geunyeo) (2001)
Predicting rating 4.61 for movie 'Salem's Lot (2004)


In [97]:
print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(user1_alice)):
    if user1_alice[i] > 0:
        print(f'Original {user1_alice[i]}, Predicted {alice_predictions[i]:0.2f} for {movieList[i]}')



Original vs Predicted ratings:

Original 1.0, Predicted 1.24 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Original 1.0, Predicted 1.11 for Harry Potter and the Chamber of Secrets (2002)
Original 2.0, Predicted 2.01 for Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Original 2.5, Predicted 2.66 for Lord of the Rings: The Return of the King, The (2003)
Original 0.5, Predicted 0.78 for Incredibles, The (2004)
Original 4.0, Predicted 3.92 for Remember Me (2010)
Original 5.0, Predicted 4.66 for Date Night (2010)
Original 5.0, Predicted 4.63 for Blue Valentine (2010)
Original 3.0, Predicted 3.09 for Inception (2010)
Original 4.0, Predicted 3.85 for Just Go with It (2011)
Original 5.0, Predicted 4.90 for Midnight in Paris (2011)
Original 4.0, Predicted 4.03 for Crazy, Stupid, Love. (2011)
Original 4.0, Predicted 3.85 for One Day (2011)
Original 4.0, Predicted 3.90 for Friends with Benefits (2011)
Original 3.0, Predicted 2.

In [98]:
filter=(movieList_df["number of ratings"] > 20)
movieList_df["pred_ alice ratings"] = alice_predictions
movieList_df = movieList_df.reindex(columns=["pred_ alice ratings", "mean_rating", "number of ratings", "title"])
movieList_df.loc[alice_ix[:300]].loc[filter].sort_values("mean_rating", ascending=False)

Unnamed: 0_level_0,pred_ alice ratings,mean_rating,number of ratings,title
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1743,4.506924,4.252336,107,"Departed, The (2006)"
1242,4.419415,4.089744,39,Old Boy (2003)
1964,4.469798,4.06,25,3:10 to Yuma (2007)
2173,4.415829,4.057692,104,WALL·E (2008)
3714,4.433637,4.050847,59,Guardians of the Galaxy (2014)
1841,4.517287,4.0,61,Hot Fuzz (2007)
218,4.312077,3.934783,23,Amores Perros (Love's a Bitch) (2000)
1279,4.545464,3.846154,52,Million Dollar Baby (2004)
168,4.819536,3.808511,94,"O Brother, Where Art Thou? (2000)"
28,4.45884,3.666667,75,High Fidelity (2000)


In [100]:
filter=(movieList_df["number of ratings"] > 20)
movieList_df["pred_ bob ratings"] = bob_predictions
movieList_df = movieList_df.reindex(columns=["pred_ bob ratings", "mean_rating", "number of ratings", "title"])
movieList_df.loc[bob_ix[:300]].loc[filter].sort_values("mean_rating", ascending=False)

Unnamed: 0_level_0,pred_ bob ratings,mean_rating,number of ratings,title
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
793,4.932279,,149,Pirates of the Caribbean: The Curse of the Bla...
2716,4.914659,,143,Inception (2010)
2755,4.575467,,57,"Social Network, The (2010)"
3083,4.534215,,76,"Dark Knight Rises, The (2012)"
2420,4.431611,,105,Up (2009)
352,4.388657,,33,K-PAX (2001)
3802,4.375466,,50,The Imitation Game (2014)
676,4.359881,,75,City of God (Cidade de Deus) (2002)
393,4.356161,,198,"Lord of the Rings: The Fellowship of the Ring,..."
1122,4.349974,,77,Shaun of the Dead (2004)


In [101]:
def load_Movie_Rating_pd():
    """ returns df with and index of movies in the order they are in in the Y matrix """
    df = pd.read_csv('./data/small_movie_list.csv', header=0, index_col=0,  delimiter=',', quotechar='"')
    mlist = df["mean_rating"].to_list()
    return(mlist)

In [102]:
movies_rating = load_Movie_Rating_pd()

In [103]:
print(f'Alice predicted rating will be: {alice_predictions[2857]:0.2f} for {movieList[2857]}')
print(f'Movie average ratings is: {movies_rating[2857]}')

Alice predicted rating will be: 2.90 for No Strings Attached (2011)
Movie average ratings is: 2.928571429


In [104]:
print(f'Bob predicted rating will be: {bob_predictions[2857]:0.2f} for {movieList[2857]}')
print(f'Movie average ratings is: {movies_rating[2857]}')

Bob predicted rating will be: 2.32 for No Strings Attached (2011)
Movie average ratings is: 2.928571429


In [105]:
print(f'Alice predicted rating will be: {alice_predictions[3326]:0.2f} for {movieList[3326]}')
print(f'Movie average ratings is: {movies_rating[3326]}')

Alice predicted rating will be: 3.73 for Hobbit: An Unexpected Journey, The (2012)
Movie average ratings is: 3.8125


In [106]:
print(f'Bob predicted rating will be: {bob_predictions[3326]:0.2f} for {movieList[3326]}')
print(f'Movie average ratings is: {movies_rating[3326]}')

Bob predicted rating will be: 3.61 for Hobbit: An Unexpected Journey, The (2012)
Movie average ratings is: 3.8125


x
0       3.400000
1       3.250000
2       2.000000
3       2.000000
4       2.672414
          ...   
4773    3.500000
4774    4.000000
4775    3.500000
4776    3.500000
4777    3.500000
Name: mean_rating, Length: 4778, dtype: float64