<a href="https://colab.research.google.com/github/paulusshewamre/Content-Collab-Hybrid-recsys/blob/main/CollabrativeFilteringSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import glob
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras


Load the neccesary datas and data retriveing functions


In [1]:
from google.colab import files
doc = files.upload()  # Upload kaggle.json here

Saving data.zip to data.zip


In [None]:
with zipfile.ZipFile("data.zip", 'r') as zip_ref:
    zip_ref.extractall("data")  # this will create a folder "data" in Colab

In [None]:
folder_path = "data/data"  # inside Colab environment

In [None]:
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
dfs = {os.path.basename(f).split(".")[0]: pd.read_csv(f) for f in csv_files}

print(dfs.keys())  # lists all CSV filenames without ".csv"

dict_keys(['small_movie_list', 'small_movies_R', 'small_movies_b', 'small_movies_Y', 'small_movies_W', 'small_movies_X'])


In [None]:
# Reload only 'small_movies_b' correctly
dfs['small_movies_b'] = pd.read_csv('./data/data/small_movies_b.csv', header=None)

# Convert to NumPy and reshape dynamically
movies_b = dfs['small_movies_b'].to_numpy().reshape(1, -1)

In [None]:
dataframes = list(dfs.values())

# Access each file by index
movie_list = dataframes[0].to_numpy()
movies_R = dataframes[1].to_numpy()
movies_Y = dataframes[3].to_numpy()
movies_W = dataframes[4].to_numpy()
movies_X = dataframes[5].to_numpy()

In [None]:
def load_precalc_params_small():
    """
    Load precomputed parameters from the already loaded CSV DataFrames in `dfs`.
    Assumes dfs contains:
    'movies_X', 'movies_W', 'movies_b', etc.
    """

    # Access DataFrames by name from dfs
    X = movies_X
    W = movies_W
    b = movies_b

    num_movies, num_features = X.shape
    num_users, _ = W.shape

    return X, W, b, num_movies, num_features, num_users


In [None]:
def load_ratings_small():
    """
    Load Y and R matrices from the preloaded CSV DataFrames in `dfs`.
    Assumes dfs contains:
    'movies_Y' and 'movies_R'.
    """

    Y = movies_Y
    R = movies_R

    return Y, R


In [None]:
#Load data
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (4777, 443) R (4777, 443)
X (4777, 10)
W (442, 10)
b (1, 443)
num_features 10
num_movies 4777
num_users 442


In [None]:
#  From the matrix, we can compute statistics like average rating.
tsmean =  np.mean(Y[0, R[0, :].astype(bool)])
print(f"Average rating for movie 1 : {tsmean:0.3f} / 5" )

Average rating for movie 1 : 3.250 / 5


In [None]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    nm, nu = Y.shape
    J = 0
    ### START CODE HERE ###

    for j in range(nu):
        w = W[j,:]
        b_j = b[0,j]
        for i in range(nm):
            x = X[i,:]
            y = Y[i,j]
            r = R[i,j]
            J += np.square(r * (np.dot(w,x) + b_j - y ) )
    J = J/2
    J += (lambda_/2) * (np.sum(np.square(W)) + np.sum(np.square(X)))

    ### END CODE HERE ###

    return J

In [None]:
# Reduce the data set size so that this runs faster
num_users_r = 4
num_movies_r = 5
num_features_r = 3

X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r,  :num_features_r]
b_r = b[0, :num_users_r].reshape(1,-1)
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Evaluate cost function
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")

Cost: 11.66


In [None]:
# Evaluate cost function with regularization
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 24.77


Vectoriazation method for faster calculations

In [None]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Vectorized for speed. Uses tensorflow operations to be compatible with custom training loop.
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users)            : vector of user parameters
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

In [None]:
# Evaluate cost function
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 0);
print(f"Cost: {J:0.2f}")

# Evaluate cost function with regularization
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, 1.5);
print(f"Cost (with regularization): {J:0.2f}")

Cost: 11.66
Cost (with regularization): 24.77


In [None]:
def load_Movie_List_pd():
    """
    Returns a list of movie titles and the corresponding DataFrame,
    using the preloaded DataFrame from dfs.
    Assumes dfs contains 'small_movie_list'.
    """

    # Access the DataFrame from dfs
    df = dfs['small_movie_list']

    # Convert the "title" column to a list
    mlist = df["title"].to_list()

    return mlist, df


Make initial Prediction without trained model

In [None]:
movieList, movieList_df = load_Movie_List_pd()

my_ratings = np.zeros(num_movies)          #  Initialize my ratings

# Check the file small_movie_list.csv for id of each movie in our dataset
# For example, Toy Story 3 (2010) has ID 2700, so to rate it "5", you can set
my_ratings[2700] = 5

#Or suppose you did not enjoy Persuasion (2007), you can set
my_ratings[2609] = 2;

# We have selected a few movies we liked / did not like and the ratings we
# gave are as follows:
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}');


New user ratings:

Rated 5.0 for  Shrek (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for  Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for  Incredibles, The (2004)
Rated 2.0 for  Persuasion (2007)
Rated 5.0 for  Toy Story 3 (2010)
Rated 3.0 for  Inception (2010)
Rated 1.0 for  Louis Theroux: Law & Disorder (2008)
Rated 1.0 for  Nothing to Declare (Rien à déclarer) (2010)


Normalize the ratings

In [None]:
import numpy as np

def normalizeRatings(Y, R):
    """
    Normalize movie ratings by subtracting the mean rating for each movie.

    Parameters:
        Y : numpy array of shape (num_movies, num_users)
            Ratings matrix
        R : numpy array of same shape as Y
            Indicator matrix where R[i,j] = 1 if movie i was rated by user j

    Returns:
        Ynorm : numpy array of same shape as Y
            Normalized ratings (mean of each movie subtracted, unrated entries remain 0)
        Ymean : numpy array of shape (num_movies, 1)
            Mean rating for each movie
    """

    # Compute mean rating for each movie, ignoring unrated entries
    # Add a small epsilon to avoid division by zero
    Ymean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)

    # Subtract the mean rating from all rated entries
    Ynorm = Y - np.multiply(Ymean, R)

    return Ynorm, Ymean


Using Neural network find the optimal parameters (Gradient Descent)

---



In [None]:
# Reload ratings
Y, R = load_ratings_small()

# Add new user ratings to Y
Y = np.c_[my_ratings, Y]

# Add new user indicator matrix to R
R = np.c_[(my_ratings != 0).astype(int), R]

# Normalize the Dataset
Ynorm, Ymean = normalizeRatings(Y, R)

In [None]:
#  Useful Values
num_movies, num_users = Y.shape
num_features = 100

# Set Initial Parameters (W, X), use tf.Variable to track these variables
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [None]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 2291969.9
Training loss at iteration 20: 134129.7
Training loss at iteration 40: 50764.9
Training loss at iteration 60: 24024.3
Training loss at iteration 80: 13321.8
Training loss at iteration 100: 8307.8
Training loss at iteration 120: 5688.4
Training loss at iteration 140: 4224.1
Training loss at iteration 160: 3367.9
Training loss at iteration 180: 2849.1


Make Predictions with the optimal parameters recived from model

In [None]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

#restore the mean
pm = p + Ymean

my_predictions = pm[:,0]

# sort predictions
ix = tf.argsort(my_predictions, direction='DESCENDING')

for i in range(17):
    j = ix[i]
    if j not in my_rated:
        print(f'Predicting rating {my_predictions[j]:0.2f} for movie {movieList[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')

Predicting rating 5.63 for movie Purge: Anarchy, The (2014)
Predicting rating 5.23 for movie Ginger Snaps: Unleashed (2004)
Predicting rating 5.20 for movie I Heart Huckabees (2004)
Predicting rating 5.20 for movie Igby Goes Down (2002)
Predicting rating 5.18 for movie Promised Land (2012)
Predicting rating 5.18 for movie Last Knights (2015)
Predicting rating 5.18 for movie Whole Nine Yards, The (2000)
Predicting rating 5.18 for movie Dog Days (Hundstage) (2001)
Predicting rating 5.18 for movie Che: Part One (2008)
Predicting rating 5.18 for movie Day the Earth Stood Still, The (2008)
Predicting rating 5.18 for movie Jeff Ross Roasts Criminals: Live at Brazos County Jail (2015)
Predicting rating 5.18 for movie Kicking Off (2016)
Predicting rating 5.18 for movie Secret Ballot (Raye makhfi) (2001)
Predicting rating 5.18 for movie Into the Grizzly Maze (2015)
Predicting rating 5.18 for movie Brave (2012)
Predicting rating 5.18 for movie The Death of Stalin (2017)
Predicting rating 5.18 fo