In [None]:
import pandas as pd
import numpy as np
from utils.datasets import load_datasets, get_matrix_ratings, get_matrix_rated, get_n_users, get_n_movies, normalize_matrix_ratings, load_movies_enhanced
import tensorflow as tf
from thefuzz import process
from tensorflow import keras

In [None]:
# movies_titles = movies["title"]
# query = "memento"

# matches = process.extract(query, movies_titles, limit=5)

# for m in matches:
#     print(m)

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin

class CFRecommender(BaseEstimator, RegressorMixin):
    """Collaborative-filtering recommender trained with the Adam optimizer.

    The model learns a movie-feature matrix ``X`` (n_movies x n_features), a
    user-weight matrix ``W`` (n_users x n_features) and a per-user bias ``b``
    (1 x n_users). Predicted ratings are ``X @ W.T``, plus ``b`` when
    ``intercept`` is true.

    Parameters
    ----------
    n_users : int
        Number of users (columns of the ratings matrix).
    n_movies : int
        Number of movies (rows of the ratings matrix).
    n_features : int, default=200
        Number of latent features learned per movie and per user.
    max_iterations : int, default=100
        Number of gradient steps performed by ``fit``.
    lambda_ : float, default=1.5
        Regularization strength applied to ``X`` and ``W``.
    learning_rate : float, default=0.1
        Learning rate handed to the Adam optimizer.
    intercept : bool, default=True
        Whether the per-user bias ``b`` is used and trained.
    """

    def __init__(self, n_users, n_movies, n_features=200, max_iterations=100, lambda_=1.5, learning_rate=0.1, intercept=True):
        self.n_users = n_users
        self.n_movies = n_movies
        self.n_features = n_features
        self.max_iterations = max_iterations
        self.lambda_ = lambda_
        self.learning_rate = learning_rate
        self.intercept = intercept

    def collaborative_filtering_cost(self, X, W, b, Y, R, lambda_):
        """Return the regularized squared-error cost.

        ``R`` multiplies the prediction error element-wise, so with a 0/1
        indicator matrix only the rated entries contribute. The bias ``b`` is
        ignored when ``self.intercept`` is false and is never regularized.
        """
        bias = b if self.intercept else 0
        errors = (tf.linalg.matmul(X, tf.transpose(W)) + bias - Y) * R
        regularization = (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))
        return 0.5 * tf.reduce_sum(errors ** 2) + regularization

    def fit(self, Y, R):
        """Learn ``X``, ``W`` (and ``b``) from ratings ``Y`` and indicator ``R``.

        Resets TensorFlow's global random seed to 42 so the initialization is
        reproducible, prints the training loss every 20 iterations and returns
        ``self``.
        """
        tf.random.set_seed(42)

        # Keep the W, X, b creation order: it determines the seeded initial values.
        self.W = tf.Variable(tf.random.normal(shape=(self.n_users,  self.n_features), stddev=0.1, dtype=tf.float64),  name='W')
        self.X = tf.Variable(tf.random.normal(shape=(self.n_movies, self.n_features), stddev=0.1, dtype=tf.float64),  name='X')
        self.b = tf.Variable(tf.random.normal(shape=(1,             self.n_users), stddev=0.1, dtype=tf.float64),  name='b')

        optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)

        # The bias is only trained when the model actually uses it.
        trainable = [self.X, self.W, self.b] if self.intercept else [self.X, self.W]

        for i in range(self.max_iterations):
            with tf.GradientTape() as tape:
                cost_value = self.collaborative_filtering_cost(self.X, self.W, self.b, Y, R, self.lambda_)

            grads = tape.gradient(cost_value, trainable)
            optimizer.apply_gradients(zip(grads, trainable))

            if i % 20 == 0:
                print(f"Training loss at iteration {i}: {cost_value:0.1f}")

        return self

    def predict(self):
        """Return the full (n_movies x n_users) prediction matrix as a NumPy array."""
        predictions = np.matmul(self.X.numpy(), np.transpose(self.W.numpy()))
        if self.intercept:
            # Convert the bias to NumPy as well; adding the tf.Variable directly
            # would turn the result into a TensorFlow tensor.
            predictions = predictions + self.b.numpy()
        return predictions

    def score(self, Y, R):
        """Return the regularized cost of the fitted model on ``Y``/``R`` (lower is better)."""
        return self.collaborative_filtering_cost(self.X, self.W, self.b, Y, R, self.lambda_)

In [None]:
class RatingsNormalizer(BaseEstimator, TransformerMixin):
    """Mean-normalize each row of a ratings matrix over its rated entries only.

    ``fit`` stores the indicator matrix as ``R`` and the per-row mean of the
    entries where ``R == 1`` as ``means_`` (0 for rows with no such entry).
    ``transform`` subtracts each row's mean from exactly those entries.
    """

    def fit(self, X, y=None, R=None):
        """Compute the per-row means of ``X`` over the entries where ``R == 1``.

        Parameters
        ----------
        X : 2-D array of ratings.
        y : ignored; kept for scikit-learn signature compatibility.
        R : 2-D indicator array with the same layout as ``X`` (required).

        Raises
        ------
        ValueError
            If ``R`` is not provided.
        """
        if R is None:
            raise ValueError("RatingsNormalizer.fit requires the indicator matrix R")

        self.R = R

        n_rows = R.shape[0]
        # Preallocate instead of growing the array with np.append on every row.
        means = np.zeros(n_rows)
        for row in range(n_rows):
            rated = R[row] == 1
            if rated.any():
                means[row] = X[row][rated].mean()
        self.means_ = means

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted row means subtracted from rated entries.

        Entries where the fitted ``R`` is not 1 are left untouched. Non-floating
        input is converted to float so the subtraction cannot fail on integers.
        """
        X = np.asarray(X)
        out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
        X_mean_normalized = X.astype(out_dtype, copy=True)

        n_rows = X_mean_normalized.shape[0]
        rated = np.asarray(self.R)[:n_rows] == 1
        offsets = np.broadcast_to(self.means_[:n_rows, np.newaxis], X_mean_normalized.shape)
        # Only touch the rated entries; everything else keeps its original value.
        X_mean_normalized[rated] -= offsets[rated]

        return X_mean_normalized

In [None]:
from scipy import sparse

# Load the precomputed sparse ratings matrix from the working directory.
# NOTE(review): the file must already exist; nothing in this notebook creates it.
sparse_matrix_ratings = sparse.load_npz('sparse_matrix_ratings.npz')

In [None]:
# Keep only the first 5000 columns of the sparse ratings matrix, then densify them.
sparse_matrix_ratings_5000 = sparse_matrix_ratings[:,:5000]
matrix_ratings_5000 = sparse_matrix_ratings_5000.toarray()
# Show the resulting shape as the cell output.
matrix_ratings_5000.shape

In [None]:
# Indicator matrix for the 5000-column subset, built by the project helper
# get_matrix_rated (presumably 1 where a rating exists, 0 otherwise — TODO confirm).
matrix_rated_5000 = get_matrix_rated(matrix_ratings_5000)

In [None]:
# Fit the per-row means on the rated entries of the subset, then subtract them.
ratings_normalizer = RatingsNormalizer().fit(matrix_ratings_5000, R=matrix_rated_5000)
matrix_ratings_5000_norm = ratings_normalizer.transform(matrix_ratings_5000)

In [None]:
# Rows of the ratings matrix are movies, columns are users.
n_movies, n_users = matrix_ratings_5000.shape

# Hyper-parameters for the subset experiment; the bias term is enabled here.
cf_recommender = CFRecommender(
    n_users,
    n_movies,
    **{
        "n_features": 200,
        "max_iterations": 120,
        "lambda_": 1.5,
        "learning_rate": 0.1,
        "intercept": True,
    },
)

In [None]:
# Train on the mean-normalized subset; fit() prints the loss every 20 iterations
# and returns the estimator, which becomes this cell's output.
cf_recommender.fit(matrix_ratings_5000_norm, matrix_rated_5000)

In [None]:
# Inspect the learned movie-feature matrix (a tf.Variable of shape n_movies x n_features).
cf_recommender.X

In [None]:
# Load the raw tables through the project helper and derive the problem size.
# NOTE: n_users / n_movies overwrite the values set for the 5000-column experiment above.
links, movies, ratings, tags = load_datasets()
n_users = get_n_users(ratings)
n_movies = get_n_movies(movies)

In [None]:
# Build the full ratings matrix and its rated/not-rated indicator.
# Later cells unpack Y.shape as (n_movies, n_users), i.e. rows are movies and columns are users.
Y = get_matrix_ratings(ratings, movies) # Matrix of ratings
R = get_matrix_rated(Y) # Matrix of 1/0 whether the movie was rated or not

In [None]:
# Hand-picked ratings for a new user, keyed by the movie's row index.
my_picks = {
    0: 5,       # Toy Story
    5374: 5,    # The Incredibles
    510: 4.5,   # Silence of the Lambs
    4360: 4,    # Finding Nemo
    2379: 4.5,  # Stuart Little
    1527: 4,    # The Parent Trap
    3819: 5,    # Spider-Man (2002)
    8406: 4,    # The Amazing Spider-Man 2
    706: 5,     # 2001: A Space Odyssey
    1691: 4,    # Rush Hour
}

# Dense rating vector: 0 everywhere except the picks above.
my_ratings = np.zeros(n_movies)
my_ratings[list(my_picks.keys())] = list(my_picks.values())

# Row indices of the movies that received a rating, in ascending order.
my_rated = np.flatnonzero(my_ratings > 0).tolist()

#for i in range(len(my_rated)):
#    print(f"Rated {my_ratings[my_rated[i]]} for {movies.loc[my_rated[i], 'title']}")

In [None]:
# Prepend my ratings (and their indicator) as a new first column, so I become user 0.
# NOTE(review): this cell is not idempotent — re-running it prepends another copy of
# the column; re-run the cell that builds Y and R first.
Y = np.c_[my_ratings, Y] # Add my ratings
R = np.c_[(my_ratings != 0).astype(int), R] # Add my indicators to indicator matrix R

In [None]:
# Fit the per-movie means on the rated entries of the full matrix, then subtract them.
ratings_normalizer = RatingsNormalizer().fit(Y, R=R)
Ynorm = ratings_normalizer.transform(Y)

In [None]:
# Rows of Y are movies, columns are users (including the prepended user 0).
n_movies, n_users = Y.shape

# Same hyper-parameters as the subset run, but without the bias term.
cf_recommender = CFRecommender(
    n_users,
    n_movies,
    **{
        "n_features": 200,
        "max_iterations": 120,
        "lambda_": 1.5,
        "learning_rate": 0.1,
        "intercept": False,
    },
)

In [None]:
# fit() returns the estimator itself, so training and prediction can be chained.
p = cf_recommender.fit(Ynorm, R).predict()

In [None]:
# Undo the mean normalization: add each row's fitted mean back to every prediction in
# that row (means_ is 1-D, so it is reshaped to a column to broadcast across users).
pm = p + ratings_normalizer.means_[:,np.newaxis]

In [None]:
# Column 0 holds the predictions for the user prepended to Y (my own ratings).
my_predictions = pm[:,0]

# Kept as a TensorFlow tensor because later cells reuse `ix`.
ix = tf.argsort(my_predictions, direction='DESCENDING')

# How many of the top-ranked movies to look at; slicing copes with fewer movies than this.
N_TOP_CANDIDATES = 17
already_rated = set(my_rated)

# Convert the top indices to plain ints once instead of calling .numpy() per lookup.
for movie_idx in ix[:N_TOP_CANDIDATES].numpy().tolist():
    if movie_idx not in already_rated:
        print(f'Predicting rating {my_predictions[movie_idx]:0.2f} for movie {movies["title"][movie_idx]} (genres = {movies["genres"][movie_idx]})')

print("\n\nOriginal vs. Predicted")
# my_rated already lists, in ascending order, the indices with a rating above 0.
for movie_idx in my_rated:
    print(f'Original {my_ratings[movie_idx]}, Predicted {my_predictions[movie_idx]:0.2f} for {movies["title"][movie_idx]} (genres = {movies["genres"][movie_idx]})')

In [None]:
# Thresholds for the table below: minimum value of the "ratings" column, and how many
# of the best predictions to consider.
MIN_RATINGS = 20
TOP_N = 300

movies_enhanced = load_movies_enhanced()
movies_enhanced["pred"] = my_predictions
movies_enhanced = movies_enhanced.reindex(columns=["pred", "mean_rating", "ratings", "title"])

# Convert the index slice to NumPy so .loc receives a plain array of labels.
top_idx = np.asarray(ix[:TOP_N])
top_movies = movies_enhanced.loc[top_idx]

# Filter with an explicit mask rather than a global named `filter`, which would
# shadow the builtin for the rest of the session.
top_movies[top_movies["ratings"] > MIN_RATINGS].sort_values("mean_rating", ascending=False)

In [None]:
def custom_grid_search(model_class, param_grid, Y, R):
    """Exhaustively try every parameter combination and keep the lowest score.

    Each candidate is built as
    ``model_class(n_users=Y.shape[1], n_movies=Y.shape[0], **params)``, fitted on
    ``(Y, R)`` and scored on the same ``(Y, R)``, so the score is a training
    score; lower is better.

    Parameters
    ----------
    model_class : callable returning an object with ``fit(Y, R)`` and ``score(Y, R)``.
    param_grid : dict mapping a parameter name to the list of values to try.
    Y, R : ratings matrix and its indicator matrix, passed through to the model.

    Returns
    -------
    (best_model, best_score, best_params)
        ``(None, np.inf, {})`` when no combination was evaluated.
    """
    from itertools import product

    best_model = None
    best_score = np.inf
    best_params = {}

    param_names = list(param_grid.keys())
    for values in product(*param_grid.values()):
        params_dict = dict(zip(param_names, values))
        model = model_class(n_users=Y.shape[1], n_movies=Y.shape[0], **params_dict)

        model.fit(Y, R)

        # Store a plain float so the returned score does not depend on the
        # model's tensor type.
        score = float(model.score(Y, R))

        if score < best_score:
            best_score = score
            best_model = model
            best_params = params_dict

        print(f"Tested {params_dict}, Score: {score}")

    # The second element is the best score, not the model a second time.
    return best_model, best_score, best_params
    

In [None]:
# Coarse sweep over every hyper-parameter.
param_grid = dict(
    n_features=[50, 100, 150],
    max_iterations=[100, 150],
    lambda_=[1, 1.5],
    learning_rate=[0.1, 0.01],
    intercept=[False, True],
)

best_model, best_score, best_params = custom_grid_search(CFRecommender, param_grid, Ynorm, R)

In [None]:
# Best parameters recorded from the previous search:
# {'n_features': 150, 'max_iterations': 100, 'lambda_': 1, 'learning_rate': 0.1, 'intercept': True}

# Refine n_features and max_iterations around that result; the rest stays fixed.
param_grid = dict(
    n_features=[140, 150, 160],
    max_iterations=[90, 100, 110],
    lambda_=[1],
    learning_rate=[0.1],
    intercept=[True],
)

best_model, best_score, best_params = custom_grid_search(CFRecommender, param_grid, Ynorm, R)

In [None]:
# Best parameters recorded from the previous search:
# {'n_features': 160, 'max_iterations': 110, 'lambda_': 1, 'learning_rate': 0.1, 'intercept': True}

# Shift the n_features / max_iterations window upwards.
param_grid = dict(
    n_features=[160, 170, 180],
    max_iterations=[100, 110, 120],
    lambda_=[1],
    learning_rate=[0.1],
    intercept=[True],
)

best_model, best_score, best_params = custom_grid_search(CFRecommender, param_grid, Ynorm, R)

In [None]:
# Best result recorded from the previous search:
# {'n_features': 160, 'max_iterations': 110, 'lambda_': 1, 'learning_rate': 0.1, 'intercept': True}, Score: 3309.7440759128704

# Re-centre n_features on 160 and probe longer training runs.
param_grid = dict(
    n_features=[150, 160, 170],
    max_iterations=[110, 120, 130],
    lambda_=[1],
    learning_rate=[0.1],
    intercept=[True],
)

best_model, best_score, best_params = custom_grid_search(CFRecommender, param_grid, Ynorm, R)

In [None]:
#len(np.where((ratings_matrix_50000 == 0).all(axis=1))[0])