# Recommender Systems: Lab. 3

#### Nikolaos Athanasopoulos
#### Zoltan Kunos

## Kaggle Collaborative Filtering Approach

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from google.colab import files

# Load the training ratings from 'train.csv' in the working directory.
data = pd.read_csv('train.csv')
# Show the first rows to check which columns were read.
data.head()

In [None]:
# Relabel columns in place: 'release_date' becomes 'type', and the 'sex' and
# 'age' labels are swapped (presumably the CSV header has them reversed — TODO confirm).
data.rename(columns={'release_date': 'type','sex': 'age', 'age': 'sex'}, inplace=True)

In [None]:
# Display the first rows again to check the renamed columns.
data.head()

In [None]:
#### Create a function that allows us to divide the dataset into:
#### training and test
def assign_to_set(df, test_fraction=0.2):
    """Flag a random subset of the rows of ``df`` as test rows.

    Row labels are drawn without replacement with ``np.random.choice`` and the
    ``'for_testing'`` column is set to ``True`` for them. ``df`` is modified in
    place and also returned, which makes the function usable with
    ``groupby(...).apply``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are to be split.
    test_fraction : float, default 0.2
        Share of rows to flag; the row count is rounded up, so every
        non-empty frame gets at least one test row when the fraction is > 0.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``'for_testing'`` updated.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError("test_fraction must be between 0 and 1")
    n_test = np.int64(np.ceil(df.index.size * test_fraction))
    sampled_ids = np.random.choice(df.index, size=n_test, replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

def create_train_test(data,key = 'user_id'):
    """Split ``data`` into a training and a test frame, group by group.

    A boolean ``'for_testing'`` column is added to ``data`` itself (the caller's
    frame is modified). ``assign_to_set`` is then applied to every group defined
    by ``key`` to flag the rows held out for testing.

    Returns a ``(data_train, data_test)`` tuple of row selections of ``data``.
    """
    # Every rating starts in the training set.
    data['for_testing'] = False
    flagged = data.groupby(key, group_keys=False).apply(assign_to_set)
    test_flag = flagged.for_testing
    # Rows used to train the model.
    train_part = data[test_flag == False]
    # Rows used to evaluate the model.
    test_part = data[test_flag == True]
    return train_part, test_part


In [None]:
# Split the ratings into train/test partitions (grouped by user, the default
# key of create_train_test) and print the shape of each partition.
train, test =  create_train_test(data)
print(train.shape, test.shape)

# Report the number of ratings per partition and the number of distinct
# movies in the full dataset.
print("Training data_set has "+ str(train.shape[0]) +" ratings")
print("Test data set has "+ str(test.shape[0]) +" ratings")
print("La BD has ", data.movie_id.nunique(), " movies")

In [None]:
# Preview the first rows of the training partition.
train.head()

In [None]:
# Preview the first rows of the test partition.
test.head()

In [None]:
def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and targets."""
    squared_errors = np.square(y_pred - y_true)
    return np.sqrt(np.mean(squared_errors))

## Add another Loss Function
def log_loss(self, pred, real):
    """Return the logistic loss ``log(1 + exp(-pred * real))``.

    Parameters
    ----------
    self : object
        Unused. The parameter is kept only so existing call sites that pass
        it keep working; ``None`` is fine.
    pred, real : float or array-like
        Predicted score(s) and target(s); broadcast against each other.

    Returns
    -------
    float or numpy.ndarray
        Element-wise loss.
    """
    # logaddexp(0, x) == log(exp(0) + exp(x)) without overflowing exp() when
    # x is large, where the naive form returns inf.
    return np.logaddexp(0.0, -np.asarray(pred) * np.asarray(real))

def precision(recommended_items, relevant_items):
    """Return the fraction of recommended items that are relevant.

    Parameters
    ----------
    recommended_items : sequence
        Item ids proposed by the recommender.
    relevant_items : array-like
        Item ids considered relevant for the user; duplicates are allowed.

    Returns
    -------
    float
        Precision in ``[0, 1]``; ``0.0`` when nothing was recommended.
    """
    # np.isin replaces the deprecated np.in1d; uniqueness is not assumed
    # because duplicated ids would otherwise give wrong membership results.
    is_relevant = np.isin(recommended_items, relevant_items)
    if is_relevant.size == 0:
        return 0.0
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    """Return the fraction of relevant items that were recommended.

    Parameters
    ----------
    recommended_items : sequence
        Item ids proposed by the recommender.
    relevant_items : array-like
        Item ids considered relevant for the user.

    Returns
    -------
    float
        Number of recommended items that are relevant divided by the number
        of relevant items; ``0.0`` when there are no relevant items.
    """
    relevant_items = np.asarray(relevant_items)
    if relevant_items.size == 0:
        # Avoid 0/0 when the user has no relevant items.
        return 0.0
    # np.isin replaces the deprecated np.in1d and does not assume unique ids.
    is_relevant = np.isin(recommended_items, relevant_items)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):
    """Return the average precision of a ranked recommendation list.

    Precision@k is accumulated only at the ranks k that hold a relevant item,
    and the sum is divided by ``min(#relevant, #recommended)``.

    Parameters
    ----------
    recommended_items : sequence
        Item ids in ranked order (best first).
    relevant_items : array-like
        Item ids considered relevant for the user.

    Returns
    -------
    float
        Average precision; ``0.0`` when either list is empty.
    """
    relevant_items = np.asarray(relevant_items)
    # np.isin replaces the deprecated np.in1d and does not assume unique ids.
    is_relevant = np.isin(recommended_items, relevant_items)
    max_hits = min(relevant_items.shape[0], is_relevant.shape[0])
    if max_hits == 0:
        return 0.0
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / max_hits

    return ap_score

In [None]:
def evaluate(estimate_f,data_train,data_test):
    """Return the RMSE of ``estimate_f`` on the ratings in ``data_test``.

    Parameters
    ----------
    estimate_f : callable
        Called as ``estimate_f(user_id, movie_id)``; returns a predicted rating.
    data_train : pandas.DataFrame
        Training ratings; only its ``user_id`` column is read, to find out
        which users are known.
    data_test : pandas.DataFrame
        Ratings to score, with ``user_id``, ``movie_id`` and ``rating`` columns.

    Returns
    -------
    float
        Root mean squared error computed by ``compute_rmse``.
    """
    # `u in series` tests the Series *index*, not its values, so membership
    # must be checked against the set of user ids themselves.
    known_users = set(data_train.user_id)
    ids_to_estimate = zip(data_test.user_id, data_test.movie_id)
    # Users absent from the training data get the constant rating 3.
    estimated = np.array([estimate_f(u,i) if u in known_users else 3 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values

    return compute_rmse(estimated, real)


def evaluate_algorithm_top(test, recommender_object, at=25, thr_relevant = 4):
    """Print mean precision, recall and MAP of top-``at`` recommendations.

    Parameters
    ----------
    test : pandas.DataFrame
        Held-out ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
    recommender_object : object
        Must provide ``predict_top(user_id, at=...)`` returning item ids.
    at : int, default 25
        Length of the recommendation list requested per user.
    thr_relevant : number, default 4
        A test rating at or above this value marks the item as relevant.

    Users without any relevant test item are skipped. Nothing is returned;
    the averaged metrics are printed.
    """
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0

    for user_id in tqdm(test.user_id.unique()):

        relevant_items = test[(test.user_id==user_id )&( test.rating>=thr_relevant)].movie_id.values

        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.predict_top(user_id, at=at)
        num_eval+=1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_AP += AP(recommended_items, relevant_items)

    if num_eval == 0:
        # No user could be evaluated: averaging would divide by zero.
        print("Recommender results are undefined: no user has relevant items in the test set")
        return

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))

In [None]:
from tqdm import tqdm # conda install -y tqdm

class CollaborativeFiltering:
    """Memory-based collaborative filtering with a shrunk cosine similarity.

    ``_type='user'`` compares users (rows of the rating matrix) and
    ``_type='item'`` compares items (columns). Ratings are predicted as a
    similarity-weighted average of the known ratings.
    """

    def __init__(self, _type = 'user'):
        """Store the neighbourhood type.

        Raises
        ------
        ValueError
            If ``_type`` is neither ``'user'`` nor ``'item'``.
        """
        if _type not in ('user', 'item'):
            raise ValueError("_type must be 'user' or 'item', got {!r}".format(_type))
        self._type = _type

    @staticmethod
    def _shrunk_cosine(vectors, shrink):
        """Return the pairwise shrunk cosine similarity of the rows of ``vectors``."""
        norms = np.sqrt(np.power(vectors, 2).sum(axis=1)).ravel()
        # The small constant keeps the denominator non-zero for all-zero rows.
        return vectors.dot(vectors.T) / (np.outer(norms, norms) + shrink + 1e-6)

    def fit(self,df_train,shrink = 10):
        """Build the rating matrix, the id/index lookups and the similarity matrix.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Ratings with ``user_id``, ``movie_id``, ``rating`` and ``title`` columns.
        shrink : number, default 10
            Constant added to the cosine denominator.
        """
        print("Learning...")
        self.df_train=df_train
        # Fallback prediction, computed once instead of on every call.
        self.mean_rating = df_train.rating.mean()

        # One pivot provides the matrix (users x movies, 0 = not rated) and both axes.
        pivot = pd.pivot_table(df_train[['user_id','movie_id','rating']],
                               columns='movie_id', index='user_id',
                               values='rating', fill_value=0)
        self.urm_train = pivot.values
        self.index2item_id = np.array(pivot.columns)
        self.index2user_id = np.array(pivot.index)

        self.item_id2index = {item_id: idx for idx, item_id in enumerate(self.index2item_id)}
        self.user_id2index = {user_id: idx for idx, user_id in enumerate(self.index2user_id)}

        self.num_items = len(self.index2item_id)
        self.num_users = len(self.index2user_id)
        self.movie_id2title = {
            row[0]: row[1]
            for row in df_train[['movie_id','title']].drop_duplicates().values
        }

        if self._type == 'user': # USER BASED
            print("Computing user similarities")
            self.sim_matrix = self._shrunk_cosine(self.urm_train, shrink)
        else: ## ITEM BASED
            print("Computing item similarities")
            self.sim_matrix = self._shrunk_cosine(self.urm_train.T, shrink)

    def predict_score(self, user_id, movie_id):
        """Predict the rating of ``movie_id`` by ``user_id``.

        Returns the mean training rating when the movie is unknown or when no
        neighbour with non-zero similarity has rated it.

        Raises
        ------
        KeyError
            If ``user_id`` was not present in the training data.
        """
        if movie_id not in self.item_id2index:
            return self.mean_rating

        user_index = self.user_id2index[user_id]
        item_index = self.item_id2index[movie_id]

        if self._type == 'user':
            ratings = self.urm_train.T[item_index,:]
            weights = self.sim_matrix[user_index,:]
        else:
            ratings = self.urm_train[user_index,:]
            weights = self.sim_matrix[item_index,:]

        rating_num = ratings.dot(weights)
        # Only neighbours that actually rated contribute to the normaliser.
        rating_den = (ratings > 0).dot(weights)

        if rating_den == 0:
            return self.mean_rating
        return rating_num/rating_den

    def predict_top(self, user_id, at=5, remove_seen=True):
        """Return the ids of the ``at`` best-scored items for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            User to recommend for; unknown users raise ``KeyError``.
        at : int, default 5
            Maximum number of items returned.
        remove_seen : bool, default True
            When true, items the user rated in the training data are excluded.
        """
        candidates = set(self.df_train.movie_id.values)
        if remove_seen:
            seen_items = self.df_train[self.df_train.user_id==user_id].movie_id.values
            candidates = candidates - set(seen_items)

        predictions = [(item_id,self.predict_score(user_id,item_id)) for item_id in candidates]

        sorted_predictions = sorted(predictions, key=lambda x: x[1],reverse = True)[:at]
        return [i[0] for i in sorted_predictions]

In [None]:
# Fit a collaborative-filtering recommender with the default ('user') type on
# the training split, then predict a single rating as a smoke test.
rec_object = CollaborativeFiltering()
rec_object.fit(train)
rec_object.predict_score(user_id=2,movie_id=1)

In [None]:
# Load the Kaggle baseline file; the next cell reads its user_id column to
# decide which users get recommendations.
# NOTE(review): this rebinds `test`, replacing the held-out split created
# earlier. Later cells that pass `test` to evaluate() will see this frame
# instead — confirm this is intended.
test = pd.read_csv('kaggle_baseline.csv')

In [None]:
import csv
from tqdm import tqdm

# open the file in the write mode
# newline='' lets the csv module control line endings; without it, extra
# blank rows appear on platforms that translate '\n' on write.
with open('solution.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # Header row, then one row per user: id and space-separated item ids.
    writer.writerow(['user_id', 'prediction'])
    for user_id in tqdm(test.user_id.unique()):
        # Only the recommender call is expected to raise KeyError (user not
        # seen during training), so the try body is limited to that call.
        try:
            recommended_items = rec_object.predict_top(user_id, at=25)
        except KeyError as e:
            print(f"Error: {e}. User ID {user_id} not found.")
            continue
        list_relevants = ' '.join(str(elem) for elem in recommended_items)
        writer.writerow([str(user_id), list_relevants])


For the first three approaches, we followed the corresponding notebooks uploaded to Campus Virtual:
- Collaborative Filtering using SVD Decomposition
- The Vanilla Matrix Factorization Model
- The Vanilla Matrix Factorization Model with biases

## Collaborative Filtering using SVD Decomposition

In [None]:
from scipy import sparse
from scipy.linalg import sqrtm

class RecSys_mf():
    """Collaborative filtering based on a truncated SVD of the rating matrix.

    Missing ratings are filled with the per-item mean, the matrix is centred
    per item, and the centred matrix is approximated with its
    ``num_components`` largest singular components.
    """

    def __init__(self, num_components=10):
        """Store the number of singular components kept by ``fit``."""
        self.num_components=num_components

    def fit(self,df_train):
        """Decompose the rating matrix built from ``df_train``.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Ratings with ``user_id``, ``movie_id``, ``rating`` and ``title`` columns.

        After the call, ``self.Y_hat`` holds the reconstructed users x movies
        rating matrix.
        """
        self.train = df_train
        self.urm = pd.pivot_table(df_train[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating')

        # Lookups between matrix positions and the user/movie ids they hold.
        user_index = np.arange(len(self.urm.index))
        self.users = dict(zip(user_index,self.urm.index ))
        self.users_id2index = dict(zip(self.urm.index,user_index))

        movie_index = np.arange(len(self.urm.columns))
        self.movies = dict(zip(movie_index,self.urm.columns ))
        self.movies_id2index= dict(zip(self.urm.columns, movie_index))
        self.movies_index2id= dict(zip(movie_index,self.urm.columns))
        self.movie_id2title = dict(df_train.groupby(by=['movie_id','title']).count().index)

        # Number of ratings per movie. Read from this instance's own data,
        # not from whichever object the global name `reco` points at.
        self.pop_items = self.train.groupby('movie_id').count()[['rating']]

        train_matrix = np.array(self.urm, dtype=float)
        missing = np.isnan(train_matrix)
        item_means = np.nanmean(train_matrix, axis=0)

        # Missing entries take the item's average rating, so that they become
        # exactly zero once the per-item average is removed.
        train_matrix = np.where(missing, item_means, train_matrix)
        centered = train_matrix - item_means
        U, s, V = np.linalg.svd(centered, full_matrices=False)

        # Rank-k reconstruction U_k diag(s_k) V_k; scaling the columns of U_k
        # by s_k avoids building the diagonal matrix and its matrix square root.
        k = self.num_components
        Y_hat = (U[:, :k] * s[:k]).dot(V[:k, :])
        self.Y_hat = Y_hat + item_means

    def predict_score(self, user_id, movie_id):
        """Return the reconstructed rating, or 3 for a movie unseen in training.

        Raises
        ------
        KeyError
            If ``user_id`` was not present in the training data (and the movie is known).
        """
        if movie_id in self.movies_id2index:
            return self.Y_hat[self.users_id2index[user_id],self.movies_id2index[movie_id]]
        else: # in case it is a new movie
            return 3

    def predict_top(self, user_id, at=5, filter_pop = 100, remove_seen=True):
        """Return the ids of the ``at`` best-scored items for ``user_id``.

        Parameters
        ----------
        user_id : hashable
            User to recommend for.
        at : int, default 5
            Maximum number of items returned.
        filter_pop : int, default 100
            Accepted for compatibility; no popularity filter is applied.
        remove_seen : bool, default True
            When true, items the user rated in the training data are excluded.
        """
        candidates = set(self.train.movie_id.values)
        if remove_seen:
            seen_items = self.train[self.train.user_id==user_id].movie_id.values
            candidates = candidates - set(seen_items)

        predictions = [(item_id,self.predict_score(user_id,item_id)) for item_id in candidates]

        sorted_predictions = sorted(predictions, key=lambda x: x[1],reverse = True)[:at]
        return [i[0] for i in sorted_predictions]

In [None]:
def evaluate(predict_f,data_train,data_test):
    """Score ``predict_f`` on ``data_test`` and return the resulting RMSE.

    ``predict_f(user_id, movie_id)`` is called for every test pair whose user
    appears in ``data_train``; pairs with an unknown user are given the
    constant rating 3.
    """
    known_users = set(data_train.user_id)
    predictions = []
    for u, i in zip(data_test.user_id, data_test.movie_id):
        predictions.append(predict_f(u, i) if u in known_users else 3)
    estimated = np.array(predictions)
    return compute_rmse(estimated, data_test.rating.values)

def compute_rmse(y_pred, y_true):
    """Root mean squared error of ``y_pred`` with respect to ``y_true``."""
    errors = y_pred - y_true
    mean_squared_error = np.mean(np.power(errors, 2))
    return np.sqrt(mean_squared_error)

def precision(recommended_items, relevant_items):
    """Share of the recommended items that appear in ``relevant_items``.

    Returns ``0.0`` for an empty recommendation list instead of dividing by
    zero. ``relevant_items`` may contain duplicated ids.
    """
    # np.isin is the supported replacement for the deprecated np.in1d, and
    # dropping assume_unique keeps the result correct for duplicated ids.
    is_relevant = np.isin(recommended_items, relevant_items)
    if is_relevant.size == 0:
        return 0.0
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    """Share of ``relevant_items`` that appear in the recommendation list.

    Returns ``0.0`` when there are no relevant items instead of dividing by
    zero. ``relevant_items`` may be any array-like.
    """
    relevant_items = np.asarray(relevant_items)
    if relevant_items.size == 0:
        return 0.0
    # np.isin is the supported replacement for the deprecated np.in1d.
    is_relevant = np.isin(recommended_items, relevant_items)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    total = precision + recall
    return 0.0 if total == 0 else 2 * (precision * recall) / total

def AP(recommended_items, relevant_items):
    """Average precision of the ranked list ``recommended_items``.

    Precision@k is summed over the ranks that hold a relevant item and divided
    by ``min(#relevant, #recommended)``. Returns ``0.0`` when either list is
    empty instead of dividing by zero.
    """
    relevant_items = np.asarray(relevant_items)
    # np.isin is the supported replacement for the deprecated np.in1d.
    is_relevant = np.isin(recommended_items, relevant_items)
    max_hits = min(relevant_items.shape[0], is_relevant.shape[0])
    if max_hits == 0:
        return 0.0
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / max_hits

    return ap_score

## Divide the data in two sets: training and test
def assign_to_set(df):
    """Mark a random 20% of the rows of ``df`` (rounded up) as test rows.

    The ``'for_testing'`` column of the sampled rows is set to ``True`` in
    place, and the same frame is returned.
    """
    n_test = np.int64(np.ceil(df.index.size * 0.2))
    test_rows = np.random.choice(df.index, size=n_test, replace=False)
    df.loc[test_rows, 'for_testing'] = True
    return df

def evaluate_algorithm_top(test, recommender_object, at=25, thr_relevant = 0.85):
    """Print mean precision, recall, MAP and F1 of top-``at`` recommendations.

    Parameters
    ----------
    test : pandas.DataFrame
        Held-out ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
    recommender_object : object
        Must provide ``predict_top(user_id, at=...)`` returning item ids.
    at : int, default 25
        Length of the recommendation list requested per user.
    thr_relevant : float, default 0.85
        Quantile of each user's own test ratings; items rated at or above
        that quantile are treated as relevant for the user.

    F1 is computed from the averaged precision and recall. Nothing is
    returned; the metrics are printed.
    """
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0
    num_eval = 0

    for user_id in tqdm(test.user_id.unique()):

        user_ratings = test[test.user_id==user_id]
        thr = np.quantile(user_ratings.rating,thr_relevant)
        relevant_items = np.array(user_ratings[user_ratings.rating >=thr].movie_id.values)
        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.predict_top(user_id, at=at)
        num_eval+=1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_AP += AP(recommended_items, relevant_items)

    if num_eval == 0:
        # Nothing was evaluated (e.g. empty test frame): averaging would divide by zero.
        print("Recommender results are undefined: no user could be evaluated")
        return

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    MAP = cumulative_AP / num_eval
    f1 = f1_score(cumulative_precision, cumulative_recall)

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}, F1 = {:.4f}".format(
    cumulative_precision, cumulative_recall, MAP, f1))
    
    

In [None]:
from tqdm import tqdm

# Fit the SVD recommender with 30 components on the training split, then
# report its RMSE and top-25 ranking metrics.
# NOTE(review): `test` was re-assigned from 'kaggle_baseline.csv' in an earlier
# cell; confirm it still holds ratings (a `rating` column) when this runs.
reco = RecSys_mf(num_components=30)
reco.fit(train)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict_score,train,test))
evaluate_algorithm_top(test, reco, at = 25)

## The Vanilla Matrix Factorization Model

In [None]:
from scipy import sparse

class RecSys_vanilla_mf(RecSys_mf):
    """Matrix factorization trained with stochastic gradient descent.

    A rating is predicted as the dot product of a user latent vector and an
    item latent vector; both are learned from the observed ratings with
    L2 regularisation. ``predict_top`` and ``__init__`` come from ``RecSys_mf``.
    """

    def __sdg__(self):
        """Run one SGD pass over ``self.training_indices``."""
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            user_id = self.users[u]
            item_id = self.movies[i]

            prediction = self.predict_score(user_id, item_id)
            error = (self.ratings[u,i] - prediction) # error

            # Snapshot the user factors so that the item step below uses the
            # values the error was computed with, not the freshly updated ones.
            user_vec = self.user_vecs[u, :].copy()

            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (error * self.item_vecs[i, :] - \
                                     self.lmbda * user_vec)
            self.item_vecs[i, :] += self.learning_rate * \
                                    (error * user_vec - \
                                     self.lmbda * self.item_vecs[i,:])


    def fit(self,df_train,df_val, n_epochs = 10,learning_rate =0.001,lmbda=0.1,verbose =True):
        """Learn the user and item latent vectors.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Training ratings with ``user_id``, ``movie_id``, ``rating`` and ``title`` columns.
        df_val : pandas.DataFrame
            Ratings used to report the per-epoch "Test rmse".
        n_epochs : int, default 10
            Number of passes over the training samples.
        learning_rate : float, default 0.001
            SGD step size.
        lmbda : float, default 0.1
            L2 regularisation weight.
        verbose : bool, default True
            When true, the learning curves are plotted after training.
        """
        self.train = df_train
        self.val = df_val
        self.urm = pd.pivot_table(df_train[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating')

        # Lookups between matrix positions and the user/movie ids they hold.
        user_index = np.arange(len(self.urm.index))
        self.users = dict(zip(user_index,self.urm.index ))
        self.users_id2index = dict(zip(self.urm.index,user_index))

        movie_index = np.arange(len(self.urm.columns))
        self.movies = dict(zip(movie_index,self.urm.columns ))
        self.movies_id2index= dict(zip(self.urm.columns, movie_index))
        self.movies_index2id= dict(zip(movie_index,self.urm.columns))
        self.movie_id2title = dict(df_train.groupby(by=['movie_id','title']).count().index)

        self.verbose = verbose
        self.learning_rate = learning_rate
        self.lmbda = lmbda

        self.mean_rating = self.train.rating.mean()

        # Unrated cells become 0 and the non-zero cells are the training samples.
        self.ratings = np.float32(self.urm.fillna(0).values)
        self.n_users, self.n_items = self.urm.shape
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)

        self.train_rmse =[]
        self.test_rmse = []

        # initialize latent vectors
        self.user_vecs = self.mean_rating*np.random.normal(scale=1./self.num_components,\
                                          size=(self.n_users, self.num_components))
        self.item_vecs = self.mean_rating*np.random.normal(scale=1./self.num_components,
                                          size=(self.n_items, self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))

            self.training_indices = np.arange(self.n_samples)

            #shuffle training samples
            np.random.shuffle(self.training_indices)
            self.__sdg__()

            self.train_rmse.append(evaluate(self.predict_score,self.train,self.train))
            self.test_rmse.append(evaluate(self.predict_score,self.train,self.val))

            print('\tTrain rmse: %s' % self.train_rmse[-1])
            print('\tTest rmse: %s' % self.test_rmse[-1])

        if(self.verbose):
            self.__plot_learning_curves__()

    def __plot_learning_curves__(self):
        """Plot the per-epoch train and validation RMSE recorded by ``fit``."""
        plt.plot(self.train_rmse,'--o',label="train_error")
        plt.plot(self.test_rmse,'--o',label="test_error")
        plt.legend()
        plt.show()

    def predict_score(self, user_id, movie_id):
        """Predict one rating as the dot product of the latent vectors.

        Returns the mean training rating for a movie unseen in training.
        Raises ``KeyError`` for a user unseen in training.
        """
        user_index = self.users_id2index[user_id]
        if movie_id in self.movies_id2index:
            item_index = self.movies_id2index[movie_id]
            prediction =  self.user_vecs[user_index, :].dot(self.item_vecs[item_index, :].T)
        else:
            prediction = self.mean_rating # this is a new movie

        return prediction
    

In [None]:
# Train a 5-factor model for 5 epochs with learning rate 0.01 and
# regularisation 0.5, then report its RMSE on `test`.
reco = RecSys_vanilla_mf(num_components=5)
reco.fit(train,test, n_epochs = 5,learning_rate=0.01,lmbda=0.5)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict_score,train,test))

In [None]:
# Same configuration as the previous run but with the default regularisation
# (lmbda=0.1 in fit's signature).
reco = RecSys_vanilla_mf(num_components=5)
reco.fit(train,test, n_epochs = 5,learning_rate=0.01)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict_score,train,test))

In [None]:
# Longer training: 15 epochs with learning rate 0.02 and the default regularisation.
reco = RecSys_vanilla_mf(num_components=5)
reco.fit(train,test, n_epochs = 15,learning_rate=0.02)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict_score,train,test))

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Query movie: the second assignment overrides the first, so 364 is used.
movie_id = 1 #'Toy Story (1995)'
movie_id = 364 #'Lion King, The (1994)'

# Euclidean distance between every pair of item latent vectors.
pairwise_distances = euclidean_distances(reco.item_vecs, reco.item_vecs)
# (title, matrix index) of the 6 items with the smallest distance to the query;
# the query itself is normally among them, since its self-distance is the minimum.
[(reco.movie_id2title[reco.movies_index2id[item]],
  item) for item in np.argsort(pairwise_distances[reco.movies_id2index[movie_id]])[0:6]]

In [None]:
# Print the top-25 ranking metrics of the most recently trained model.
evaluate_algorithm_top(test, reco, at = 25)

## The Vanilla Matrix Factorization Model with biases

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

class RecSys_vanilla_mf_biases(RecSys_vanilla_mf):
    """Matrix factorization with user and item biases, trained with SGD.

    A rating is predicted as
    ``mean_rating + user_vec . item_vec + bias_item + bias_user``.
    """

    def __sdg__(self):
        """Run one SGD pass over ``self.training_indices``, updating factors and biases."""
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            user_id = self.users[u]
            item_id = self.movies[i]

            prediction = self.predict_score(user_id, item_id)
            error = (self.ratings[u,i] - prediction) # error

            # Snapshot the user factors so that the item step below uses the
            # values the error was computed with, not the freshly updated ones.
            user_vec = self.user_vecs[u, :].copy()

            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (error * self.item_vecs[i, :] - self.lmbda * user_vec)
            self.item_vecs[i, :] += self.learning_rate * \
                                    (error * user_vec - self.lmbda * self.item_vecs[i,:])

            self.bias_item[i] += self.learning_rate * (error - self.lmbda * self.bias_item[i])
            self.bias_user[u] += self.learning_rate * (error - self.lmbda * self.bias_user[u])


    def fit(self,df_train,df_val, n_epochs = 10,learning_rate =0.001,lmbda=0.1,verbose =True):
        """Train the model.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Training ratings with ``user_id``, ``movie_id``, ``rating`` and ``title`` columns.
        df_val : pandas.DataFrame
            Ratings used to report the per-epoch "Test rmse".
        n_epochs : int, default 10
            Number of passes over the training samples.
        learning_rate : float, default 0.001
            SGD step size.
        lmbda : float, default 0.1
            L2 regularisation weight for factors and biases.
        verbose : bool, default True
            When true, the learning curves are plotted after training.
        """
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.lmbda = lmbda

        self.train = df_train
        self.val = df_val

        self.urm = pd.pivot_table(df_train[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating')

        # Lookups between matrix positions and the user/movie ids they hold.
        user_index = np.arange(len(self.urm.index))
        self.users = dict(zip(user_index,self.urm.index ))
        self.users_id2index = dict(zip(self.urm.index,user_index))

        movie_index = np.arange(len(self.urm.columns))
        self.movies = dict(zip(movie_index,self.urm.columns ))
        self.movies_id2index= dict(zip(self.urm.columns, movie_index))
        self.movies_index2id= dict(zip(movie_index,self.urm.columns))
        self.movie_id2title = dict(df_train.groupby(by=['movie_id','title']).count().index)

        self.mean_rating = self.train.rating.mean()

        # Unrated cells become 0 and the non-zero cells are the training samples.
        self.ratings = np.float32(self.urm.fillna(0).values)
        self.n_users, self.n_items = self.urm.shape
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)
        print(self.n_samples)
        self.train_rmse =[]
        self.test_rmse = []

        # initialize latent vectors
        self.user_vecs = self.mean_rating*np.random.normal(scale=1./self.num_components,\
                                          size=(self.n_users, self.num_components))
        self.item_vecs = self.mean_rating*np.random.normal(scale=1./self.num_components,
                                          size=(self.n_items, self.num_components))
        self.bias_item = np.random.normal(scale=1/self.n_items,size=(self.n_items))
        self.bias_user = np.random.normal(scale=1/self.n_users,size=(self.n_users))

        for epoch in range(n_epochs):

            print('Epoch: {}'.format(epoch))

            self.training_indices = np.arange(self.n_samples)

            #shuffle training samples
            np.random.shuffle(self.training_indices)
            self.__sdg__()

            # Score this instance, not whatever the global name `reco` refers to.
            self.train_rmse.append(evaluate(self.predict_score,self.train,self.train))
            self.test_rmse.append(evaluate(self.predict_score,self.train,self.val))

            print('\tTrain rmse: %s' % self.train_rmse[-1])
            print('\tTest rmse: %s' % self.test_rmse[-1])

        if(self.verbose):
            self.__plot_learning_curves__()

    def predict_score(self, user_id, movie_id):
        """Predict one rating from the global mean, the latent vectors and both biases.

        Returns the mean training rating for a movie unseen in training.
        Raises ``KeyError`` for a user unseen in training.
        """
        user_index = self.users_id2index[user_id]
        if movie_id in self.movies_id2index:
            item_index = self.movies_id2index[movie_id]
            prediction =  self.mean_rating + self.user_vecs[user_index, :].dot(self.item_vecs[item_index, :].T) + self.bias_item[item_index] + self.bias_user[user_index]
        else:
            prediction = self.mean_rating # this is a new movie

        return prediction
    

In [None]:
# Train the biased 5-factor model for 5 epochs with learning rate 0.02 and
# report its RMSE on `test`.
reco = RecSys_vanilla_mf_biases(num_components=5)
reco.fit(train,test, n_epochs = 5,learning_rate=0.02)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict_score,train,test))

In [None]:
# Print the top-25 ranking metrics of the biased model trained above.
evaluate_algorithm_top(test, reco, at = 25)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Query movie; the alternative query is left commented out.
movie_id = 1 #'Toy Story (1995)'
#movie_id = 364 #'Lion King, The (1994)'

# Euclidean distance between every pair of item latent vectors.
pairwise_distances = euclidean_distances(reco.item_vecs, reco.item_vecs)
# (title, matrix index) of the 6 items with the smallest distance to the query.
[(reco.movie_id2title[reco.movies_index2id[item]],
  item) for item in np.argsort(pairwise_distances[reco.movies_id2index[movie_id]])[0:6]]

In [None]:
# Larger configuration: 100 factors, 50 epochs, learning rate 0.02, regularisation 0.1.
reco = RecSys_vanilla_mf_biases(num_components=100)
reco.fit(train,test, n_epochs = 50,learning_rate=0.02,lmbda=0.1)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Query movie; the alternative query is left commented out.
movie_id = 1 #'Toy Story (1995)'
#movie_id = 364 #'Lion King, The (1994)'

# Repeat the nearest-item lookup with the item vectors of the 100-factor model.
pairwise_distances = euclidean_distances(reco.item_vecs, reco.item_vecs)
# (title, matrix index) of the 6 items with the smallest distance to the query.
[(reco.movie_id2title[reco.movies_index2id[item]],
  item) for item in np.argsort(pairwise_distances[reco.movies_id2index[movie_id]])[0:6]]

In [None]:
# Print the top-25 ranking metrics of the 100-factor biased model.
evaluate_algorithm_top(test, reco, at = 25)

- Sampling the input data
- Changing the Loss Function

## The Approach We Finally Selected and Worked With

## Matrix Factorization with Keras

Matrix Factorization (MF) is a popular approach for building movie recommender systems. The main idea behind MF is to represent the user-item interaction matrix as a product of two low-rank matrices, which can be thought of as user and item embeddings. One of the main advantages of using MF in movie recommender systems is that it can handle large and sparse datasets efficiently.

Another advantage of using MF is that it can handle cold-start and new-item problems. Cold-start refers to the scenario where there is no information available about a new user or a new item, making it difficult to make recommendations. MF can handle cold-start problems by using the low-rank embeddings to generalize across similar users or items. Similarly, when a new item is added to the system, MF can use the existing user and item embeddings to make predictions about its rating.

Overall, MF is a powerful approach for building movie recommender systems because it can handle large and sparse datasets efficiently, can handle cold-start and new-item problems, and can learn low-dimensional representations that capture the underlying structure of the user-item interaction matrix.



The steps that we are going to follow in this approach are the following:
- Map user ID to a "user vector" via an embedding matrix
- Map movie ID to a "movie vector" via an embedding matrix
- Compute the dot product between the user vector and the movie vector to obtain a match score between the user and the movie (the predicted rating).
- Train the embeddings via gradient descent using all known user-movie pairs.

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm

<b>Preprocessing</b>

First, we need to perform some preprocessing to encode users and movies as integer indices.

In [None]:
# Forward and reverse lookups between raw user ids and contiguous integer
# codes (0..n-1, numbered in the order returned by unique()).
user_ids = data["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
# Same lookups for movie ids.
movie_ids = data["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
# Add the encoded ids to the frame as the new columns "user" and "movie".
data["user"] = data["user_id"].map(user2user_encoded)
data["movie"] = data["movie_id"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
# Store ratings as float32.
data["rating"] = data["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(data["rating"])
max_rating = max(data["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

In [None]:
# Preview the frame with the new "user" and "movie" columns.
data.head()

<b>Loss Function</b>

Bayesian Personalized Ranking (BPR) is a loss function that is commonly used in recommendation systems and more specifically a pairwise learning-to-rank algorithm that aims to learn a personalized ranking model, that can accurately predict the preferences of a user for items based on their interactions with the system.

The BPR loss function is useful in movie recommender systems because it takes into account not only the positive feedback (i.e., user ratings) but also the negative feedback (i.e., items that were not rated by the user). This is important because in many recommendation scenarios, users may only interact with a small subset of the available items, and negative feedback can be a valuable source of information for learning personalized user preferences.

The BPR loss function is based on the assumption that a user prefers items that they have rated higher than items they have not rated. The bpr_loss function takes two input arguments: y_true and y_pred, which represent the true and predicted values for the pairwise preferences. The function calculates the pairwise difference matrix by taking the transpose of y_pred and subtracting it from y_pred. This matrix contains the differences between the predicted values for each pair of items. A mask is created to only consider pairs where the true value for the first item in the pair is greater than the true value for the second item in the pair. This mask is created by applying the greater function to the pairwise difference matrix. The logarithm of the sigmoid of the pairwise differences is calculated using the log_sigmoid function from TensorFlow's math module. This gives a measure of how likely the model is to predict the first item as being preferred over the second item. The mask is applied to the pairwise log sigmoid matrix by element-wise multiplication. This effectively removes the terms where the true preference for the second item is greater than the true preference for the first item. The loss is calculated as the mean of the masked pairwise log sigmoid. This is divided by the sum of the mask to ensure that only the relevant terms are included in the loss calculation. Finally, the loss value is returned as the output of the bpr_loss function.

In [None]:
def bpr_loss(y_true, y_pred):
    """Pairwise BPR-style ranking loss over all pairs inside one batch.

    For every ordered pair ``(i, j)`` of batch entries whose true values
    satisfy ``y_true[j] > y_true[i]``, the pair contributes
    ``-log(sigmoid(y_pred[j] - y_pred[i]))``. The result is the mean of
    these contributions over the selected pairs.

    Args:
        y_true: Tensor of true values; reshaped to ``(batch_size, 1)``.
        y_pred: Tensor of predicted scores; reshaped to ``(batch_size, 1)``.

    Returns:
        Scalar tensor with the mean pairwise loss, or 0 when the batch
        contains no pair with different true values.
    """
    # Reshape both inputs to column vectors of shape (batch_size, 1).
    y_pred = tf.reshape(y_pred, (-1, 1))
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), (-1, 1))

    # Row vector minus column vector broadcasts to (batch, batch);
    # entry [i, j] holds value[j] - value[i].
    pred_diff = tf.transpose(y_pred) - y_pred
    true_diff = tf.transpose(y_true) - y_true

    # Select pairs by the TRUE ordering. Building the mask from the predicted
    # differences would make the loss independent of the labels.
    mask = tf.cast(tf.greater(true_diff, 0), dtype=y_pred.dtype)

    # Penalty is small when the model scores the preferred item higher.
    pairwise_loss = -tf.math.log_sigmoid(pred_diff)
    masked_pairwise_loss = tf.multiply(pairwise_loss, mask)

    # divide_no_nan yields 0 instead of NaN when no pair was selected
    # (e.g. every rating in the batch is identical).
    loss = tf.math.divide_no_nan(
        tf.reduce_sum(masked_pairwise_loss), tf.reduce_sum(mask)
    )

    return loss


- Other loss functions that we tried, but which gave us worse results, are: MSE, Cross-Entropy, and LambdaLoss.

<b>MF Model</b>

We embed both users and movies into 20-dimensional vectors.

The model computes a match score between user and movie embeddings via a dot product, and adds a per-movie and per-user bias.

In [None]:
# Width of each embedding vector; passed to RecommenderNetV as embedding_size.
EMBEDDING_SIZE = 20

class RecommenderNetV(keras.Model):
    """Matrix-factorisation model with per-user and per-movie biases.

    Users and movies share one embedding table of ``num_users + num_movies``
    rows: user ``u`` uses row ``u`` and movie ``m`` uses row ``num_users + m``.
    The score for a (user, movie) pair is the dot product of the two
    embedding vectors plus the user bias and the movie bias.

    Args:
        num_users: Number of distinct user indices.
        num_movies: Number of distinct movie indices.
        embedding_size: Width of each embedding vector.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_movie_embedding = layers.Embedding(
            num_users + num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Score each (user, movie) row of ``inputs``.

        Args:
            inputs: Integer tensor whose column 0 holds user indices and
                column 1 holds movie indices.

        Returns:
            Tensor of shape ``(batch, 1)`` with one score per input row.
        """
        user_ids = inputs[:, 0]
        movie_ids = inputs[:, 1]

        user_vector = self.user_movie_embedding(user_ids)
        # Use the instance attribute, not a module-level global, so the
        # offset always matches the table this model was built with.
        movie_vector = self.user_movie_embedding(movie_ids + self.num_users)
        user_bias = self.user_bias(user_ids)
        movie_bias = self.movie_bias(movie_ids)

        # Row-wise dot product. tensordot(..., 2) would contract over the
        # batch axis as well and give one scalar shared by the whole batch.
        dot_user_movie = tf.reduce_sum(
            user_vector * movie_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        return dot_user_movie + user_bias + movie_bias


# Build the MF model and train it with the BPR ranking loss.
# Alternatives tried for the loss: tf.keras.losses.MeanSquaredError(),
# tf.keras.losses.BinaryCrossentropy().
mf_model = RecommenderNetV(
    num_users=num_users,
    num_movies=num_movies,
    embedding_size=EMBEDDING_SIZE,
)
mf_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-3),
    loss=bpr_loss,
)

<b>Split Training and Validation Data</b>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train, val = train_test_split(
    data,
    random_state=7,
    test_size=0.2,
)

In [None]:
# Inputs are the (user, movie) column pairs; the target is the rating column.
x_train, y_train = train[["user", "movie"]].values, train["rating"].values
x_val, y_val = val[["user", "movie"]].values, val["rating"].values

<b>Train and Evaluate MF model</b>

In [None]:
# Fit on the training split and report the loss on the validation split
# after every epoch.
mf_history = mf_model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=200,
    batch_size=128,
    verbose=1,
)

<b>Top Recommendations Function</b>

In [None]:
def top_recomendations(model_object, user_id, train, movie_df,at = 5):
    """Return the ``at`` highest-scored movies the user has no row for in ``train``.

    Relies on the module-level mappings ``user2user_encoded``,
    ``movie2movie_encoded`` and ``movie_encoded2movie`` defined elsewhere in
    the notebook.

    Args:
        model_object: Object exposing ``predict(array, verbose=0)`` that
            scores rows of ``[encoded_user, encoded_movie]``.
        user_id: User id as it appears in ``train.user_id``.
        train: DataFrame with ``user_id`` and ``movie_id`` columns; movies
            listed here for the user are excluded from the candidates.
        movie_df: DataFrame with a ``movie_id`` column listing candidates.
        at: Maximum number of recommendations to return.

    Returns:
        List of movie ids ordered from highest to lowest predicted score.
        Empty when ``at`` is not positive or there is no candidate movie.

    Raises:
        KeyError: If ``user_id`` is missing from ``user2user_encoded``.
    """
    # Index directly so that an unknown user raises KeyError here instead of
    # feeding None into the model and failing obscurely inside predict().
    user_encoder = user2user_encoded[user_id]

    if at <= 0:
        # A slice of [-0:] would return every candidate rather than none.
        return []

    movies_watched_by_user = train[train.user_id == user_id]

    movies_not_watched = movie_df[~movie_df["movie_id"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
    # Keep only candidates the encoder knows about.
    movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.keys())))
    if not movies_not_watched:
        # Nothing left to score; np.hstack would fail on empty inputs.
        return []
    movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]

    # One row per candidate: [encoded_user, encoded_movie].
    user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))

    ratings = model_object.predict(user_movie_array, verbose=0).flatten()
    # argsort is ascending: take the last `at` indices and reverse them.
    top_ratings_indices = ratings.argsort()[-at:][::-1]
    recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices]
    return recommended_movie_ids
    
# Quick check: print ten recommendations for user id 4.
r = top_recomendations(mf_model, 4, train, data, at=10)
print(r)

<b>Evaluation Metrics</b>

In [None]:
def precision(recommended_items, relevant_items):
    """Return the fraction of recommended items that are relevant.

    Args:
        recommended_items: Sequence of recommended item ids.
        relevant_items: Sequence of relevant item ids.

    Returns:
        Precision in ``[0, 1]``; ``0.0`` when nothing was recommended.
    """
    # np.isin replaces the deprecated np.in1d. Uniqueness is not assumed,
    # so duplicated ids cannot corrupt the membership test.
    is_relevant = np.isin(np.asarray(recommended_items).ravel(), relevant_items)
    if is_relevant.size == 0:
        # Avoid 0/0 -> NaN for an empty recommendation list.
        return 0.0
    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    """Return the fraction of relevant items that were recommended.

    Args:
        recommended_items: Sequence of recommended item ids.
        relevant_items: Sequence (list or array) of relevant item ids.

    Returns:
        Recall in ``[0, 1]``; ``0.0`` when there are no relevant items.
    """
    # Convert up front so plain lists work as well as ndarrays.
    relevant = np.asarray(relevant_items).ravel()
    if relevant.size == 0:
        # Avoid dividing by zero when nothing is relevant.
        return 0.0
    # np.isin replaces the deprecated np.in1d; uniqueness is not assumed.
    is_relevant = np.isin(np.asarray(recommended_items).ravel(), relevant)
    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant.size

    return recall_score

def AP(recommended_items, relevant_items):
    """Return the average precision of a ranked recommendation list.

    Precision@k is summed over every rank ``k`` that holds a relevant item,
    then divided by ``min(number of relevant items, list length)``.

    Args:
        recommended_items: Ranked sequence of recommended item ids.
        relevant_items: Sequence (list or array) of relevant item ids.

    Returns:
        Average precision in ``[0, 1]``; ``0.0`` when either input is empty.
    """
    recommended = np.asarray(recommended_items).ravel()
    relevant = np.asarray(relevant_items).ravel()

    normaliser = min(relevant.size, recommended.size)
    if normaliser == 0:
        # Avoid dividing by zero when either side is empty.
        return 0.0

    # np.isin replaces the deprecated np.in1d; uniqueness is not assumed.
    is_relevant = np.isin(recommended, relevant)
    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))
    ap_score = np.sum(p_at_k) / normaliser

    return ap_score

def evaluate_algorithm_top(train, test, recommender_object, movie_df, at=25, thr_relevant = 0.85):
    """Print mean precision, recall and MAP of top-``at`` recommendations.

    For each user in ``test``, the relevant items are the movies whose rating
    is at or above the ``thr_relevant`` quantile of that user's test ratings.
    Recommendations come from ``top_recomendations`` and are scored with
    ``precision``, ``recall`` and ``AP``; the scores are averaged over the
    evaluated users.

    Args:
        train: DataFrame passed to ``top_recomendations`` to exclude movies.
        test: DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
        recommender_object: Model passed through to ``top_recomendations``.
        movie_df: DataFrame of candidate movies for ``top_recomendations``.
        at: Number of recommendations requested per user.
        thr_relevant: Quantile in ``[0, 1]`` used as the relevance threshold.

    Returns:
        None. The averaged metrics are printed.
    """
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0

    for user_id in tqdm(test.user_id.unique()):
        user_rows = test[test.user_id == user_id]
        thr = np.quantile(user_rows.rating, thr_relevant)
        relevant_items = np.array(user_rows[user_rows.rating >= thr].movie_id.values)
        if len(relevant_items) == 0:
            # Nothing to measure against for this user.
            continue

        recommended_items = top_recomendations(recommender_object, user_id, train, movie_df, at=at)
        num_eval += 1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_AP += AP(recommended_items, relevant_items)

    if num_eval == 0:
        # Dividing the float accumulators by zero would raise
        # ZeroDivisionError (e.g. for an empty test frame).
        print("No users with relevant items to evaluate.")
        return

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))

In [None]:
# Score the MF model's top-25 recommendations against the validation split.
evaluate_algorithm_top(train, val, mf_model, data, at = 25)

<b>Export final csv File</b>

In [None]:
# Load the baseline file; its user_id column drives the export loop below.
# NOTE: this rebinds the name `test`.
test = pd.read_csv('kaggle_baseline.csv')

In [None]:
import csv
from tqdm import tqdm

# Write one row per user in `test`: the user id and a space-separated list of
# the 25 recommended movie ids.
# newline='' is required by the csv module; without it the writer's own line
# terminators get translated again and produce blank rows on some platforms.
with open('solution_5.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # Header row.
    writer.writerow(['user_id', 'prediction'])
    for user_id in tqdm(test.user_id.unique()):
        # Only the recommendation lookup is expected to raise KeyError, so
        # the try body is limited to that call.
        try:
            relevant_items = top_recomendations(mf_model, user_id, train, data, 25)
        except KeyError as e:
            print(f"Error: {e}. User ID {user_id} not found.")
            continue
        list_relevants = ' '.join([str(elem) for elem in relevant_items])
        writer.writerow([str(user_id), list_relevants])


With this method we obtained the following scores using different loss functions:
- 0.02496 when using the MSE loss function
- 0.04864 when using the BPR loss function

BPR gave us the best result.

We have also tried to use different learning rate values to get better results:

- 0.03
- 0.01
- 0.005
- 0.001

We also varied the embedding size to see whether we would get a better MAP score. We tried different values (10, 20, 25, 30, 35).

We also tried different Optimizers: Adam, Adagrad

Eventually, the best MAP score that we obtained was <b>0.04864</b>.