<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>

<i>Licensed under the MIT License.</i>

# Neural Collaborative Filtering on MovieLens dataset.

Neural Collaborative Filtering (NCF) is a well-known recommendation algorithm that generalizes the matrix factorization problem with a multi-layer perceptron.

This notebook provides an example of how to use and evaluate the NCF implementation in `reco_utils`. We use a small dataset in this example so that NCF runs efficiently with GPU acceleration on a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/).

In [1]:
# IPython magics: enable the autoreload extension in mode 2 so that edited
# modules are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from reco_utils.common.timer import Timer
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.dataset import movielens
from reco_utils.common.notebook_utils import is_jupyter
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.evaluation.python_evaluation import (precision_recall_hr_map_ndcg_at_k, rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

# Record the interpreter and library versions this notebook was run with.
print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.6.11 | packaged by conda-forge | (default, Nov 27 2020, 18:57:37) 
[GCC 9.3.0]
Pandas version: 1.1.5
Tensorflow version: 1.15.4


Set the default parameters.

In [3]:
# top k items to recommend (default cut-off for the ranking metrics)
TOP_K = 10

# Model parameters (defaults for the number of training epochs and batch size)
EPOCHS = 30
BATCH_SIZE = 256

# Random seed handed to the NCF dataset and model constructors
SEED = 42

In [4]:
from tqdm.notebook import tqdm
import os
import json
import numpy as np
import pandas as pd
from scipy.spatial import distance


# Default root directory of the rating files (used as Utility's base_path default)
BASE_PATH = '../../content/drive/MyDrive/CSE547_Final_Project/ml-100k/'

# Column names shared by every DataFrame handled in this notebook
COL_NAME_PREDICTION = "prediction"
COL_NAME_USER_ID = "userId"
COL_NAME_MOVIE_ID = "movieId"
# Items are movies here, so the item column is an alias of the movie column
COL_NAME_ITEM_ID = COL_NAME_MOVIE_ID
COL_NAME_RATING = "rating"
# Column holding the per-user list of (movie, rating) tuples built for evaluation
COL_NAME_MOVIE_RATINGS = "movies_and_ratings"

class Utility:
    """Locate and load the rating files for one split type and iteration.

    Every path is derived from `base_path`, the sub-directory registered for
    the split type in get_splits_and_paths(), and the `iter_{iteration}_fix`
    folder beneath it.
    """

    def __init__(self, split_type, base_path=BASE_PATH, iteration=1):
        """Init the utility class
        Keyword arguments
        split_type -- One of the keys of get_splits_and_paths().
        base_path -- The directory where files are located.
        iteration -- Iteration number, an integer from 1 to 5.

        Raises ValueError for an unknown split_type or iteration and OSError
        when base_path is not a directory.
        """
        splits_and_paths = self.get_splits_and_paths()
        if split_type not in splits_and_paths:
            # Fail here with a clear message instead of a later KeyError.
            raise ValueError(f"split_type must be one of {','.join(splits_and_paths)}")
        valid_iterations = range(1, 6)
        if iteration not in valid_iterations:
            raise ValueError(f"iteration must be one of {','.join(map(str, valid_iterations))}")
        if not os.path.isdir(base_path):
            raise OSError(f'{base_path} is not a directory')

        self.base_path = base_path
        self.anonymized_data_path = os.path.join(self.base_path, splits_and_paths[split_type], f"iter_{iteration}_fix")

        self.ratings_file_name = 'ratings.csv'

        self.training_file_name = 'ratings_train.csv'
        self.test_file_name = 'ratings_test.csv'
        self.validation_file_name = 'ratings_validation.csv'
        self.evaluation_file_name = 'evaluation.csv'
        self.user_movie_ratings_matrix_file_name = 'user_movie_ratings_matrix.csv'
        self.user_to_idx_file_name = "user_to_idx.json"
        self.movie_to_idx_file_name = "movie_to_idx.json"

        self.k_anonymized_postfix = '_anonymized.csv'
        self.k_anonymized_map_postfix = '_anonymized_idx_to_kanon_idx.json'

    def get_training_user_itemlist(self):
        """Return a Series mapping each training user id to the set of their movie ids."""
        training_data, _, _ = self.get_unanonymized_training_data()
        return training_data.groupby(COL_NAME_USER_ID)[COL_NAME_MOVIE_ID].agg(set)

    def get_test_user_itemlist(self):
        """Return a Series mapping each test user id to the set of their movie ids."""
        test_data = self.get_test_data()
        return test_data.groupby(COL_NAME_USER_ID)[COL_NAME_MOVIE_ID].agg(set)

    def generate_evaluation_dataframe(self, num_negative_items_to_sample_per_user=100):
        """Return the test data extended with sampled negative items per user.

        For every user in the training or test data, up to
        num_negative_items_to_sample_per_user movies the user has in neither
        split are drawn without replacement and added with a rating of 0.0.
        """
        training_data = self.get_training_user_itemlist()
        test_data = self.get_test_user_itemlist()
        users = np.unique(np.concatenate([training_data.index.values, test_data.index.values]))
        all_items_to_consider = set(self.get_movie_to_col_index().keys())
        evaluation_df = self.get_test_data()
        rows_to_add = []
        for user in users:
            # `users` is the union of both splits, so a user may be missing
            # from either Series; treat a missing user as having no items.
            items_in_train = training_data.get(user, set())
            items_in_test = test_data.get(user, set())
            neg_items = list(all_items_to_consider - items_in_train - items_in_test)
            if not neg_items:
                continue
            # Sampling without replacement cannot draw more than what exists.
            sample_size = min(num_negative_items_to_sample_per_user, len(neg_items))
            sampled = np.random.choice(neg_items, size=sample_size, replace=False)
            rows_to_add.extend(
                {COL_NAME_USER_ID: user, COL_NAME_MOVIE_ID: item, COL_NAME_RATING: 0.0}
                for item in sampled
            )
        # pd.concat replaces DataFrame.append, which newer pandas no longer has.
        frames = [evaluation_df]
        if rows_to_add:
            frames.append(pd.DataFrame(rows_to_add))
        return pd.concat(frames, ignore_index=True)

    def get_evaluation_path(self):
        """Get path to the evaluation file."""
        return os.path.join(self.anonymized_data_path, self.evaluation_file_name)

    def save_evaluation_df_to(self, df, overwrite=False):
        """Write df to the evaluation file; an existing file is kept unless overwrite is set."""
        if os.path.exists(self.get_evaluation_path()) and not overwrite:
            print("File exists, if you want to overwrite, then pass the arugment to")
            return
        df.to_csv(self.get_evaluation_path())

    def get_evaluation_data(self):
        """Read the evaluation file into a DataFrame."""
        return pd.read_csv(self.get_evaluation_path())

    def get_splits_and_paths(self):
        """Return the mapping from split type to its directory relative to base_path."""
        return {
            'ratings': 'ratings_split',
            'users': 'train_test_v2/rawdata_split'
        }

    def euclidean_distance(self, point1, point2):
        """Return the SQUARED Euclidean distance between two vectors.

        The square does not change which point is closest, which is all
        find_closest_point needs.
        """
        return np.linalg.norm(point1 - point2) ** 2

    def manhattan_distance(self, point1, point2):
        """Return the sum of absolute coordinate differences."""
        return abs(point1 - point2).sum()

    def jaccard_distance(self, point1, point2):
        """Return scipy's Jaccard dissimilarity between two vectors."""
        return distance.jaccard(point1, point2)

    def find_closest_point(self, points_list, point, metric='euclidean'):
        """Return the index of the entry of points_list closest to point.

        Keyword arguments
        points_list -- Iterable of vectors to search.
        point -- The query vector.
        metric -- 'euclidean', 'manhattan' or 'jaccard'.

        Returns -1 when points_list is empty. Ties keep the first index.
        Raises ValueError for an unknown metric.
        """
        metric_functions = {
            'euclidean': self.euclidean_distance,
            'manhattan': self.manhattan_distance,
            'jaccard': self.jaccard_distance,
        }
        if metric not in metric_functions:
            raise ValueError(
                f"Unknown metric '{metric}'; expected one of {', '.join(metric_functions)}"
            )
        measure = metric_functions[metric]

        min_distance = float('inf')
        min_distance_idx = -1
        for i, candidate in enumerate(points_list):
            current_distance = measure(candidate, point)
            if current_distance < min_distance:
                min_distance = current_distance
                min_distance_idx = i

        return min_distance_idx

    def flatten_matrix_into_dataframe(self, matrix):
        """Melt a wide user-by-movie matrix into (userId, movieId, rating) rows.

        Column labels are translated to movie ids through the column-to-movie
        map, user ids are shifted by one, and rows with a rating of zero or
        less are dropped.
        """
        value_vars = [v for v in matrix.columns.values if v != COL_NAME_USER_ID]
        DEFAULT_VARIABLE_NAME_IN_MELT = "variable"
        DEFAULT_VALUE_NAME_IN_MELT = "value"
        movies_dict = self.get_col_to_movie_index()
        matrix = matrix \
            .melt(id_vars=[COL_NAME_USER_ID], value_vars=value_vars) \
            .rename(columns={DEFAULT_VARIABLE_NAME_IN_MELT: COL_NAME_MOVIE_ID, DEFAULT_VALUE_NAME_IN_MELT: COL_NAME_RATING}) \
            .apply(lambda x: pd.to_numeric(x, downcast='integer'))
        matrix[COL_NAME_MOVIE_ID] = matrix[COL_NAME_MOVIE_ID].apply(lambda x: movies_dict[x])
        # The user id comes from the 0-based row index; shift it to start at 1.
        matrix[COL_NAME_USER_ID] = matrix[COL_NAME_USER_ID] + 1
        matrix = matrix[matrix[COL_NAME_RATING] > 0]
        return matrix

    def get_training_data_file_path(self, k: int):
        """Get path to the training matrix: the raw matrix for k == 0, else the k-anonymized file."""
        if k == 0:
            return os.path.join(self.anonymized_data_path, self.user_movie_ratings_matrix_file_name)
        return os.path.join(self.anonymized_data_path, f"{k}{self.k_anonymized_postfix}")

    def get_test_data_file_path(self):
        """Get path to test file."""
        return os.path.join(self.anonymized_data_path, self.test_file_name)

    def get_training_data(self, k: int, path=None):
        """Load the training matrix for k as a long DataFrame.

        Keyword arguments
        k -- Anonymization level; 0 selects the raw matrix, read with a header row.
        path -- Optional explicit file path overriding the default for k.

        Returns (dataframe, number of distinct users, number of distinct movies).
        """
        path = path if path is not None else self.get_training_data_file_path(k)
        mldf = pd.read_csv(path, header=None if k != 0 else 0).reset_index().rename(columns={'index': COL_NAME_USER_ID})
        mldf = self.flatten_matrix_into_dataframe(mldf)
        return mldf, len(mldf[COL_NAME_USER_ID].unique()), len(mldf[COL_NAME_MOVIE_ID].unique())

    def get_training_data_for_evaluation(self):
        """Return the training ratings grouped per user (see _group_and_get_sets_for_evaluation)."""
        mldf = pd.read_csv(self.get_training_data_path())
        return self._group_and_get_sets_for_evaluation(mldf)

    def get_test_data(self):
        """Read the test file, downcasting numeric columns to the smallest integer type."""
        return pd.read_csv(self.get_test_data_file_path()).apply(lambda x: pd.to_numeric(x, downcast='integer'))

    def _group_and_get_sets_for_evaluation(self, df):
        """Group ratings per user, highest rating first.

        The result is indexed by user id and holds the list of movie ids plus
        a column of (movie id, rating) tuples.
        """
        testdf = df.sort_values(by=COL_NAME_RATING, ascending=False)
        grouped = testdf.groupby(COL_NAME_USER_ID).agg({COL_NAME_MOVIE_ID: lambda x: list(x), COL_NAME_RATING: lambda x: list(x)})
        grouped[COL_NAME_MOVIE_RATINGS] = grouped.apply(lambda x: list(zip(x[COL_NAME_MOVIE_ID], x[COL_NAME_RATING])), axis=1)
        grouped = grouped.drop(columns=[COL_NAME_RATING])
        return grouped

    def get_unanonymized_training_data(self):
        """Return (training ratings, number of distinct users, number of distinct movies)."""
        df = pd.read_csv(self.get_training_data_path())
        return df.apply(pd.to_numeric), len(df[COL_NAME_USER_ID].unique()), len(df[COL_NAME_MOVIE_ID].unique())

    def get_complete_data_path(self):
        """Get path to the complete ratings file, located directly under base_path."""
        return os.path.join(self.base_path, self.ratings_file_name)

    def get_complete_data(self):
        """Return (all ratings, number of distinct users, number of distinct movies)."""
        df = pd.read_csv(self.get_complete_data_path())
        return df.apply(pd.to_numeric), len(df[COL_NAME_USER_ID].unique()), len(df[COL_NAME_MOVIE_ID].unique())

    def get_evaluation_data_for_evaluation(self):
        """Return the evaluation data grouped per user."""
        df = self.get_evaluation_data()
        return self._group_and_get_sets_for_evaluation(df)

    def get_test_data_for_evaluation(self):
        """Return the test data grouped per user."""
        testdf = self.get_test_data()
        return self._group_and_get_sets_for_evaluation(testdf)

    def get_training_data_path(self):
        """Get path to training file."""
        return os.path.join(self.anonymized_data_path, self.training_file_name)

    def get_test_data_path(self):
        """Get path to test file."""
        return os.path.join(self.anonymized_data_path, self.test_file_name)

    def get_validation_data_path(self):
        """Get path to validation file."""
        return os.path.join(self.anonymized_data_path, self.validation_file_name)

    def get_train_data_user_map_path(self):
        """Get path to user id to train index file map."""
        return os.path.join(self.anonymized_data_path, self.user_to_idx_file_name)

    def get_train_data_movie_map_path(self):
        """Get path to movie id to train index file map."""
        return os.path.join(self.anonymized_data_path, self.movie_to_idx_file_name)

    def _coalesce_path(self, path1, path2):
        """Return path1 unless it is the empty string, in which case return path2."""
        return path1 if path1 != "" else path2

    def get_utility_matrix_from_train(self):
        """Build a dense user-by-movie matrix from the complete ratings file.

        Column 0 of every row holds the user id; ratings are written at the
        column given by the movie-to-index map. Movies missing from the map
        are skipped.
        """
        # The whole dataset is used (not only the training file) so that all
        # movies are mapped.
        df, _, _ = self.get_complete_data()

        movie_id_to_idx_dict = self.get_movie_to_col_index()

        users = df.userId.unique()
        test_user_dict = {user: i for i, user in enumerate(users)}

        num_users = len(users)
        num_movies = len(movie_id_to_idx_dict) + 1
        utility_matrix = np.zeros((num_users, num_movies))

        # NOTE(review): ratings are stored at column `idx`, not `idx + 1`.
        # If the map is 0-based, the movie with index 0 shares column 0 with
        # the user id - confirm against the index files.
        for _, row in df.iterrows():
            movie_id, rating = int(row[COL_NAME_MOVIE_ID]), float(row[COL_NAME_RATING])
            user_row = test_user_dict[row[COL_NAME_USER_ID]]
            utility_matrix[user_row][0] = int(row[COL_NAME_USER_ID])

            if movie_id in movie_id_to_idx_dict:
                utility_matrix[user_row][movie_id_to_idx_dict[movie_id]] = rating

        return utility_matrix

    def get_msft_ncf_dataset(self, k):
        """Return (NCF dataset, training DataFrame, test DataFrame) for level k."""
        train, _, _ = self.get_training_data(k)
        test = self.get_test_data()
        dataset = NCFDataset(train=train, test=test, col_user=COL_NAME_USER_ID, col_item=COL_NAME_MOVIE_ID, col_rating=COL_NAME_RATING, seed=SEED)
        return dataset, train, test

    def get_k_anonymized_map_path(self, k, base_path=""):
        """Get path to k-anonymized map.
        {k}_anonymized_idx_to_kanon_idx.json --> the mapping from user index
        (row in the ratings_train.csv) to the row index in the
        corresponding {k}_anonymized.csv file
        Keyword arguments
        k -- The anonymization level that prefixes the file name.
        base_path -- Unused; accepted only for backward compatibility.
        """
        return os.path.join(self.anonymized_data_path, f"{k}{self.k_anonymized_map_postfix}")

    def get_col_to_movie_index(self):
        """Return the inverse of the movie map file: {stored value: int(movie id key)}."""
        movie_to_idx_path = self.get_train_data_movie_map_path()
        with open(movie_to_idx_path) as json_file:
            movie_id_to_idx_dict = json.load(json_file)
        return {idx: int(movie_id) for movie_id, idx in movie_id_to_idx_dict.items()}

    def get_movie_to_col_index(self):
        """Return the movie map file as {int(movie id key): stored value}."""
        movie_to_idx_path = self.get_train_data_movie_map_path()
        with open(movie_to_idx_path) as json_file:
            movie_id_to_idx_dict = json.load(json_file)
        return {int(movie_id): idx for movie_id, idx in movie_id_to_idx_dict.items()}

    def get_feature_vector_for_user(self, movie_ratings: list):
        """Returns a vector, with the same dimensions as the
        training dataset.
        Keyword arguments
        movie_ratings -- A list of tuples. Each tuple should be
        as follows: (movie_id, rating)
        """
        # JSON object keys are strings, so use the int-keyed view of the map;
        # looking up int ids in the raw JSON dict never matches.
        movie_id_to_idx_dict = self.get_movie_to_col_index()

        # NOTE(review): assumes every stored index is a valid position in a
        # vector of len(map) entries - confirm against the index file.
        feature_vec = [0] * len(movie_id_to_idx_dict)
        movie_not_found = 0
        for movie_id, rating in movie_ratings:
            movie_id, rating = int(movie_id), int(rating)
            idx = movie_id_to_idx_dict.get(movie_id)
            if idx is None:
                movie_not_found += 1
            else:
                feature_vec[idx] = rating
        print(f"{movie_not_found} movies not found.")
        return feature_vec

    def get_col_to_user_index(self):
        """Return the user map file as {int(user id key): stored value}."""
        id_to_idx_path = self.get_train_data_user_map_path()
        with open(id_to_idx_path) as json_file:
            id_to_idx_dict = json.load(json_file)
        return {int(user_id): idx for user_id, idx in id_to_idx_dict.items()}

    def get_col_to_kanoncol_index(self, k):
        """Return the k-anonymized map file for k as {int(key): stored value}."""
        idx_to_kidx_path = self.get_k_anonymized_map_path(k, self.base_path)
        with open(idx_to_kidx_path) as json_file:
            idx_to_kidx_path_dict = json.load(json_file)
        return {int(idx): kidx for idx, kidx in idx_to_kidx_path_dict.items()}

    def get_ks(self):
        """Return the list of anonymization levels used in this project."""
        return [0, 5, 8, 12, 15, 20, 25]

    def get_validation_data(self):
        """Read the validation file into a DataFrame."""
        return pd.read_csv(self.get_validation_data_path())

    def get_base_path(self, k=None):
        """Return the base directory; k is accepted but ignored."""
        return self.base_path

    def get_closest_k_cluster_to_user_id(self, k: int, metric="euclidean"):
        """Map every user of the complete data to a 1-based row of the level-k training matrix.

        Users present in the user map (and, for k != 0, in the k-anonymized
        map) are looked up directly. All others are assigned the closest row
        of the training matrix under the given metric.

        Keyword arguments
        k -- Anonymization level; 0 means users map to themselves.
        metric -- Distance name understood by find_closest_point.
        """
        id_to_idx_dict = self.get_col_to_user_index()
        idx_to_kidx_path_dict = self.get_col_to_kanoncol_index(k) if k != 0 else None
        k_anon_data_path = self.get_training_data_file_path(k)

        # NOTE(review): get_training_data reads the k == 0 file with a header
        # row, but here it is read with header=None - confirm the intended
        # row alignment for k == 0.
        k_anaon_data = pd.read_csv(k_anon_data_path, sep=',', header=None).apply(pd.to_numeric).values

        u_matrix = self.get_utility_matrix_from_train()

        user_to_cluster_dict = {}
        for um in tqdm(u_matrix):
            user_id = int(um[0])
            row_vec = um[1:]

            known_user = user_id in id_to_idx_dict
            if known_user and k != 0 and id_to_idx_dict[user_id] in idx_to_kidx_path_dict:
                # Easy case - this user was trained on; look up their cluster.
                user_to_cluster_dict[user_id] = idx_to_kidx_path_dict[id_to_idx_dict[user_id]] + 1
            elif known_user and k == 0:
                # User maps to themselves, they were in training data.
                user_to_cluster_dict[user_id] = id_to_idx_dict[user_id] + 1
            else:
                # Unknown user: fall back to the nearest training row.
                user_to_cluster_dict[user_id] = self.find_closest_point(k_anaon_data, row_vec, metric=metric) + 1

        return user_to_cluster_dict
      
    
def avg_mahalanobis_dist(user_movie_matrix, anon_matrix, useridx_to_cluster):
    """Return the mean distance between each user row and its cluster row.

    user_movie_matrix and anon_matrix are numpy arrays; useridx_to_cluster is
    a dictionary mapping a row index of user_movie_matrix to a row index of
    anon_matrix. Rows without an entry in the dictionary are skipped.

    NOTE(review): mahalanobis_dist is not defined in the visible part of this
    notebook; it is assumed to be provided elsewhere.
    """
    # Per-column standard deviation of the anonymized matrix, passed on to
    # the distance helper.
    stdev = np.std(anon_matrix, axis=0)
    distances = [
        # The lookup previously used an undefined name (idx_to_cluster).
        mahalanobis_dist(user, anon_matrix[useridx_to_cluster[u]], stdev)
        for u, user in enumerate(user_movie_matrix)
        if u in useridx_to_cluster  # some users are not mapped to clusters
    ]
    return np.mean(distances)

In [5]:
# Root directory under which models and result files are written
BASE_OUTPUT_FOLDER = "/home/fastuser/msft_ncf_output/"
# Default model_type passed to the NCF constructor by get_new_model
MODEL_TYPE="NeuMF"

def get_recommendations_per_user(predictions, split_type, iteration, k, dist):
    """Expand cluster-level predictions into predictions per actual user.

    Each actual user inherits the prediction rows of the cluster they are
    mapped to by Utility.get_closest_k_cluster_to_user_id.

    Note: `predictions` is modified in place - its user column becomes the
    index.
    """
    actual_user_col = f"{COL_NAME_USER_ID}_actual"
    helper = Utility(split_type=split_type, iteration=iteration)
    cluster_by_user = helper.get_closest_k_cluster_to_user_id(k, metric=dist)

    # One row per actual user, indexed by the cluster id they belong to.
    mapping_rows = [
        {COL_NAME_USER_ID: cluster, actual_user_col: user}
        for user, cluster in cluster_by_user.items()
    ]
    user_to_cluster_df = pd.DataFrame.from_dict(mapping_rows)
    user_to_cluster_df.set_index(COL_NAME_USER_ID, inplace=True)

    predictions.set_index(COL_NAME_USER_ID, inplace=True)

    # Join on the cluster id, then swap the cluster id for the actual user id.
    return (
        predictions.join(user_to_cluster_df, how="inner")
        .reset_index()
        .drop(columns=COL_NAME_USER_ID)
        .rename(columns={actual_user_col: COL_NAME_USER_ID})
    )

def get_base_output_folder(split_type, iteration, k, dist=None, additional_str=None):
    """Build the output directory path for one experiment configuration.

    The path is BASE_OUTPUT_FOLDER/split_type/iteration_{iteration}/k,
    followed by `dist` and then `additional_str` when they are not None.
    """
    segments = [BASE_OUTPUT_FOLDER, split_type, f"iteration_{iteration}", str(k)]
    # Optional trailing components, in this fixed order.
    segments.extend(part for part in (dist, additional_str) if part is not None)
    return os.path.join(*segments)

def get_output_folder_path_for_model(split_type, iteration, k, additional_str=None):
    """Return the "model" directory inside the output folder of a configuration."""
    # Pass additional_str by keyword: positionally it would land in the
    # `dist` parameter of get_base_output_folder.
    base = get_base_output_folder(split_type, iteration, k, additional_str=additional_str)
    return os.path.join(base, "model")

def does_model_exist(split_type, iteration, k, additional_str=None):
    """Tell whether the model directory of this configuration exists on disk."""
    model_dir = get_output_folder_path_for_model(split_type, iteration, k, additional_str)
    return os.path.exists(model_dir)

def save_model(ncf, split_type, iteration, k, additional_str=None):
    """Save `ncf` into the model directory of this configuration and report the path."""
    target_dir = get_output_folder_path_for_model(split_type, iteration, k, additional_str)
    ncf.save(target_dir)
    print(f"Saved to: {target_dir}")

def load_model(ncf, split_type, iteration, k, additional_str=None):
    """Load saved weights from this configuration's model directory into `ncf` and return it."""
    source_dir = get_output_folder_path_for_model(split_type, iteration, k, additional_str)
    ncf.load(neumf_dir=source_dir)
    return ncf

def get_new_model(data,
                  model_type=MODEL_TYPE,
                  n_factors=4,
                  layer_sizes=[16,8,4],
                  n_epochs=EPOCHS,
                 batch_size=BATCH_SIZE,
                 lr=1e-3):
    """Create an untrained NCF model sized for `data` and bind its indices.

    The user and item counts are read from `data`; the remaining arguments
    are forwarded to the NCF constructor (`lr` as `learning_rate`).
    """
    hyperparameters = dict(
        n_users=data.n_users,
        n_items=data.n_items,
        model_type=model_type,
        n_factors=n_factors,
        layer_sizes=layer_sizes,
        n_epochs=n_epochs,
        batch_size=batch_size,
        learning_rate=lr,
        verbose=10,
        seed=SEED,
    )
    ncf_model = NCF(**hyperparameters)
    ncf_model.set_indices(data)
    return ncf_model

def get_dataset(split_type, iteration, k):
    """Return (NCF dataset, train DataFrame, test DataFrame) for a split, iteration and level k."""
    helper = Utility(split_type=split_type, iteration=iteration)
    return helper.get_msft_ncf_dataset(k)

def predict(model, train):
    """Score every (user, item) pair of `train` and keep the pairs without a rating.

    Returns a DataFrame of user, item and prediction for the pairs that do
    not occur in `train`. The elapsed prediction time is printed.
    """
    with Timer() as test_time:
        all_items = list(train[COL_NAME_MOVIE_ID].unique())
        item_count = len(all_items)
        users, items, preds = [], [], []
        for user_id in train[COL_NAME_USER_ID].unique():
            # The model is queried with parallel user/item lists.
            repeated_user = [user_id] * item_count
            users += repeated_user
            items += all_items
            preds += list(model.predict(repeated_user, all_items, is_list=True))

    scored = pd.DataFrame(data={COL_NAME_USER_ID: users, COL_NAME_MOVIE_ID:items, COL_NAME_PREDICTION:preds})

    # After the outer merge, pairs absent from `train` have a null rating.
    key_columns = [COL_NAME_USER_ID, COL_NAME_MOVIE_ID]
    merged = pd.merge(train, scored, on=key_columns, how="outer")
    unrated = merged[merged.rating.isnull()]
    all_predictions = unrated.drop(COL_NAME_RATING, axis=1)

    print("Took {} seconds for prediction.".format(test_time))
    return all_predictions

import pathlib

    
class FileSaver(object):
    """Save and load one named file inside a per-configuration output folder.

    save_func(item, path) writes an item and load_func(item, path) reads one.
    base_path_func maps (split_type, iteration, k, dist, additional_str) to
    the folder that holds the file.
    """

    def __init__(self, file_name, save_func, load_func, base_path_func=get_base_output_folder):
        self.file_name = file_name
        self.base_path_func = base_path_func
        self.load_func = load_func
        self.save_func = save_func

    def get_file_path(self, split_type, iteration, k, dist=None, additional_str=None):
        """Return the file path for a configuration.

        The folder is printed and created (with parents) if it is missing.
        """
        folder = self.base_path_func(split_type, iteration, k, dist, additional_str)
        print(folder)
        pathlib.Path(folder).mkdir(parents=True, exist_ok=True)
        return os.path.join(folder, self.file_name)

    def save(self, item, split_type, iteration, k, dist=None, additional_str=None):
        """Write `item` with save_func and report where it went."""
        destination = self.get_file_path(split_type, iteration, k, dist, additional_str)
        self.save_func(item, destination)
        print(f"Saved item to {destination}")

    def load(self, split_type, iteration, k, item=None , dist=None, additional_str=None):
        """Return whatever load_func produces for this configuration's file."""
        source = self.get_file_path(split_type, iteration, k, dist, additional_str)
        return self.load_func(item, source)

    def does_exist(self, split_type, iteration, k, dist=None, additional_str=None):
        """Tell whether this configuration's file exists (its folder is created as a side effect)."""
        candidate = self.get_file_path(split_type, iteration, k, dist, additional_str)
        return os.path.exists(candidate)

def evaluate_from_cluster(cluster_predictions, split_type, iteration, k, dist_type, top_k=TOP_K, data_eval='test'):
    """Expand cluster-level predictions to per-user predictions and evaluate them."""
    per_user_predictions = get_recommendations_per_user(
        cluster_predictions, split_type, iteration, k, dist_type
    )
    return evaluate(
        per_user_predictions, split_type, iteration, top_k=top_k, data_eval=data_eval
    )
    
def evaluate(predictions, split_type, iteration, top_k=TOP_K, data_eval='test'):
    """Compute ranking metrics of `predictions` against the test or validation data.

    data_eval selects the ground truth: 'test' or 'validation'; any other
    value raises an Exception. The metrics are printed and returned as
    (map, ndcg, precision, recall, hit rate).
    """
    util = Utility(split_type=split_type, iteration=iteration)
    ground_truth_loaders = {
        'test': util.get_test_data,
        'validation': util.get_validation_data,
    }
    if data_eval not in ground_truth_loaders:
        raise Exception('Invalid data_eval type')
    ground_truth = ground_truth_loaders[data_eval]()

    metrics = precision_recall_hr_map_ndcg_at_k(
        ground_truth,
        predictions,
        col_prediction='prediction',
        col_user=COL_NAME_USER_ID,
        col_item=COL_NAME_MOVIE_ID,
        k=top_k,
    )
    eval_precision, eval_recall, eval_hr, eval_map, eval_ndcg = metrics

    report_lines = (
        "MAP:\t%f" % eval_map,
        "NDCG:\t%f" % eval_ndcg,
        "Precision@K:\t%f" % eval_precision,
        "Recall@K:\t%f" % eval_recall,
        "HR@K:\t%f" % eval_hr,
    )
    print(*report_lines, sep='\n')
    return eval_map, eval_ndcg, eval_precision, eval_recall, eval_hr
    
def get_df_filesaver(file_name):
    """Return a FileSaver that stores DataFrames as CSV under `file_name`."""

    def _write_csv(frame, path):
        return frame.to_csv(path)

    def _read_csv(_unused_item, path):
        # The first CSV column is the index written by to_csv.
        return pd.read_csv(path, index_col=0)

    return FileSaver(file_name, save_func=_write_csv, load_func=_read_csv)


# CSV-backed savers, one per result file written by this notebook
ClusterPredictionSaver = get_df_filesaver('cluster_level_predictions.csv')
RatingsParamsSaver = get_df_filesaver('ratings_paramsv2.csv')
UsersParamsSaver = get_df_filesaver('users_paramsv2.csv')





In [6]:
class Params(object):
    """One hyper-parameter combination: hidden layer sizes and number of factors."""

    def __init__(self, hidden_layers, num_factors):
        self.num_factors = num_factors
        self.hidden_layers = hidden_layers

    def __str__(self):
        layers_text = str(self.hidden_layers)
        return f"{layers_text} | num_factors={self.num_factors}"


In [None]:


# Module-level list of parameter combinations (starts empty)
params_to_search = []

# Candidate hidden-layer layouts for the grid search
hidden_layers = [
    [16, 8, 4],
    [32, 16, 8],
    [64, 32, 16],
    [32, 16, 8, 4],
    [64, 32, 16, 8],
    [128, 64, 32, 16],
]

# Candidate numbers of factors for the grid search
factors = [4, 8, 16]
def search_for_best_model(hidden_layers, factors, split_type, iteration, k, out_file, rerun=False):
    """Grid-search NCF hyper-parameters and record validation metrics in out_file.

    Every combination of `factors` and `hidden_layers` is trained, expanded
    to per-user predictions with the 'jaccard' metric and scored on the
    validation data at k=10. Results are appended to the CSV at out_file
    after each run. Combinations already present in that file are skipped
    unless rerun is True.

    Returns (best_params, results): the Params with the highest ndcg@10 among
    trained and cached combinations (None if none scores above 0.0), and the
    DataFrame of all results.
    """
    # Build the grid locally. Appending to a module-level list made each
    # later call search the combinations of all earlier calls again.
    grid = [
        Params(hidden_layer, num_factors)
        for num_factors in factors
        for hidden_layer in hidden_layers
    ]
    data, train, _ = get_dataset(split_type, iteration, k)
    best_ndcg = 0.0
    best_params = None
    results = pd.DataFrame(columns=['iteration', 'k', 'split_type', 'distance_metric', 'map@10', 'ndcg@10', 'precision@10', 'recall@10', 'hr@10', 'num_factors','hidden_layers'])
    if os.path.exists(out_file):
        results = pd.read_csv(out_file)
    for params in grid:
        cached = results[(results['num_factors'] == params.num_factors) & (results['hidden_layers'] == str(params.hidden_layers))]
        if len(cached) > 0 and not rerun:
            print(f"Already have results for {params}")
            display(cached)
            # Cached runs compete for "best" too; otherwise a fully cached
            # search would return None.
            cached_ndcg = cached['ndcg@10'].max()
            if cached_ndcg > best_ndcg:
                best_ndcg = cached_ndcg
                best_params = params
            continue
        model = get_new_model(data, n_factors=params.num_factors, layer_sizes=params.hidden_layers)
        with Timer() as train_time:
            model.fit(data)
        print("Took {} seconds for training.".format(train_time))
        predictions = predict(model, train)
        user_predictions = get_recommendations_per_user(predictions, split_type,  iteration, k, 'jaccard')
        # get_recommendations_per_user moved the user column into the index.
        predictions.reset_index(inplace=True)
        meanap, ndcg, precision, recall, hr = evaluate(user_predictions, split_type,  iteration, 10, data_eval='validation')
        result = {'iteration':iteration ,
                  'k':k,
                  'split_type':split_type,
                  'distance_metric':'jaccard',
                  'map@10':meanap,
                  'ndcg@10':ndcg,
                  'precision@10':precision,
                  'recall@10':recall,
                  'hr@10':hr,
                  'num_factors':params.num_factors,
                  'hidden_layers':str(params.hidden_layers)}
        # pd.concat replaces DataFrame.append, which newer pandas no longer has.
        results = pd.concat([results, pd.DataFrame([result])])
        # Persist after every run so an interrupted search can resume.
        results.to_csv(out_file, index=False)
        if ndcg > best_ndcg:
            best_ndcg = ndcg
            best_params = params
    return  best_params, results

# Run the grid search for both split types (iteration 1, k=0); each search
# keeps its own CSV of results.
best_params_ratings, results_ratings = search_for_best_model(hidden_layers, factors, 'ratings', 1, 0, 'ratings_params_searchv2.csv')
best_params_users, results_users = search_for_best_model(hidden_layers, factors, 'users', 1, 0, 'users_params_searchv2.csv')

# Store the result tables in the output folder as well.
RatingsParamsSaver.save(results_ratings, 'ratings', 1, 0)
UsersParamsSaver.save(results_users, 'users', 1, 0)

print(f"best_params_ratings={best_params_ratings}")
display(results_ratings)

print(f"best_params_users={best_params_users}")
display(results_users)

Already have results for [16, 8, 4] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
0,1,0,ratings,jaccard,0.031876,0.105749,0.082471,0.089208,0.48414,4,"[16, 8, 4]"


Already have results for [32, 16, 8] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
1,1,0,ratings,jaccard,0.026689,0.089022,0.074124,0.079907,0.459098,4,"[32, 16, 8]"


Already have results for [64, 32, 16] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
2,1,0,ratings,jaccard,0.029445,0.091467,0.072621,0.086089,0.452421,4,"[64, 32, 16]"


Already have results for [32, 16, 8, 4] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
3,1,0,ratings,jaccard,0.033657,0.096842,0.075459,0.083183,0.435726,4,"[32, 16, 8, 4]"


Already have results for [64, 32, 16, 8] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
4,1,0,ratings,jaccard,0.034227,0.094929,0.069616,0.087566,0.444073,4,"[64, 32, 16, 8]"


Already have results for [128, 64, 32, 16] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
5,1,0,ratings,jaccard,0.025611,0.086881,0.073289,0.078248,0.45576,4,"[128, 64, 32, 16]"


Already have results for [16, 8, 4] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
6,1,0,ratings,jaccard,0.035406,0.104673,0.078297,0.091833,0.48414,8,"[16, 8, 4]"


Already have results for [32, 16, 8] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
7,1,0,ratings,jaccard,0.033353,0.101581,0.078798,0.091662,0.475793,8,"[32, 16, 8]"


Already have results for [64, 32, 16] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
8,1,0,ratings,jaccard,0.03538,0.106295,0.082805,0.093761,0.504174,8,"[64, 32, 16]"


Already have results for [32, 16, 8, 4] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
9,1,0,ratings,jaccard,0.030376,0.09621,0.076795,0.089038,0.467446,8,"[32, 16, 8, 4]"


Already have results for [64, 32, 16, 8] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
10,1,0,ratings,jaccard,0.030796,0.092088,0.070952,0.0822,0.440735,8,"[64, 32, 16, 8]"


Already have results for [128, 64, 32, 16] | num_factors=8


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
11,1,0,ratings,jaccard,0.026626,0.083259,0.065275,0.068872,0.417362,8,"[128, 64, 32, 16]"


Already have results for [16, 8, 4] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
12,1,0,ratings,jaccard,0.035576,0.104909,0.078464,0.094769,0.487479,16,"[16, 8, 4]"


Already have results for [32, 16, 8] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
13,1,0,ratings,jaccard,0.038106,0.11416,0.085142,0.098378,0.500835,16,"[32, 16, 8]"


Already have results for [64, 32, 16] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
14,1,0,ratings,jaccard,0.028814,0.093968,0.074791,0.083259,0.465776,16,"[64, 32, 16]"


Already have results for [32, 16, 8, 4] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
15,1,0,ratings,jaccard,0.032208,0.094592,0.070785,0.082546,0.440735,16,"[32, 16, 8, 4]"


Already have results for [64, 32, 16, 8] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
16,1,0,ratings,jaccard,0.029648,0.091412,0.071452,0.078104,0.434057,16,"[64, 32, 16, 8]"


Already have results for [128, 64, 32, 16] | num_factors=16


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
17,1,0,ratings,jaccard,0.029005,0.09615,0.075793,0.087648,0.464107,16,"[128, 64, 32, 16]"


  % min_num


Already have results for [16, 8, 4] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
0,1,0,users,jaccard,0.010458,0.203583,0.191803,0.020442,0.672131,4,"[16, 8, 4]"


Already have results for [32, 16, 8] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
1,1,0,users,jaccard,0.005245,0.134338,0.12623,0.012661,0.491803,4,"[32, 16, 8]"


Already have results for [64, 32, 16] | num_factors=4


Unnamed: 0,iteration,k,split_type,distance_metric,map@10,ndcg@10,precision@10,recall@10,hr@10,num_factors,hidden_layers
2,1,0,users,jaccard,0.003275,0.085025,0.096721,0.009264,0.393443,4,"[64, 32, 16]"


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 303.7882 seconds for training.
Took 8.7640 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.006180
NDCG:	0.133256
Precision@K:	0.140984
Recall@K:	0.014504
HR@K:	0.459016


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 303.5973 seconds for training.
Took 8.8520 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.005362
NDCG:	0.128055
Precision@K:	0.131148
Recall@K:	0.011103
HR@K:	0.409836


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 310.9272 seconds for training.
Took 9.0618 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.002194
NDCG:	0.064814
Precision@K:	0.073770
Recall@K:	0.007995
HR@K:	0.344262


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 289.3633 seconds for training.
Took 8.5628 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.010182
NDCG:	0.163914
Precision@K:	0.155738
Recall@K:	0.020518
HR@K:	0.573770


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 295.1464 seconds for training.
Took 8.7139 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.007399
NDCG:	0.116433
Precision@K:	0.119672
Recall@K:	0.016463
HR@K:	0.524590


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 293.4897 seconds for training.
Took 8.7332 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.008955
NDCG:	0.166861
Precision@K:	0.168852
Recall@K:	0.020826
HR@K:	0.557377


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 298.6383 seconds for training.
Took 8.6888 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.005349
NDCG:	0.123881
Precision@K:	0.122951
Recall@K:	0.013189
HR@K:	0.524590


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 302.5008 seconds for training.
Took 8.7904 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.004655
NDCG:	0.104374
Precision@K:	0.108197
Recall@K:	0.011081
HR@K:	0.491803


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 309.0234 seconds for training.
Took 8.8757 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.005575
NDCG:	0.115430
Precision@K:	0.116393
Recall@K:	0.013425
HR@K:	0.491803


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 292.3298 seconds for training.
Took 8.6653 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.010513
NDCG:	0.135245
Precision@K:	0.134426
Recall@K:	0.017466
HR@K:	0.508197


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 291.1830 seconds for training.
Took 8.9016 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.004143
NDCG:	0.109323
Precision@K:	0.101639
Recall@K:	0.009612
HR@K:	0.442623


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 292.6480 seconds for training.
Took 8.9483 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.004844
NDCG:	0.110530
Precision@K:	0.111475
Recall@K:	0.011873
HR@K:	0.475410


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

Took 297.1484 seconds for training.
Took 8.8157 seconds for prediction.


  0%|          | 0/610 [00:00<?, ?it/s]

MAP:	0.006152
NDCG:	0.123391
Precision@K:	0.129508
Recall@K:	0.011352
HR@K:	0.426230


Training NeuMF Epochs:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Alternative kept for reference (not executed):
# train, test = python_chrono_split(df, 0.75)

# Obtain both subsets through the Utility helper for the "ratings" split,
# iteration 1. The training subset is element 0 of get_training_data(0).
train = Utility("ratings", iteration=1).get_training_data(0)[0]
test = Utility("ratings", iteration=1).get_test_data()

# Display the first five entries of the test subset.
test.head(5)

In [None]:
# Display the first five entries of the training subset.
train.head(5)

Generate an NCF dataset object from the data subsets.

In [None]:
# Wrap the train/test subsets in the NCF dataset helper, naming the columns
# that hold the user, item and rating values.
data = NCFDataset(
    train=train,
    test=test,
    col_user=COL_NAME_USER_ID,
    col_item=COL_NAME_MOVIE_ID,
    col_rating=COL_NAME_RATING,
    seed=SEED,
)

### 3. Train the NCF model on the training data, and get the top-k recommendations for our testing data

NCF accepts implicit feedback and generates the propensity of items to be recommended to users on a scale of 0 to 1. A recommended item list can then be generated based on the scores. Note that this quickstart notebook uses a smaller number of epochs to reduce training time. As a consequence, the model performance will be slightly degraded. 

In [None]:
# Build a "NeuMF"-type NCF model sized from the user/item counts exposed by
# the dataset object; training length and batch size come from the notebook
# parameters.
model = NCF(
    model_type="NeuMF",
    n_users=data.n_users,
    n_items=data.n_items,
    n_factors=4,
    layer_sizes=[16, 8, 4],
    learning_rate=1e-3,
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    seed=SEED,
    verbose=10,
)

In [None]:
# Fit the model on the NCF dataset while timing the call.
with Timer() as train_time:
    model.fit(data)

# The timer object itself is formatted into the message.
print("Took {} seconds for training.".format(train_time))

In the movie recommendation scenario, movies that a user has already seen are not recommended to that user.

In [None]:
# Make importlib available in the notebook session.
# NOTE(review): this cell originally ended with a dangling ``importlib.``,
# which is a SyntaxError and prevented the cell from running. The intended
# call is unknown (presumably ``importlib.reload(<module>)``) -- add it back
# explicitly if a module reload is needed.
import importlib

In [None]:
# Score every (user, item) pair built from the users and items present in the
# training data, then keep only the pairs that do not appear in the training
# data, so that already-rated movies are not part of the recommendations.
with Timer() as test_time:
    users, items, preds = [], [], []
    item = list(train[COL_NAME_MOVIE_ID].unique())
    for user_id in train[COL_NAME_USER_ID].unique():
        # Repeat the user id once per candidate item so the two lists line up.
        user = [user_id] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(
        data={COL_NAME_USER_ID: users, COL_NAME_MOVIE_ID: items, "prediction": preds}
    )

    # Outer merge: pairs that exist only in all_predictions end up with a null
    # rating, which is how unseen pairs are identified.
    merged = pd.merge(
        train, all_predictions, on=[COL_NAME_USER_ID, COL_NAME_MOVIE_ID], how="outer"
    )
    # The null check uses COL_NAME_RATING (previously hard-coded as
    # ``merged.rating``) so it agrees with the column dropped on the same line.
    all_predictions = merged[merged[COL_NAME_RATING].isnull()].drop(COL_NAME_RATING, axis=1)

print("Took {} seconds for prediction.".format(test_time))

### 4. Evaluate how well NCF performs

The ranking metrics are used for evaluation.

In [None]:
# Keyword arguments shared by every ranking metric: prediction column,
# user/item columns and the cut-off k.
ranking_kwargs = dict(
    col_prediction='prediction',
    col_user=COL_NAME_USER_ID,
    col_item=COL_NAME_MOVIE_ID,
    k=TOP_K,
)

eval_map = map_at_k(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# One metric per line, label and value separated by a tab.
report_lines = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print("\n".join(report_lines))

In [None]:
# Combined prediction + evaluation cell.
# This cell previously hard-coded the column names "userID", "itemID" and
# "rating" and relied on the metric functions' default column names, while the
# rest of the notebook addresses the same data through COL_NAME_USER_ID,
# COL_NAME_MOVIE_ID and COL_NAME_RATING. It now uses those constants so it
# works on the same `train`/`test` frames as the other cells.
with Timer() as test_time:
    users, items, preds = [], [], []
    item = list(train[COL_NAME_MOVIE_ID].unique())
    for user_id in train[COL_NAME_USER_ID].unique():
        # Repeat the user id once per candidate item so the two lists line up.
        user = [user_id] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(
        data={COL_NAME_USER_ID: users, COL_NAME_MOVIE_ID: items, "prediction": preds}
    )

    # Outer merge: pairs that exist only in all_predictions end up with a null
    # rating; keeping just those rows removes pairs present in the training data.
    merged = pd.merge(
        train, all_predictions, on=[COL_NAME_USER_ID, COL_NAME_MOVIE_ID], how="outer"
    )
    all_predictions = merged[merged[COL_NAME_RATING].isnull()].drop(COL_NAME_RATING, axis=1)

print("Took {} seconds for prediction.".format(test_time))

eval_map = map_at_k(test, all_predictions, col_prediction='prediction', col_user=COL_NAME_USER_ID, col_item=COL_NAME_MOVIE_ID, k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', col_user=COL_NAME_USER_ID, col_item=COL_NAME_MOVIE_ID, k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', col_user=COL_NAME_USER_ID, col_item=COL_NAME_MOVIE_ID, k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', col_user=COL_NAME_USER_ID, col_item=COL_NAME_MOVIE_ID, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

In [None]:
if is_jupyter():
    # Record results with papermill for tests
    import papermill as pm
    import scrapbook as sb

    # (name, value) pairs glued in this order: the four ranking metrics first,
    # then the measured training and prediction intervals.
    recorded_values = (
        ("map", eval_map),
        ("ndcg", eval_ndcg),
        ("precision", eval_precision),
        ("recall", eval_recall),
        ("train_time", train_time.interval),
        ("test_time", test_time.interval),
    )
    for glue_name, glue_value in recorded_values:
        sb.glue(glue_name, glue_value)