Loading & Preprocessing MovieLens Dataset

---

In [1]:
import pandas as pd
import numpy as np

# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
# Open the archive in a context manager so the file handle is always closed,
# even if extraction or the info read fails.
with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
    zip_ref.extractall()
    print("Done. Dataset contains:")
    print(zip_ref.read('ml-100k/u.info'))

# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols

movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0 and store them as
# strings. Vectorised arithmetic + astype replaces the per-row lambdas.
users["user_id"] = (users["user_id"] - 1).astype(str)
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
# The year is the last '-'-separated token of the release date; a missing date
# becomes the string 'nan' because the value is passed through str().
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = (ratings["movie_id"] - 1).astype(str)
ratings["user_id"] = (ratings["user_id"] - 1).astype(str)
ratings["rating"] = ratings["rating"].astype(float)

# Compute the number of movies to which a genre is assigned.
genre_occurences = movies[genre_cols].sum().to_dict()

# Since some movies can belong to more than one genre, we create different
# 'genre' columns as follows:
# - all_genres: all the active genres of the movie.
# - genre: randomly sampled from the active genres.
def mark_genres(movies, genres):
  """Add 'genre' and 'all_genres' columns to `movies` in place.

  Args:
    movies: DataFrame holding one indicator column per entry of `genres`.
    genres: list of genre column names; a value equal to 1 marks the genre
      as active for that row.

  For every row, 'genre' is one active genre picked with np.random.choice and
  'all_genres' is the active genres joined with '-'. Rows without any active
  genre get 'Other' in both columns. Nothing is returned.
  """
  def active_names(flags):
    # Names of the genres whose indicator equals 1 for this row.
    return [name for name, flag in zip(genres, flags) if flag == 1]

  def pick_one(flags):
    chosen = active_names(flags)
    return np.random.choice(chosen) if chosen else 'Other'

  def join_all(flags):
    chosen = active_names(flags)
    return '-'.join(chosen) if chosen else 'Other'

  def row_flags():
    # Yields one tuple of indicator values per movie, in `genres` order.
    return zip(*(movies[name] for name in genres))

  # The random picks are all drawn first, row by row, before the joined labels.
  movies['genre'] = list(map(pick_one, row_flags()))
  movies['all_genres'] = list(map(join_all, row_flags()))

# Annotate every movie with its sampled and combined genre labels (in place).
mark_genres(movies, genre_cols)

# Create one merged DataFrame containing all the movielens data:
# ratings joined to movie metadata, then to user demographics.
movielens = (
    ratings
    .merge(movies, on='movie_id', how='inner')
    .merge(users, on='user_id', how='inner')
)

# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
  """Splits a DataFrame into training and test sets.

  The split is done on row positions rather than index labels, so it stays a
  true partition even when `df` has duplicate index labels (a label-based
  exclusion would drop every row sharing a label with a sampled row).

  Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
    random_state: optional seed or numpy RandomState forwarded to pandas'
      sampler; None (the default) keeps the previous unseeded behaviour.
  Returns:
    train: dataframe for training (rows kept in their original order)
    test: dataframe for testing (rows in sampled order)
  """
  # Sample row positions; every row of df is then in exactly one output.
  positions = pd.Series(np.arange(len(df)))
  test_positions = positions.sample(
      frac=holdout_fraction, replace=False, random_state=random_state).to_numpy()

  is_test = np.zeros(len(df), dtype=bool)
  is_test[test_positions] = True

  test = df.iloc[test_positions]
  train = df.iloc[~is_test]
  return train, test

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


Data Preprocessing

---



In [2]:
# Restructure the movielens dataframe into the three input formats expected by the recommender.

# ml_item_feat: one row per movie (item) with the features associated with it.
#    Here every feature is a genre indicator, but any number of features can be used.
#    'movie_id' is renamed to 'item_id' because the model expects the primary key
#    to be called 'item_id'.
ml_item_feat = (
    movielens[[
        'movie_id', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]]
    .drop_duplicates()
    .rename(columns={'movie_id': 'item_id'})
)

# ml_user_feat: one row per user with the user features. Only age is used in this
#    example, but any number of features can be used.
ml_user_feat = movielens[['user_id', 'age']].drop_duplicates()

# ml_short_df: one row per interaction (which user interacted with which item),
#    with the columns renamed to the names the model expects.
ml_short_df = movielens[['user_id', 'movie_id', 'rating', 'unix_timestamp']].rename(
    columns={'movie_id': 'item_id', 'unix_timestamp': 'time'})

Model Arguments 

---


In [3]:
# A class used to save all the arguments used for a model, with a function to log arguments used
class DummyArgs:
    """Container for every argument used by a model, plus a caching logger.

    The constructor reads the notebook-level frames `ml_short_df`,
    `ml_item_feat` and `ml_user_feat`, so they must exist before instantiation.
    """

    def __init__(self):
        # ---- Core args ----
        # Interaction pandas df (preformatted)
        self.filename = ml_short_df
        # Currently supports "PRME", "TransFM"
        self.model = "PRME"
        # Currently only supports "full_features", TODO: add "time"
        self.features = "full_features"
        # The item pandas df (already formatted)
        self.item_df = ml_item_feat
        # Already formatted user data
        self.user_df = ml_user_feat
        # A data frame that restricts items and users to specific regions
        self.user_region_df = "none"

        # ---- Training args ----
        # Minimum number of epochs, model will not stop even if accuracy is decreasing
        self.min_epoch = 30
        # Frequency at which to evaluate model
        self.eval_freq = 10
        # Epoch to end training, even if model is still improving
        self.max_iters = 100
        # Number of iterations at which to quit if no improvement
        self.quit_delta = 40
        # Number of negative samples to evaluate against to calc auc
        self.num_train_samples = 3
        # Number of negative samples to compare against positive sample in val
        self.num_val_samples = 10
        # Recommended to be same as val
        self.num_test_samples = 10
        # 1 to include validation set, 0 to only use train/test
        # (skipping val set is very prone to overfitting)
        self.val_set = 1
        # Set negative samples to be weighted according to observations in train
        self.weighted_sampling = 0

        # ---- Model args ----
        # Model dimensionality
        self.num_dims = 3
        # L2 regularization: linear regularization
        self.linear_reg = 6.236
        # L2 regularization: embedding regularization
        self.emb_reg = 10.54
        # L2 regularization: translation regularization (value from first pass)
        self.trans_reg = 3.135
        self.init_mean = 0.133
        # Learning rate of model at epoch 1
        self.starting_lr = 0.05
        # Decay factor for learning rate
        self.lr_decay_factor = 1.33
        # Frequency at which to decay learning rate
        self.lr_decay_freq = 700
        # Minimum number of interactions for a user to be included in model
        self.user_min = 2
        # Minimum number of interactions for an item to be included in model
        self.item_min = 3
        # Scale loss inside sigmoid function, does nothing if set to 1
        self.secondary_reg_scale = 1

        # ---- Debug args ----
        # 2 for maximum verbosity, 0 suppresses all
        self.verbosity = 1
        # Set the random seed to the model, useful for debugging
        self.random_seed = 1
        # Init empty log
        self.log_cache = []

        # ---- Deployment args ----
        # Number of predictions to return per user
        self.return_k_preds = 100
        # 0 disables the generation of deployment predictions for better
        # performance, 1 to enable
        self.deploy_preds = 1

    def logger(self, input_text, verbosity_max = 1):
        """Record `input_text` in `log_cache`; print it when verbose enough.

        The text is always cached. It is printed only when this object's
        `verbosity` is at least `verbosity_max`.
        """
        self.log_cache.append(input_text)
        should_print = verbosity_max <= self.verbosity
        if should_print:
            print(input_text)
            
# Instantiate the default arguments as a plain object; this notebook uses it in
# place of a command-line argument parser.
args = DummyArgs()

Dataset Object

---

In [4]:
import scipy.sparse as sp
import numpy as np
import pandas as pd
from collections import defaultdict
import copy
import random

# Class to represent a dataset
class Dataset:
    def __init__(self, args):
        
        df = args.filename
        self.args = args
        
        # Use random seed so that future runs with the same params are deterministic 
        np.random.seed(self.args.random_seed)
        
        print('First pass')
        print('\tnum_users = ' + str(len(df['user_id'].unique())))
        print('\tnum_items = ' + str(len(df['item_id'].unique())))
        print('\tdf_shape  = ' + str(df.shape))
        
        user_counts = df['user_id'].value_counts()
        print('Collected user counts...')
        item_counts = df['item_id'].value_counts()
        print('Collected item counts...')
        
        # Filter based on user and item counts
        df = df[df.apply(lambda x: user_counts[x['user_id']] >= self.args.user_min, axis=1)]
        print('User filtering done...')
        df = df[df.apply(lambda x: item_counts[x['item_id']] >= self.args.item_min, axis=1)]
        print('Item filtering done...')
        
        print('Second pass')
        self.args.logger('\tnum_users = ' + str(len(df['user_id'].unique())), 1)
        self.args.logger('\tnum_items = ' + str(len(df['item_id'].unique())), 1)
        self.args.logger('\tdf_shape  = ' + str(df.shape), 1)
        
        # If restricting users to certain regions, assign regional mapping dataframe to dataset
        if type(self.args.user_region_df) == pd.DataFrame:
            
            # Assign both dataframes as attributes of dataset
            self.user_region_df = self.args.user_region_df
            self.item_region_df = self.args.item_df[['item_id', 'region_id']]
            
        # Either way, make sure 'region_id' is no longer in item_df and set as dataset attribute
        self.item_df = self.args.item_df.drop(columns = ['region_id'], errors='ignore')
        
        # Original code normalized temporal values here
        
        print('Constructing datasets...')
        training_set = defaultdict(list)
        # Start counting users and items at 1 to facilitate sparse matrix computation
        # NOTE: this means these dictionaries will NOT be 0 indexed, careful tracking 
        #    of 0-indexed series is needed for development  
        num_users = 0
        num_items = 0
        item_to_idx = {}
        user_to_idx = {}
        idx_to_item = {}
        idx_to_user = {}
        
        # Iterate through items, creating dense dicts for item and users
        for row in df.itertuples():
            
            # New item
            if row.item_id not in item_to_idx:
                item_to_idx[row.item_id] = num_items
                idx_to_item[num_items] = row.item_id
                num_items += 1
                
            # New user
            if row.user_id not in user_to_idx:
                user_to_idx[row.user_id] = num_users
                idx_to_user[num_users] = row.user_id
                num_users += 1
                
            # Converts all ratings to positive implicit feedback
            training_set[user_to_idx[row.user_id]].append(
                    (item_to_idx[row.item_id], row.time))
        
        # Save item and use count as attributes of dataset
        self.num_users = num_users
        self.num_items = num_items
        
        # Sort training_set by time, so each users' observations are in order
        for user in training_set:
            training_set[user].sort(key=lambda x: x[1])
        
        # Create dictionaries for user to region and region to user
        if type(self.args.user_region_df) == pd.DataFrame:
            
            # Define default values to fall back on for users without regions                
            default_item_list = range(self.num_items)
            default_num_items = self.num_items
            
            # Only include users with enough data to be included after prefiltering
            active_user_mask = self.args.user_region_df['user_id'].isin(df['user_id'].unique())
            user_region_df = self.args.user_region_df.loc[active_user_mask]
            
            # Start with empty dictionaries for region lookups
            item_to_region = {}
            region_to_item = {}
            user_to_region_item_idx = {}
            user_to_num_items = {}
            region_str_to_items = {}
            
            # Iterate over tall input data where each row is a single item, region pair
            for row in self.item_region_df.itertuples():
                
                # Building dict with item_id as key, and value is that item's region
                item_to_region[row.item_id] = [int(row.region_id)]
                
                # Check if this item is being used in dataset (has adequate data)
                if item_to_idx.get(row.item_id):
                
                    # Building dict with region_id as key, and value is list of items in region
                    if row.region_id not in region_to_item:

                        # If this row is a new region then add entry to dict, where value is list with 1 value
                        region_to_item[row.region_id] = [item_to_idx.get(row.item_id)] 
                    else:

                        # If region already in dict, append new item_id to the value
                        region_to_item[row.region_id].append(item_to_idx.get(row.item_id))  
            
            # Iterate through user region pairs, and find all items in those regions
            for row in user_region_df.itertuples():
                
                # Building dict with cached_region_ids as key and all items for that region as value
                if row.cached_region_ids not in region_str_to_items:
                    
                    # Unpack string into list of region_ids
                    this_region_list = row.cached_region_ids.split(",")
                    
                    # Create empty list of all items in list of regions 
                    this_region_list_items = []
                    
                    # Iterate over regions and add all items to list
                    for this_region in this_region_list:
                        
                        # Adds all items associated with this_region, adds nothing if key not found
                        this_region_list_items.extend(region_to_item.get(int(this_region), []))
                        
                    # If all regions in list collectively have 0 items, replace with default list 
                    if this_region_list_items == []:
                        this_region_list_items = default_item_list
                    
                    # Add the full list to the dictionary
                    region_str_to_items[row.cached_region_ids] = this_region_list_items
                
                # Translate the user_id to the user_idx
                this_user_idx = user_to_idx[row.user_id] # Should this be a .get? what to default?
                
                # Building dict with user_id as key and all items associated with their regions as value
                user_to_region_item_idx[this_user_idx] = region_str_to_items.get(row.cached_region_ids, default_item_list)
                
                # Building a dict with user_id as key and number of items associated with their regions as value
                user_to_num_items[this_user_idx] = len(region_str_to_items.get(row.cached_region_ids, default_num_items))

            # Populate all users without region data with all items
            for user_idx in training_set:
                
                if user_idx not in user_to_region_item_idx:
                    user_to_region_item_idx[user_idx] = default_item_list
                    user_to_num_items[user_idx] = default_num_items
        
        # Create deep copy of training set before removing test and val for deployment
        deploy_set = copy.deepcopy(training_set)
        
        # Init lists of datasets
        training_times = {}
        
        # Only init val lists if they'll be used
        if self.args.val_set == 1:
            val_set = {} 
            val_times = {}
        
        test_set = {}
        test_times = {}
        # Map from user to set of items for easy lookup
        item_set_per_user = {}
        
        if self.args.val_set == 1:
          print("Trying to structure dataset with validation set, this isn't fully supported. ")

        for user in training_set:
            
            # If user has inadequate data for train/test split, use dummy values. (if val_set == 0, only 1 train, 1 test needed)
            if len(list(training_set[user])) < (2 + self.args.val_set):
                
                # Reviewed < 3 items, insert dummy values
                test_set[user] = (-1, -1)
                test_times[user] = (-1, -1)
                
                if self.args.val_set == 1:
                    val_set[user] = (-1, -1)
                    val_times[user] = (-1, -1)
                
            # User has adequate data, populate train, val, and test data
            else:
                
                # No validation set needed, only populate train/test
                if self.args.val_set == 0:
                    
                    # Remove last item from train to serve as test
                    test_item, test_time = training_set[user].pop() 
                    
                    # This lookback is what requires the starting at 1 indexing
                    last_item, last_time = training_set[user][-1] 
                    
                    # Test item is the most recent item by user, last item is the one previous
                    test_set[user] = (test_item, last_item) 
                    
                    # Time functionality currently not supported
                    test_times[user] = (test_time, last_time)
                
                #note: currently not well maintained, will need adjustments to work with some new functionality 
                else:
                    test_item, test_time = training_set[user].pop() #remove last item from train to serve as test
                    val_item, val_time = training_set[user].pop() #remove second to last item from train to serve as val
                    last_item, last_time = training_set[user][-1] #this lookback is what requires the starting at 1 indexing 
                    test_set[user] = (test_item, val_item) #test item is the most recent item by user, val item is the one previous
                    test_times[user] = (test_time, val_time)
                    val_set[user] = (val_item, last_item) #val item is second to last item by usaer, 'last item' is third to last (last in training set) 
                    val_times[user] = (val_time, last_time)
                    
            # Separate timestamps and create item set
            training_times[user] = copy.deepcopy(training_set[user])
            training_set[user] = [x[0] for x in training_set[user]]
            item_set_per_user[user] = set(training_set[user])
            
        # Iterate over users to get total count of training items
        num_train_items = 0
        for user in training_set:
            num_train_items += len(list(training_set[user]))

        # Set newly created datasets, dictionaries, and counts as dataset attributes
        self.deploy_set = deploy_set
        # self.deploy_times = deploy_times
        
        self.training_set = training_set
        self.training_times = training_times
        
        if self.args.val_set == 1:
            self.val_set = val_set
            self.val_times = val_times
        
        self.test_set = test_set
        self.test_times = test_times
        self.item_set_per_user = item_set_per_user

        self.item_to_idx = item_to_idx
        self.user_to_idx = user_to_idx
        self.idx_to_item = idx_to_item
        self.idx_to_user = idx_to_user
        
        if type(self.args.user_region_df) == pd.DataFrame:
            self.item_to_region = item_to_region
            self.region_to_item = region_to_item
            self.user_to_region_item_idx = user_to_region_item_idx
            self.region_str_to_items = region_str_to_items
            self.user_to_num_items = user_to_num_items
            self.user_region_df = user_region_df

        
        self.num_train_items = num_train_items


        # Full_features replaced 'content'
        if self.args.features == 'full_features':
          
            # Place index on users df and set as attribute of dataset
            print('Reading user demographics...')
            user_df = self.args.user_df
            user_df = user_df.set_index('user_id')
            self.user_df = user_df

            # Create dictionary to build sparse user feature matrix
            self.orig_indices = []
            for i in range(1, self.num_users):
                self.orig_indices.append(self.idx_to_user[i])
            self.user_feats = sp.csr_matrix(user_df.loc[self.orig_indices].values)
          
            # Repeat above for items instead of users
            print('Reading item demographics...')
            self.item_df = self.item_df.set_index('item_id')
            self.orig_item_indices = []
            for i in range(1, self.num_items):
                self.orig_item_indices.append(self.idx_to_item[i])
            self.item_feats = sp.csr_matrix(self.item_df.loc[self.orig_item_indices].values)
        
        else:
            self.user_feats = None
            self.item_feats = None
            
        # Create scipy.sparse matrices #NOTE: This is where indexing fix matters
        self.user_one_hot = sp.identity(self.num_users - 0).tocsr()
        self.item_one_hot = sp.identity(self.num_items - 0).tocsr()
        
        for user in deploy_set:

            # Separate timestamps and create item set
            deploy_set[user] = [x[0] for x in deploy_set[user]]
        
        # Sparse training matrices
        train_rows = []
        train_cols = []
        train_vals = []
        train_prev_vals = []
        train_times = []
        train_prev_times = []
        
        # Init list for generating negative samples #note - must happen after .pop() to avoid leakage
        weighted_item_list = []
        
        # Restructure data into training rows with previous items as feature
        for user in self.training_set:

            # Start with 1st item instead of 0th item of training data to allow for prev item reference
            for i in range(1, len(list(self.training_set[user]))):
                
                item = self.training_set[user][i]
                item_prev = self.training_set[user][i-1]
                item_time = self.training_times[user][i]
                item_prev_time = self.training_times[user][i-1]
                train_rows.append(user)
                train_cols.append(item)
                train_vals.append(1)
                train_prev_vals.append(item_prev)
                train_times.append(item_time[1])
                train_prev_times.append(item_prev_time[1])
                
                # Add one observation to weighted item list
                weighted_item_list.append(item) 
                
        # Normalize values then set weights as attribute for negative sampling
        self.weighted_item_list = weighted_item_list
        
        # Determine mean and std to normalize timestamps
        self.train_mean = np.mean(train_times)
        self.train_std  = np.std(train_times)
        self.ONE_YEAR = (60 * 60 * 24 * 365) / self.train_mean
        self.ONE_DAY = (60 * 60 * 24) / self.train_mean
        train_times = (train_times - self.train_mean) / self.train_std

        self.sp_train = sp.coo_matrix((train_vals, (train_rows, train_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_train_prev = sp.coo_matrix((train_prev_vals, (train_rows, train_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_train_times = sp.coo_matrix((train_times, (train_rows, train_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_train_prev_times = sp.coo_matrix((train_prev_times, (train_rows, train_cols)),
                shape=(self.num_users, self.num_items))
        
        # Repeat training processing for validation set
        if self.args.val_set == 1:
            
            # Sparse validation matrices
            val_rows = []
            val_cols = []
            val_vals = []
            val_prev_vals = []
            val_times = []
            val_prev_times = []
            for user in self.val_set:
                item = self.val_set[user][0]
                item_prev = self.val_set[user][1]
                item_time = self.val_times[user][0]
                item_prev_time = self.val_times[user][1]
                if item == -1 or item_prev == -1:
                    continue

                val_rows.append(user)
                val_cols.append(item)
                val_vals.append(1)
                val_prev_vals.append(item_prev)
                val_times.append(item_time)
                val_prev_times.append(item_prev_time)

            #normalize val timestamps with train mean/std (avoid leakage)    
            val_times = (val_times - self.train_mean) / self.train_std
                
            self.sp_val = sp.coo_matrix((val_vals, (val_rows, val_cols)),
                    shape=(self.num_users, self.num_items))
            self.sp_val_prev = sp.coo_matrix((val_prev_vals, (val_rows, val_cols)),
                    shape=(self.num_users, self.num_items))
            self.sp_val_times = sp.coo_matrix((val_times, (val_rows, val_cols)),
                    shape=(self.num_users, self.num_items))
            self.sp_val_prev_times = sp.coo_matrix((val_prev_times, (val_rows, val_cols)),
                    shape=(self.num_users, self.num_items))

        # Repeat processing for test set
        test_rows = []
        test_cols = []
        test_vals = []
        test_prev_vals = []
        test_times = []
        test_prev_times = []
        for user in self.test_set:
            item = self.test_set[user][0] #for test and val set, this_item is 0
            item_prev = self.test_set[user][1] #prev_item is 1
            item_time = self.test_times[user][0]
            item_prev_time = self.test_times[user][1]
            if item == -1 or item_prev == -1:
                continue

            test_rows.append(user)
            test_cols.append(item)
            test_vals.append(1)
            test_prev_vals.append(item_prev)
            test_times.append(item_time)
            test_prev_times.append(item_prev_time)

        # Normalize test timestamps with train mean/std (avoid leakage)    
        test_times = (test_times - self.train_mean) / self.train_std
            
        self.sp_test = sp.coo_matrix((test_vals, (test_rows, test_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_test_prev = sp.coo_matrix((test_prev_vals, (test_rows, test_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_test_times = sp.coo_matrix((test_times, (test_rows, test_cols)),
                shape=(self.num_users, self.num_items))
        self.sp_test_prev_times = sp.coo_matrix((test_prev_times, (test_rows, test_cols)),
                shape=(self.num_users, self.num_items))

        # Sparse training matrices for deploy set #old deploy method
        deploy_rows = []
        deploy_cols = []
        deploy_vals = []
        deploy_prev_vals = []
        
        #deploy_times = []
        #deploy_prev_times = []
        
        # Check if region data should be used
        if type(self.args.user_region_df) == pd.DataFrame:
            
            # Create empty dict to find index user starts at for efficient user lookups
            deploy_user_start_idx = {}
            
            # Start dict with user 0's starting point, index 0
            deploy_user_start_idx[0] = 0
            
            # Iterate over the copy of the training set, for previous items and user feats
            for user in self.deploy_set:
                
                # Find index of last item, to populate 'prev_item'
                last_item_idx = len(list(self.deploy_set[user])) - 1
                
                item_prev = self.deploy_set[user][last_item_idx - 1]
                
                this_user_items = self.user_to_region_item_idx.get(user)
                
                # The next user will start after this one, so add this user's num rows to this user's start idx
                deploy_user_start_idx[user + 1] = len(this_user_items) + deploy_user_start_idx[user]
                
                for item in this_user_items:
                    
                    # For each item add a row for this user, their prev item, a positive, and each item
                    deploy_rows.append(user)
                    deploy_cols.append(item)
                    deploy_vals.append(1)
                    deploy_prev_vals.append(item_prev)
                
        # If region data isn't being used, build exhaustive list of all user, item pairs
        else:
            
            # Create empty dict to find index user starts at for efficient user lookups
            deploy_user_start_idx = {}
            
            # Start dict with user 0's starting point, index 0
            deploy_user_start_idx[0] = 0

            # Iterate over the copy of the training set, for previous items and user feats
            for user in self.deploy_set:

                # Find index of last item, to populate 'prev_item'
                last_item_idx = len(list(self.deploy_set[user])) - 1

                item_prev = self.deploy_set[user][last_item_idx - 1]

                # The next user will start after this one, so add this user's num rows to this user's start idx
                deploy_user_start_idx[user + 1] = self.num_items + deploy_user_start_idx[user]

                for item in range(self.num_items):

                    # For each item add a row for this user, their prev item, a positive, and each item
                    deploy_rows.append(user)
                    deploy_cols.append(item)
                    deploy_vals.append(1)
                    deploy_prev_vals.append(item_prev)

        self.deploy_user_start_idx = deploy_user_start_idx
        self.deploy_rows = deploy_rows
        self.deploy_cols = deploy_cols
        self.deploy_vals = deploy_vals
        self.deploy_prev_vals = deploy_prev_vals
        
        # Assign the totla number of rows in deploy set as model attribute, to determine batch size
        self.deploy_num_rows = len(deploy_rows)
        
    # Function to generate model input training data, actual values
    def generate_pos_train_batch_sp(self, ith_seed = 1, items_per_user = 3):
        """Build the positive (observed) feature matrix for the training split.

        Each stored training interaction is repeated ``items_per_user`` times.

        Args:
            ith_seed: passed to ``np.random.seed``; nothing is sampled here, but
                the global NumPy generator is still reset.
            items_per_user: number of copies made of every interaction.

        Returns:
            ``(pos_users, pos_feats)``: the one-hot user rows, and the horizontal
            stack [user one-hot | previous-item one-hot | item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Stored indices have no 0 entry, so shift them down by one after repeating
        def expand(stored):
            return np.repeat(stored, items_per_user) - 1

        user_rows = expand(self.sp_train.row)
        prev_rows = expand(self.sp_train_prev.data)
        item_rows = expand(self.sp_train.col)

        # Look up the one-hot row of every user / previous item / observed item
        pos_users = self.user_one_hot[user_rows]
        one_hot_blocks = [pos_users, self.item_one_hot[prev_rows], self.item_one_hot[item_rows]]
        pos_feats = sp.hstack(one_hot_blocks)

        # Without content features the one-hot stack is the final answer
        if self.args.features != 'full_features':
            return (pos_users, pos_feats)

        # 'full_features' appends the user and item content columns
        content_blocks = [pos_feats, self.user_feats[user_rows], self.item_feats[item_rows]]
        return (pos_users, sp.hstack(content_blocks))
      
    # Generate random observations of the same size as actual training input
    def generate_neg_train_batch_sp(self, ith_seed = 1, items_per_user = 3):
        """Build a randomly sampled (negative) feature matrix for the training split.

        The user and previous-item columns mirror the positive training batch
        (each interaction repeated ``items_per_user`` times); the item column is
        replaced by a sampled item.  Sampling strategy, in priority order:

        1. ``self.args.user_region_df`` is a DataFrame: sample uniformly from
           ``self.user_to_region_item_idx`` for that user, falling back to all
           items when the user has no (or an empty) entry.
        2. ``self.args.weighted_sampling == 1``: sample uniformly from
           ``self.weighted_item_list``.
        3. Otherwise: sample uniformly over all item columns of ``self.sp_train``.

        All three strategies draw from ``np.random`` so the result is fully
        determined by ``ith_seed``.

        Args:
            ith_seed: seed for the global NumPy generator.
            items_per_user: number of negatives generated per interaction.

        Returns:
            ``(neg_users, neg_feats)``: the one-hot user rows, and the stack
            [user one-hot | previous-item one-hot | sampled-item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Subtract 1 to account for missing 0 index
        user_indices = np.repeat(self.sp_train.row, items_per_user) - 1
        prev_indices = np.repeat(self.sp_train_prev.data, items_per_user) - 1

        num_rows = len(user_indices)
        # Stored item ids run from 1 up to (but excluding) this bound
        item_id_bound = self.sp_train.shape[1]

        # Check if region data should be used
        if isinstance(self.args.user_region_df, pd.DataFrame):
            neg_indices = []

            for user_idx in user_indices:
                # The lookup table is keyed by the stored (un-shifted) user index
                eligible = self.user_to_region_item_idx.get(user_idx + 1, [])

                if len(eligible) > 0:
                    # np.random (not the stdlib `random`) so ith_seed controls the draw
                    item_id = eligible[np.random.randint(len(eligible))]
                else:
                    # No region items for this user: default to all items
                    item_id = np.random.randint(1, item_id_bound)

                neg_indices.append(item_id - 1)

        elif self.args.weighted_sampling == 1:
            # Draw positions into the weighted list, then shift the item ids down by one
            picks = np.random.choice(len(self.weighted_item_list), size = num_rows)
            neg_indices = [(self.weighted_item_list[i] - 1) for i in picks]

        else:
            neg_indices = np.random.randint(1, item_id_bound,
                          size=num_rows, dtype=np.int32) - 1

        # Convert from indices to one hot matrices
        neg_users = self.user_one_hot[user_indices]
        prev_items = self.item_one_hot[prev_indices]
        neg_items = self.item_one_hot[neg_indices]

        # Horizontally stack sparse matrices to get negative feature matrices
        neg_feats = sp.hstack([neg_users, prev_items, neg_items])

        # Full_features replaced 'content', adds both user and item features
        if self.args.features == 'full_features':
            # Join with content data
            user_content = self.user_feats[user_indices]
            neg_item_content = self.item_feats[neg_indices]
            neg_feats = sp.hstack([neg_feats, user_content, neg_item_content])

        return(neg_users, neg_feats)
    
    # Dataset containing only correct inputs; the model should learn to score these well
    def generate_pos_val_batch_sp(self, ith_seed = 1): 
        """Build the positive (observed) feature matrix for the validation split.

        Args:
            ith_seed: passed to ``np.random.seed``; nothing is sampled here, but
                the global NumPy generator is still reset.

        Returns:
            ``(pos_users, pos_feats)``: the one-hot user rows, and the horizontal
            stack [user one-hot | previous-item one-hot | item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Shift the stored indices down by one, one row per validation interaction
        user_rows = self.sp_val.row - 1
        prev_rows = self.sp_val_prev.data - 1
        item_rows = self.sp_val.col - 1

        # Look up the one-hot row of every user / previous item / observed item
        pos_users = self.user_one_hot[user_rows]
        one_hot_blocks = [pos_users, self.item_one_hot[prev_rows], self.item_one_hot[item_rows]]
        pos_feats = sp.hstack(one_hot_blocks)

        # Without content features the one-hot stack is the final answer
        if self.args.features != 'full_features':
            return (pos_users, pos_feats)

        # 'full_features' appends the user and item content columns
        content_blocks = [pos_feats, self.user_feats[user_rows], self.item_feats[item_rows]]
        return (pos_users, sp.hstack(content_blocks))
      
    # Dataset containing random samples, model should learn these are less likely than the above
    def generate_neg_val_batch_sp(self, ith_seed = 1, items_per_user = 10): 
        """Build a randomly sampled (negative) feature matrix for the validation split.

        Each validation interaction is repeated ``items_per_user`` times and its
        item column replaced by a sampled item.  Sampling strategy, in priority
        order:

        1. ``self.args.user_region_df`` is a DataFrame: sample uniformly from
           ``self.user_to_region_item_idx`` for that user, falling back to all
           items when the user has no (or an empty) entry.
        2. ``self.args.weighted_sampling == 1``: sample uniformly from
           ``self.weighted_item_list``.
        3. Otherwise: sample uniformly over all item columns of ``self.sp_val``.

        All three strategies draw from ``np.random`` so the result is fully
        determined by ``ith_seed``.

        Side effect: the sampled (0-based) item indices are stored on
        ``self.neg_ind``.

        Args:
            ith_seed: seed for the global NumPy generator.
            items_per_user: number of negatives generated per interaction.

        Returns:
            ``(neg_users, neg_feats)``: the one-hot user rows, and the stack
            [user one-hot | previous-item one-hot | sampled-item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Subtract 1 to account for missing 0 index
        user_indices = np.repeat(self.sp_val.row, items_per_user) - 1
        prev_indices = np.repeat(self.sp_val_prev.data, items_per_user) - 1

        num_rows = len(user_indices)
        # Stored item ids run from 1 up to (but excluding) this bound
        item_id_bound = self.sp_val.shape[1]

        # Check if region data should be used
        if isinstance(self.args.user_region_df, pd.DataFrame):
            neg_indices = []

            for user_idx in user_indices:
                # The lookup table is keyed by the stored (un-shifted) user index
                eligible = self.user_to_region_item_idx.get(user_idx + 1, [])

                if len(eligible) > 0:
                    # np.random (not the stdlib `random`) so ith_seed controls the draw
                    item_id = eligible[np.random.randint(len(eligible))]
                else:
                    # No region items for this user: default to all items
                    item_id = np.random.randint(1, item_id_bound)

                neg_indices.append(item_id - 1)

        elif self.args.weighted_sampling == 1:
            # Draw positions into the weighted list, then shift the item ids down by one
            picks = np.random.choice(len(self.weighted_item_list), size = num_rows)
            neg_indices = [(self.weighted_item_list[i] - 1) for i in picks]

        else:
            neg_indices = np.random.randint(1, item_id_bound,
                          size=num_rows, dtype=np.int32) - 1

        # Keep the sampled indices available on the instance
        self.neg_ind = neg_indices

        # Convert from indices to one-hot matrices
        neg_users = self.user_one_hot[user_indices]
        prev_items = self.item_one_hot[prev_indices]
        neg_items = self.item_one_hot[neg_indices]

        # Horizontally stack sparse matrices to get negative feature matrices
        neg_feats = sp.hstack([neg_users, prev_items, neg_items])

        # Full_features replaced 'content', adds both user and item features
        if self.args.features == 'full_features':
            # Join with content data
            user_content = self.user_feats[user_indices]
            neg_item_content = self.item_feats[neg_indices]
            neg_feats = sp.hstack([neg_feats, user_content, neg_item_content])

        return (neg_users, neg_feats)

    # Dataset containing only correct inputs; the model should learn to score these well
    def generate_pos_test_batch_sp(self, ith_seed = 1): 
        """Build the positive (observed) feature matrix for the test split.

        Args:
            ith_seed: passed to ``np.random.seed``; nothing is sampled here, but
                the global NumPy generator is still reset.

        Returns:
            ``(pos_users, pos_feats)``: the one-hot user rows, and the horizontal
            stack [user one-hot | previous-item one-hot | item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Shift the stored indices down by one, one row per test interaction
        user_rows = self.sp_test.row - 1
        prev_rows = self.sp_test_prev.data - 1
        item_rows = self.sp_test.col - 1

        # Look up the one-hot row of every user / previous item / observed item
        pos_users = self.user_one_hot[user_rows]
        one_hot_blocks = [pos_users, self.item_one_hot[prev_rows], self.item_one_hot[item_rows]]
        pos_feats = sp.hstack(one_hot_blocks)

        # Without content features the one-hot stack is the final answer
        if self.args.features != 'full_features':
            return (pos_users, pos_feats)

        # 'full_features' appends the user and item content columns
        content_blocks = [pos_feats, self.user_feats[user_rows], self.item_feats[item_rows]]
        return (pos_users, sp.hstack(content_blocks))
      
    # Dataset containing random samples, model should learn these are less likely than the above
    def generate_neg_test_batch_sp(self, ith_seed = 1, items_per_user = 10): 
        """Build a randomly sampled (negative) feature matrix for the test split.

        Each test interaction is repeated ``items_per_user`` times and its item
        column replaced by a sampled item.  Sampling strategy, in priority order:

        1. ``self.args.user_region_df`` is a DataFrame: sample uniformly from
           ``self.user_to_region_item_idx`` for that user, falling back to all
           items when the user has no (or an empty) entry.
        2. ``self.args.weighted_sampling == 1``: sample uniformly from
           ``self.weighted_item_list``.
        3. Otherwise: sample uniformly over all item columns of ``self.sp_test``.

        All three strategies draw from ``np.random`` so the result is fully
        determined by ``ith_seed``.

        Side effect: the sampled (0-based) item indices are stored on
        ``self.neg_ind``.

        Args:
            ith_seed: seed for the global NumPy generator.
            items_per_user: number of negatives generated per interaction.

        Returns:
            ``(neg_users, neg_feats)``: the one-hot user rows, and the stack
            [user one-hot | previous-item one-hot | sampled-item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        np.random.seed(ith_seed)

        # Subtract 1 to account for missing 0 index
        user_indices = np.repeat(self.sp_test.row, items_per_user) - 1
        prev_indices = np.repeat(self.sp_test_prev.data, items_per_user) - 1

        num_rows = len(user_indices)
        # Stored item ids run from 1 up to (but excluding) this bound
        item_id_bound = self.sp_test.shape[1]

        # Check if region data should be used
        if isinstance(self.args.user_region_df, pd.DataFrame):
            neg_indices = []

            for user_idx in user_indices:
                # The lookup table is keyed by the stored (un-shifted) user index
                eligible = self.user_to_region_item_idx.get(user_idx + 1, [])

                if len(eligible) > 0:
                    # np.random (not the stdlib `random`) so ith_seed controls the draw
                    item_id = eligible[np.random.randint(len(eligible))]
                else:
                    # No region items for this user: default to all items
                    item_id = np.random.randint(1, item_id_bound)

                neg_indices.append(item_id - 1)

        elif self.args.weighted_sampling == 1:
            # Draw positions into the weighted list, then shift the item ids down by one
            picks = np.random.choice(len(self.weighted_item_list), size = num_rows)
            neg_indices = [(self.weighted_item_list[i] - 1) for i in picks]

        else:
            neg_indices = np.random.randint(1, item_id_bound,
                          size=num_rows, dtype=np.int32) - 1

        # Keep the sampled indices available on the instance
        self.neg_ind = neg_indices

        # Convert from indices to one-hot matrices
        neg_users = self.user_one_hot[user_indices]
        prev_items = self.item_one_hot[prev_indices]
        neg_items = self.item_one_hot[neg_indices]

        # Horizontally stack sparse matrices to get negative feature matrices
        neg_feats = sp.hstack([neg_users, prev_items, neg_items])

        # Full_features replaced 'content', adds both user and item features
        if self.args.features == 'full_features':
            # Join with content data
            user_content = self.user_feats[user_indices]
            neg_item_content = self.item_feats[neg_indices]
            neg_feats = sp.hstack([neg_feats, user_content, neg_item_content])

        return (neg_users, neg_feats)
        
    # All user, item pairs, to be evaluated in chunks defined by idx_sample
    def generate_deploy_batch_sp(self, idx_sample, one_pass = 1): 
        """Build the feature matrix for a slice of the deploy set.

        Args:
            idx_sample: iterable of positions into ``self.deploy_rows`` /
                ``deploy_cols`` / ``deploy_vals`` / ``deploy_prev_vals``; it is
                iterated once per list, so it must be re-iterable (e.g. a range).
            one_pass: accepted but not used by this method.

        Returns:
            ``(pos_users, pos_feats)``: the one-hot user rows, and the horizontal
            stack [user one-hot | previous-item one-hot | item one-hot], widened
            with user and item content columns when
            ``self.args.features == 'full_features'``.
        """
        # Pull the requested positions out of one of the parallel deploy lists
        def take(column):
            return [column[pos] for pos in idx_sample]

        rows = take(self.deploy_rows)
        cols = take(self.deploy_cols)
        vals = take(self.deploy_vals)
        prev_vals = take(self.deploy_prev_vals)

        # Two COO matrices over the same coordinates: one carrying the values,
        # one carrying each row's previous item
        grid = (self.num_users, self.num_items)
        this_sp_deploy = sp.coo_matrix((vals, (rows, cols)), shape=grid)
        this_sp_deploy_prev = sp.coo_matrix((prev_vals, (rows, cols)), shape=grid)

        # Shift indices down by one, as the other batch generators do.
        # NOTE(review): the deploy-set builder in this class appends item columns
        # via `for item in range(self.num_items)`, i.e. starting at 0, so `col - 1`
        # becomes -1 for item 0 and selects the last one-hot row - confirm the
        # intended index base.
        user_rows = this_sp_deploy.row - 1
        prev_rows = this_sp_deploy_prev.data - 1
        item_rows = this_sp_deploy.col - 1

        # Look up the one-hot row of every user / previous item / candidate item
        pos_users = self.user_one_hot[user_rows]
        one_hot_blocks = [pos_users, self.item_one_hot[prev_rows], self.item_one_hot[item_rows]]
        pos_feats = sp.hstack(one_hot_blocks)

        # Without content features the one-hot stack is the final answer
        if self.args.features != 'full_features':
            return (pos_users, pos_feats)

        # 'full_features' appends the user and item content columns
        content_blocks = [pos_feats, self.user_feats[user_rows], self.item_feats[item_rows]]
        return (pos_users, sp.hstack(content_blocks))

Recommender Object

---

In [5]:
import pandas as pd
import scipy.sparse as sp
import json
import random
import numpy as np
import json
import sys
import tensorflow as tf
from tensorflow.keras import layers

# DEBUG - for time stamps
import time

# A layer object, where one layer represents the entire PRME algorithm
class PRME(layers.Layer):
    """Single Keras layer implementing the whole PRME scoring function.

    Holds one linear weight and one ``factor_dim``-dimensional embedding per
    input feature.  For a sparse feature row ``x`` the layer returns

        x . w  +  0.5 * sum_i sum_j x_i x_j ||v_i - v_j||^2

    computed in closed form (no explicit pairwise loop).
    """

    def __init__(self, input_dim, factor_dim, seed):
        """Create the randomly initialised weights.

        Args:
            input_dim: number of input feature columns.
            factor_dim: width of each feature embedding.
            seed: value passed to ``tf.random.set_seed`` before the weights are drawn.
        """
        # Seed first so both weight matrices below are reproducible
        tf.random.set_seed(seed)
        super().__init__()

        # Per-feature linear term, shape (input_dim, 1)
        bias_init = tf.random_normal_initializer()
        self.var_linear = tf.Variable(
            initial_value=bias_init(shape=(input_dim, 1), dtype='float32'),
            trainable=True)

        # Per-feature embedding, shape (input_dim, factor_dim)
        embedding_init = tf.random_normal_initializer()
        self.var_factors = tf.Variable(
            initial_value=embedding_init(shape=(input_dim, factor_dim), dtype='float32'),
            trainable=True)

    def call(self, sparse_feats):
        """Score every row of ``sparse_feats``.

        Args:
            sparse_feats: sparse tensor with ``input_dim`` columns, one row per
                (user, previous item, item) example.

        Returns:
            Dense tensor with one score per input row (column vector).
        """
        linear_bias = tf.sparse.sparse_dense_matmul(sparse_feats, self.var_linear)

        # ||v_i||^2 for every feature i
        squared_norms = tf.reduce_sum(tf.square(self.var_factors), axis=1, keepdims = True)

        # sum_i x_i and sum_i x_i v_i for every row
        feats_sum = tf.sparse.reduce_sum(sparse_feats, axis=1, keepdims = True)
        pooled_emb = tf.sparse.sparse_dense_matmul(sparse_feats, self.var_factors)

        # (sum_i x_i ||v_i||^2) * (sum_j x_j); appears twice in the expansion
        term_1 = tf.sparse.sparse_dense_matmul(sparse_feats, squared_norms) * feats_sum
        # 2 * ||sum_i x_i v_i||^2, the cross term of the squared distance
        term_2 = 2 * tf.reduce_sum(tf.square(pooled_emb), axis=1, keepdims = True)
        term_3 = term_1

        return linear_bias + 0.5 * (term_1 - term_2 + term_3)

# A layer object, where one layer represents the entire TransFM algorithm
class TransFM(layers.Layer):
    """Single Keras layer implementing the whole TransFM scoring function.

    Holds, per input feature, one linear weight, one embedding ``v`` and one
    translation vector ``t`` (both ``factor_dim`` wide).  For a sparse feature
    row ``x`` the layer returns

        x . w  +  0.5 * sum_i sum_j x_i x_j ||v_i + t_i - v_j||^2
               -  0.5 * ||sum_i x_i t_i||^2

    computed in closed form (no explicit pairwise loop).
    """

    def __init__(self, input_dim, factor_dim, seed):
        """Create the randomly initialised weights.

        Args:
            input_dim: number of input feature columns.
            factor_dim: width of each embedding / translation vector.
            seed: value passed to ``tf.random.set_seed`` before the weights are drawn.
        """
        # Seed first so all weight matrices below are reproducible
        tf.random.set_seed(seed)
        super().__init__()

        # Per-feature linear term, shape (input_dim, 1)
        bias_init = tf.random_normal_initializer()
        self.var_linear = tf.Variable(
            initial_value=bias_init(shape=(input_dim, 1), dtype='float32'),
            trainable=True)

        # Embedding and translation matrices share one initializer object
        factor_init = tf.random_normal_initializer()
        self.var_emb_factors = tf.Variable(
            initial_value=factor_init(shape=(input_dim, factor_dim), dtype='float32'),
            trainable=True)
        self.var_trans_factors = tf.Variable(
            initial_value=factor_init(shape=(input_dim, factor_dim), dtype='float32'),
            trainable=True)

    def call(self, sparse_feats):
        """Score every row of ``sparse_feats``.

        Args:
            sparse_feats: sparse tensor with ``input_dim`` columns, one row per
                (user, previous item, item) example.

        Returns:
            Dense tensor with one score per input row (column vector).
        """
        linear_bias = tf.sparse.sparse_dense_matmul(sparse_feats, self.var_linear)

        # Per-feature scalars: ||v_i||^2, ||t_i||^2 and v_i . t_i
        emb_sq_norm = tf.reduce_sum(tf.square(self.var_emb_factors), axis=1, keepdims = True)
        trans_sq_norm = tf.reduce_sum(tf.square(self.var_trans_factors), axis=1, keepdims = True)
        emb_dot_trans = tf.reduce_sum(
            tf.math.multiply(self.var_emb_factors, self.var_trans_factors),
            axis=1, keepdims=True)

        # Per-row aggregates: sum_i x_i, sum_i x_i v_i and sum_i x_i t_i
        feats_sum = tf.sparse.reduce_sum(sparse_feats, axis=1, keepdims = True)
        emb_mul = tf.sparse.sparse_dense_matmul(sparse_feats, self.var_emb_factors)
        trans_mul = tf.sparse.sparse_dense_matmul(sparse_feats, self.var_trans_factors)

        # Terms 1-3: the squared-norm parts of the expanded distance
        term_1 = tf.sparse.sparse_dense_matmul(sparse_feats, emb_sq_norm) * feats_sum
        term_2 = tf.sparse.sparse_dense_matmul(sparse_feats, trans_sq_norm) * feats_sum
        term_3 = term_1

        # Term 4: 2 * (sum_i x_i v_i . t_i) * (sum_j x_j)
        emb_trans_weighted = tf.sparse.sparse_dense_matmul(sparse_feats, emb_dot_trans)
        term_4 = 2 * emb_trans_weighted * feats_sum

        # Terms 5-6: the cross terms between pooled embeddings / translations
        term_5 = 2 * tf.reduce_sum(tf.square(emb_mul), axis=1, keepdims=True)
        term_6 = 2 * tf.reduce_sum(trans_mul * emb_mul, axis=1, keepdims=True)

        # Squared norm of the pooled translation vector, subtracted at the end
        diag_term = tf.reduce_sum(tf.square(trans_mul), axis=1, keepdims=True)

        return linear_bias + 0.5 * (term_1 + term_2 + term_3
                + term_4 - term_5 - term_6) - 0.5 * diag_term

# The class representing the recommender, contains methods for training and recommending
class Recommender:
    def __init__(self, dataset, args, ckpt_path="./tf_ckpts"):
        """Build the model layer, loss, optimizer and checkpoint manager.

        Args:
            dataset: object providing ``generate_pos_train_batch_sp`` and
                ``generate_neg_train_batch_sp``.
            args: settings object; this constructor reads ``model``, ``num_dims``,
                ``random_seed``, ``starting_lr``, ``secondary_reg_scale``,
                ``linear_reg``, ``emb_reg``, ``trans_reg`` (TransFM only) and calls
                ``logger(message, 1)``.
            ckpt_path: directory handed to ``tf.train.CheckpointManager``.

        Raises:
            ValueError: if ``args.model`` is neither ``"PRME"`` nor ``"TransFM"``.
        """
        self.dataset = dataset
        self.args = args
        self.ckpt_path = ckpt_path

        # Fail fast: any other value would leave loss_fn / ckpt undefined and only
        # surface later as an AttributeError
        if self.args.model not in ("PRME", "TransFM"):
            raise ValueError(
                "Unsupported model '{}': expected 'PRME' or 'TransFM'".format(self.args.model))

        # Use a training batch to figure out feature dimensionality and init inputs
        pos_users, pos_feats = self.dataset.generate_pos_train_batch_sp(ith_seed = 1)
        neg_users, neg_feats = self.dataset.generate_neg_train_batch_sp(ith_seed = 1)

        # The loss reads the current batch from these attributes at call time, so a
        # new batch can be swapped in without rebuilding the loss.
        # NOTE(review): the visible part of train() rebuilds batches into local
        # variables only; unless it also assigns these two attributes, every
        # optimizer step keeps using this first batch.
        _, self.train_pos_feats = self.feed_dict(pos_users, pos_feats)
        _, self.train_neg_feats = self.feed_dict(neg_users, neg_feats)

        self.feature_dim = pos_feats.shape[1]
        self.args.logger('Feature dimension = ' + str(self.feature_dim) + "x" + str(self.args.num_dims), 1)

        # Init an empty keras model with Adam optimizer
        self.model = tf.keras.Sequential()
        self.opt = tf.keras.optimizers.Adam(learning_rate=self.args.starting_lr)

        # Increases as model quality increases, multiplied by -1 for minimization
        def prereg_loss():
            margin = self.model(self.train_pos_feats) - self.model(self.train_neg_feats)
            return tf.reduce_sum(tf.math.log(1e-6 + tf.math.sigmoid(
                    margin * self.args.secondary_reg_scale))) * -1

        if self.args.model == "PRME":
            # The only 'layer' is PRME, this declaration will initialize new random matrices
            layer = PRME(self.feature_dim, self.args.num_dims, self.args.random_seed)
            self.model.add(layer)

            # Save low-dimensional features as attributes for later reference
            self.var_linear = layer.var_linear
            self.var_factors = layer.var_factors

            # L2 regularization, using scaling passed in from args
            l2_reg = lambda: tf.add_n([
                tf.reduce_sum(tf.math.square(layer.var_linear)) * self.args.linear_reg,
                tf.reduce_sum(tf.math.square(layer.var_factors)) * self.args.emb_reg
            ])

            # Variables stored in every checkpoint (names are part of the checkpoint format)
            tracked = {"var_linear": self.var_linear, "var_factors": self.var_factors}

        else:
            # The only 'layer' is TransFM, this declaration will initialize new random matrices
            layer = TransFM(self.feature_dim, self.args.num_dims, self.args.random_seed)
            self.model.add(layer)

            # Save low-dimensional features as attributes for later reference
            self.var_linear = layer.var_linear
            self.var_emb_factors = layer.var_emb_factors
            self.var_trans_factors = layer.var_trans_factors

            # L2 regularization, using scaling passed in from args
            l2_reg = lambda: tf.add_n([
                tf.reduce_sum(tf.math.square(layer.var_linear)) * self.args.linear_reg,
                tf.reduce_sum(tf.math.square(layer.var_emb_factors)) * self.args.emb_reg,
                tf.reduce_sum(tf.math.square(layer.var_trans_factors)) * self.args.trans_reg
            ])

            # Variables stored in every checkpoint (names are part of the checkpoint format)
            tracked = {"var_linear": self.var_linear,
                       "var_emb_factors": self.var_emb_factors,
                       "var_trans_factors": self.var_trans_factors}

        # Total loss expressed as sum of model loss and l2 regularization
        self.loss_fn = lambda: tf.add_n([prereg_loss(), l2_reg()])

        # Checkpoint the step counter, optimizer state and model variables; keep the 3 newest
        self.ckpt = tf.train.Checkpoint(step = tf.Variable(1), optimizer = self.opt, **tracked)
        self.ckpt_manager = tf.train.CheckpointManager(self.ckpt, self.ckpt_path, max_to_keep=3)

        # Declare the minimization should occur by modifying all trainable weights
        self.var_list_fn = lambda: self.model.trainable_weights

    # Structure user and object features into sparse inputs to model
    def feed_dict(self, user_obj, feat_obj):
        """Convert scipy sparse batches into the inputs the model expects.

        Args:
            user_obj: scipy sparse matrix of one-hot user rows.
            feat_obj: scipy sparse feature matrix for the same rows.

        Returns:
            ``(pl_users, sparse_feats)``: the column index of every non-zero in
            ``user_obj``, and a float32 ``tf.SparseTensor`` holding the stored
            entries of ``feat_obj``.
        """
        pl_users = user_obj.nonzero()[1]

        # Take coordinates and values from one COO view so they always line up;
        # mixing nonzero() (drops explicitly stored zeros) with .data (keeps them)
        # yields arrays of different length whenever a zero is stored.
        coo_feats = feat_obj.tocoo()
        pl_indices = np.stack([coo_feats.row, coo_feats.col], axis=1).astype(np.int64)
        pl_values = coo_feats.data.astype('float32')
        pl_shape = feat_obj.shape

        sparse_feats = tf.SparseTensor(pl_indices, pl_values, pl_shape)
        return pl_users, sparse_feats

    def format_output(self, user_list, item_list, score_list, json_out = False):
        """Package parallel prediction lists as a DataFrame or a nested dict.

        Args:
            user_list: user id of every prediction row.
            item_list: item id of every prediction row.
            score_list: score of every prediction row.
            json_out: when True return a JSON-compatible dict, otherwise a
                DataFrame.

        Returns:
            ``json_out=False``: DataFrame with columns ``user_id``, ``item_id``
            and ``score``.
            ``json_out=True``: ``{user_id: {item_id: score}}`` with users in
            order of first appearance and all keys converted to strings.  A user
            whose first item id is the empty string (the placeholder emitted for
            unknown users) maps to ``{}``.  Empty input gives ``{}``.
        """
        out_df = pd.DataFrame({
            'user_id': user_list,
            'item_id': item_list,
            'score': score_list,
        })

        if not json_out:
            return out_df

        final_json = {}

        # One pass over the frame; sort=False keeps users in order of first appearance
        for user_id, user_sample in out_df.groupby('user_id', sort=False):
            user_sample = user_sample.drop(columns = ['user_id']).set_index('item_id')

            # Let pandas serialise the scores (same number formatting as before),
            # then parse it back instead of splicing JSON text by character offset
            item_scores = json.loads(user_sample.to_json(orient = 'columns'))['score']

            # if items and scores are empty strings, replace with empty dictionary
            if next(iter(item_scores), None) == "":
                item_scores = {}

            final_json[str(user_id)] = item_scores

        return final_json
    # Score every deploy-set item for a single user and return that user's top-k items
    def user_preds(self, user_idx, item_whitelist=None):
        """Score one user's deploy rows and return the highest-scoring items.

        Args:
            user_idx: index of the user in the dataset's deploy structures.
            item_whitelist: accepted but not used by this method.

        Returns:
            ``(user_list, top_k_items, top_k_scores)``: three lists of equal
            length, ordered from highest to lowest score, holding at most
            ``self.args.return_k_preds`` entries.
        """
        data = self.dataset

        # This user's rows are the contiguous block between two start offsets
        block_start = data.deploy_user_start_idx[user_idx]
        block_stop = data.deploy_user_start_idx[user_idx + 1]
        this_idx = range(block_start, block_stop)

        # Build the features for that block and score them with the model
        deploy_users, deploy_feats = data.generate_deploy_batch_sp(idx_sample = this_idx, one_pass = 0)
        _, sparse_deploy_feats = self.feed_dict(deploy_users, deploy_feats)
        deploy_preds = self.model(sparse_deploy_feats)

        # Translate the user index and every item column back to external ids
        user_id = data.idx_to_user[user_idx]
        item_cols = [data.deploy_cols[pos] for pos in this_idx]
        item_list = np.array([data.idx_to_item[col] for col in item_cols])

        # Model output as a NumPy column of scores
        score_list = deploy_preds.numpy()

        # Never ask for more results than there are candidate rows
        adjusted_k = min(len(this_idx), self.args.return_k_preds)

        # Ascending argsort reversed gives best-first row positions
        top_k_indices = score_list.argsort(axis=0)[::-1][:adjusted_k]

        top_k_scores = []
        top_k_items = []
        for ranked in top_k_indices:
            best_row = ranked[0]
            top_k_scores.append(score_list[best_row][0])
            top_k_items.append(item_list[best_row])

        # Same user id repeated once per returned item
        user_list = [user_id] * len(top_k_indices)

        return user_list, top_k_items, top_k_scores

    def get_all_user_preds(self, final_user_list, final_item_list, final_score_list, this_chunk=0, num_chunks=1, user_id=None, json_out=False):
        """Collect predictions for every user in one chunk of the user range.

        Users are split into blocks of ``num_users // num_chunks + 1`` indices;
        chunk ``this_chunk`` covers the corresponding block, clipped to
        ``num_users``.

        Args:
            final_user_list: list extended in place with user ids.
            final_item_list: list extended in place with item ids.
            final_score_list: list extended in place with scores.
            this_chunk: zero-based chunk number to process.
            num_chunks: total number of chunks the users are divided into.
            user_id: accepted but not used by this method.
            json_out: forwarded to ``format_output``.

        Returns:
            Whatever ``format_output`` returns for the accumulated lists.
        """
        total_users = self.dataset.num_users

        # One more per chunk than an even split, so the chunks always cover everyone
        users_per_chunk = (total_users // num_chunks) + 1

        first_user = users_per_chunk * this_chunk
        # The last chunk may be short: never run past the final user
        stop_user = min(first_user + users_per_chunk, total_users)

        for this_user_idx in range(first_user, stop_user):
            # Three equally long lists for this user, appended to the running totals
            user_list, item_list, score_list = self.user_preds(user_idx = this_user_idx)
            final_user_list.extend(user_list)
            final_item_list.extend(item_list)
            final_score_list.extend(score_list)

        # Structure lists into either pd.Dataframe or json, depending on json_out val
        return self.format_output(final_user_list, final_item_list, final_score_list, json_out)

    # Restore the best model and generate predictions for all users and items
    def deploy_preds(self, this_chunk = 0, num_chunks = 1, user_id=None, json_out=False):
        """Restore the latest checkpoint and produce deployment predictions.

        Args:
            this_chunk: chunk number, used only when predicting for all users.
            num_chunks: number of chunks, used only when predicting for all users.
            user_id: a list of user ids to predict for.  Anything that is not a
                list, or a list containing ``0``, means "all users in the chunk".
            json_out: forwarded to ``format_output``.

        Returns:
            The ``format_output`` result.  Ids missing from
            ``self.dataset.user_to_idx`` contribute one placeholder row with an
            empty-string item and score.
        """
        # Load the most recent checkpoint known to the manager
        self.ckpt.restore(self.ckpt_manager.latest_checkpoint)

        # Accumulators shared by both code paths
        final_user_list, final_item_list, final_score_list = [], [], []

        # Only a list that does not contain the 0 sentinel selects specific users
        specific_users = type(user_id) == list and 0 not in user_id
        if not specific_users:
            return self.get_all_user_preds(final_user_list, final_item_list, final_score_list, this_chunk, num_chunks, user_id, json_out)

        for this_user_id in user_id:
            # Convert ids to indices
            this_user_idx = self.dataset.user_to_idx.get(this_user_id)

            if this_user_idx is None:
                # Unknown user: emit a single placeholder row
                user_list, item_list, score_list = [this_user_id], [""], [""]
            else:
                user_list, item_list, score_list = self.user_preds(user_idx = this_user_idx)

            final_user_list.extend(user_list)
            final_item_list.extend(item_list)
            final_score_list.extend(score_list)

        # Structure lists into either pd.Dataframe or json, depending on json_out val
        return self.format_output(final_user_list, final_item_list, final_score_list, json_out)

    # Function to train, test, and save best model checkpoint
    def train(self):

        # Setting tf random seeds must be done inside each graph
        np.random.seed(self.args.random_seed)
        tf.random.set_seed(self.args.random_seed)

        # Initialize bests, starting with 0/-1 so that first score becomes the 'best' after first iteration
        best_epoch = 0
        best_val_acc = -1
        best_test_acc = -1

        # Iterate through epoch until at most max_iters, but can auto-stop if model overfits
        for epoch in range(self.args.max_iters):

            # Iterate seed
            #ith_seed = self.args.random_seed * epoch

            # Generate training data
            pos_users, pos_feats = self.dataset.generate_pos_train_batch_sp(items_per_user = self.args.num_train_samples)
            neg_users, neg_feats = self.dataset.generate_neg_train_batch_sp(items_per_user = self.args.num_train_samples)

            # Format training data to create predictions, including sparse features
            pos_users, sparse_pos_feats = self.feed_dict(pos_users, pos_feats)
            neg_users, sparse_neg_feats = self.feed_dict(neg_users, neg_feats)

            # Train model by minimizing lost by modifying the trainable variables
            self.opt.minimize(self.loss_fn, self.var_list_fn)

            # If this epoch should be an evaluation step, calculate accuracy and check for new best model
            if epoch % self.args.eval_freq == 0:

                # Check if val_set is being used
                if self.args.val_set == 1:
 
                    # Generate list of users and features from val dataset
                    pos_users, pos_feats = self.dataset.generate_pos_val_batch_sp()
                    neg_users, neg_feats = self.dataset.generate_neg_val_batch_sp(items_per_user = self.args.num_val_samples)

                else:

                    # Generate list of users and features from test dataset
                    pos_users, pos_feats = self.dataset.generate_pos_test_batch_sp()
                    neg_users, neg_feats = self.dataset.generate_neg_test_batch_sp(items_per_user = self.args.num_val_samples)

                # Format training data to create predictions, including sparse features
                pos_users, sparse_pos_feats = self.feed_dict(pos_users, pos_feats)
                neg_users, sparse_neg_feats = self.feed_dict(neg_users, neg_feats)

                pos_preds = self.model(sparse_pos_feats)

                # Get the correct number of pos rows to compare with the neg rows
                pos_preds = tf.tile(pos_preds, tf.constant([1, self.args.num_val_samples]))

                # Reshape the above to match neg_preds shape
                pos_preds = tf.reshape(pos_preds, [-1, 1])

                # Model scores random negative samples, multiple per user
                neg_preds = self.model(sparse_neg_feats)

                # Validation Accuracy - how often the correct answer is predicted over the random one
                val_acc = np.mean(tf.dtypes.cast(((pos_preds - neg_preds) > 0), tf.float32))
                self.args.logger("Epoch: " + str(epoch) + ", Current val ACC: " + str(val_acc), 1)

                # If val_acc is a new best, save this iteration and check test_val
                if val_acc > best_val_acc:
                    best_epoch = epoch
                    best_val_acc = val_acc
                      
                    # If using a seperate test set, check the test_acc as well
                    if self.args.val_set == 1:

                        # Generate list of users and features from test dataset
                        pos_users, pos_feats = self.dataset.generate_pos_test_batch_sp()
                        neg_users, neg_feats = self.dataset.generate_neg_test_batch_sp(items_per_user = self.args.num_val_samples)

                        # Format training data to create predictions, including sparse features
                        pos_users, sparse_pos_feats = self.feed_dict(pos_users, pos_feats)
                        neg_users, sparse_neg_feats = self.feed_dict(neg_users, neg_feats)

                        pos_preds = self.model(sparse_pos_feats)

                        # Get the correct number of pos rows to compare with the neg rows
                        pos_preds = tf.tile(pos_preds, tf.constant([1, self.args.num_val_samples]))

                        # Reshape the above to match neg_preds shape
                        pos_preds = tf.reshape(pos_preds, [-1, 1])

                        # Model scores random negative samples, multiple per user
                        neg_preds = self.model(sparse_neg_feats)

                        # Validation Accuracy - how often the correct answer is predicted over the random one
                        best_test_acc = np.mean(tf.dtypes.cast(((pos_preds - neg_preds) > 0), tf.float32))
                        self.args.logger('New best epoch: ' + str(best_epoch) + ', Test ACC: ' +str(best_test_acc), 1)

                    # If option to generate deployment predictions is enabled, save checkpoints
                    if self.args.deploy_preds == 1:

                        #dynamically create the name of the current checkpoint using epoch
                        self.most_recent_checkpoint = 'checkpoint_' + str(epoch)
                        self.ckpt_manager.save()

                # Early stopping criteria
                else:

                    # Make sure not quitting too early
                    if epoch < self.args.min_epoch:
                        pass

                    # If no improvement in quit_delta rounds, print final results, create preds
                    elif epoch >= (best_epoch + self.args.quit_delta):

                        self.args.logger('Early stopping, best epoch: ' + str(best_epoch) + ', Val ACC: ' +
                                    str(best_val_acc), 1)

                        break
        # Completed all iterations through max iters, end train
        self.args.logger('Finished, best epoch: ' + str(best_epoch) + ', Val ACC: ' +
                                    str(best_val_acc) + ', Test ACC: ' +
                                    str(best_test_acc), 1)

Create MovieLens Dataset

---

In [6]:
# Instantiate the Dataset from the shared `args` object. Construction emits
# the progress log shown below (user/item counts before and after filtering).
MovieLens_dataset = Dataset(args)

First pass
	num_users = 943
	num_items = 1682
	df_shape  = (100000, 4)
Collected user counts...
Collected item counts...
User filtering done...
Item filtering done...
Second pass
	num_users = 943
	num_items = 1473
	df_shape  = (99723, 4)
Constructing datasets...
Trying to structure dataset with validation set, this isn't fully supported. 
Reading user demographics...
Reading item demographics...


Create and Train Model

---

In [7]:
# Init a recommender on the MovieLens dataset, using the same `args` object
MovieLens_PRME = Recommender(MovieLens_dataset, args)

# Train model. The log below reports the validation accuracy at each evaluated
# epoch and, whenever that is a new best, the corresponding test accuracy.
MovieLens_PRME.train()

Feature dimension = 3908x3
Epoch: 0, Current val ACC: 0.6065748
New best epoch: 0, Test ACC: 0.60774124
Epoch: 10, Current val ACC: 0.72725344
New best epoch: 10, Test ACC: 0.6902439
Epoch: 20, Current val ACC: 0.7437964
New best epoch: 20, Test ACC: 0.70265114
Epoch: 30, Current val ACC: 0.7525981
New best epoch: 30, Test ACC: 0.70795333
Epoch: 40, Current val ACC: 0.7609756
New best epoch: 40, Test ACC: 0.71686107
Epoch: 50, Current val ACC: 0.76277834
New best epoch: 50, Test ACC: 0.7271474
Epoch: 60, Current val ACC: 0.77433723
New best epoch: 60, Test ACC: 0.73775184
Epoch: 70, Current val ACC: 0.7954401
New best epoch: 70, Test ACC: 0.7562036
Epoch: 80, Current val ACC: 0.8049841
New best epoch: 80, Test ACC: 0.7681866
Epoch: 90, Current val ACC: 0.807105
New best epoch: 90, Test ACC: 0.7752916
Finished, best epoch: 90, Val ACC: 0.807105, Test ACC: 0.7752916


Save Recommendations 

---

In [10]:
# Save output preds of model as MovieLens_recommendations_df
MovieLens_recommendations_df = MovieLens_PRME.deploy_preds()

Show Recommendations Preview

---

In [11]:
# Display the recommendations frame (columns: user_id, item_id, score).
# The notebook renders a shortened view with the middle rows elided.
MovieLens_recommendations_df

Unnamed: 0,user_id,item_id,score
0,195,241,52.837547
1,195,8,44.249889
2,195,14,43.996578
3,195,217,43.925175
4,195,63,43.837166
...,...,...,...
94295,872,426,78.377869
94296,872,162,78.374184
94297,872,14,78.362679
94298,872,435,78.359436
