# Collaborative Filtering Recommender System based on Autoencoders

### Purpose
To build a working collaborative filtering recommender based on a shallow autoencoder model (EASE).

### Methodology
This notebook assumes that the model will receive a pre-processed dataset of user-item interactions. For simplicity, it uses the [small MovieLens dataset](https://surprise.readthedocs.io/en/stable/dataset.html) (ML-100K), which is loaded here through LensKit's `ML100K` dataset class.

### Author Information
Nishant Aswani (@niniack)


# Setup

## Library import
We import all the required Python libraries

In [62]:
# Data manipulation
import pandas as pd
import numpy as np
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor, als, basic, user_knn
from lenskit import topn
from scipy.sparse import csr_matrix, diags, linalg, hstack, vstack, lil_matrix
from scipy.linalg import inv
from lenskit.data import sparse_ratings

# Dataset
from lenskit.datasets import ML100K, ML1M

# Options for pandas: cap how many columns / rows are rendered when a
# DataFrame is displayed, so wide or long frames do not flood the output
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations and debugging
import plotly.graph_objs as go
from pprintpp import pprint as pp
import logging

# Tensorflow
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Autoreload extension
# Only load the extension if it is not already loaded, so that re-running
# this cell in a live kernel does not try to load it a second time
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
# Mode 2: reload imported modules before each cell is executed
%autoreload 2

### Data Exploration

The lenskit ML100K dataset provides the following: movies, ratings, users

In [25]:
# Point the LensKit ML100K dataset wrapper at the local copy of the data
# (the path is relative to the notebook's working directory)
movielens = ML100K('../ml-100k')

In [26]:
# User-item interaction table (columns: user, item, rating, timestamp)
ratings = movielens.ratings
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [27]:
# User metadata, indexed by user ID
users = movielens.users
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [28]:
# Movie metadata (title, release info and one-hot genre flags), indexed by item ID
movies = movielens.movies
movies.head()

Unnamed: 0_level_0,title,release,vidrelease,imdb,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


# Building EASE

In [63]:
class EASE(Recommender, Predictor):
    """
    Item-item collaborative filtering model in the EASE style (a shallow,
    closed-form linear autoencoder), exposed through the ``Recommender`` and
    ``Predictor`` base classes.

    ``fit`` builds a sparse user-item matrix ``X``, derives an item-item
    weight matrix ``B`` from the inverse of the L2-regularised Gram matrix
    ``X^T X + lambda * I`` and caches the dense score matrix ``X @ B``.

    Attributes set by ``fit``:
        rating_matrix_: sparse (users x items) interaction matrix.
        user_index_:    pandas Index mapping matrix rows to user IDs.
        item_index_:    pandas Index mapping matrix columns to item IDs.
        B:              dense item-item weight matrix with a zero diagonal.
        score:          dense (users x items) matrix of predicted scores.

    Note:
        ``add_user``, ``add_item`` and ``add_interactions`` only edit
        ``rating_matrix_`` and the two indexes. ``B`` and ``score`` are NOT
        recomputed, so users / items / interactions added after ``fit`` are
        not reflected in predictions until ``fit`` is called again.
    """

    # Shared logger; kept on the class so it is actually reachable
    # (binding it to a local inside __init__ would discard it immediately).
    _logger = logging.getLogger(__name__)

    def __init__(self, selector=None):
        """
        selector: candidate selector used by ``recommend`` when no explicit
                  candidates are passed; defaults to
                  ``basic.UnratedItemCandidateSelector()``.
        """
        if selector is None:
            self.selector = basic.UnratedItemCandidateSelector()
        else:
            self.selector = selector

    def __str__(self):
        return 'EASE'

    @staticmethod
    def _lookup(index, ids, kind, error=ValueError):
        """
        Translate IDs into positions within ``index``.

        index: pandas Index of known IDs (assumed to hold unique values).
        ids:   array-like of IDs to look up.
        kind:  'user' or 'item'; only used in the error message.
        error: exception class raised when at least one ID is unknown.

        Returns an integer array of positions, aligned with ``ids``.
        """
        positions = index.get_indexer(ids)
        missing = positions < 0
        if missing.any():
            unknown = np.asarray(ids)[missing].tolist()
            raise error(f"Unknown {kind} ID(s): {unknown}")
        return positions

    # Add a user to the ratings matrix
    def add_user(self, user_id):
        """
        Append an all-zero row for ``user_id`` to the ratings matrix and
        register the ID in ``user_index_``.

        Raises:
            ValueError: if ``user_id`` is already present.
        """
        # Raise instead of assert/print/exit(1): exiting would tear down the
        # interpreter, and asserts disappear when Python runs with -O.
        if user_id in self.user_index_:
            raise ValueError("User ID already exists! Not adding anything...")

        # Empty sparse row with one column per known item (no dense allocation)
        empty_row = csr_matrix((1, len(self.item_index_)))

        # Stack below the existing matrix, staying in CSR format
        self.rating_matrix_ = vstack([self.rating_matrix_, empty_row], format='csr')

        # Update user index
        self.user_index_ = self.user_index_.append(pd.Index([user_id]))

    # Add an item to the ratings matrix
    def add_item(self, item_id):
        """
        Append an all-zero column for ``item_id`` to the ratings matrix and
        register the ID in ``item_index_``.

        Raises:
            ValueError: if ``item_id`` is already present.
        """
        if item_id in self.item_index_:
            raise ValueError("Item ID already exists!")

        # Empty sparse column with one row per known user (no dense allocation)
        empty_col = csr_matrix((len(self.user_index_), 1))

        # Stack to the right of the existing matrix, staying in CSR format so
        # later arithmetic and indexing behave the same as after fit()
        self.rating_matrix_ = hstack([self.rating_matrix_, empty_col], format='csr')

        # Update item index
        self.item_index_ = self.item_index_.append(pd.Index([item_id]))

    # Add a user-item interaction for existing users and items
    def add_interactions(self, user_id, item_id, rating):
        """
        Add ratings for (user, item) pairs that already exist in the model.

        user_id, item_id, rating: equally long lists; entry ``k`` of each list
        describes one interaction.

        The given ratings are ADDED to whatever value the matrix already
        holds for that pair. If the same pair occurs more than once in a
        single call, only its last rating is used.

        Raises:
            TypeError:  if any argument is not a list.
            ValueError: if the lists differ in length, or a user / item ID
                        is unknown.
        """
        for arg_name, values in (("user_id", user_id),
                                 ("item_id", item_id),
                                 ("rating", rating)):
            if not isinstance(values, list):
                raise TypeError(f"Input {arg_name} is not a list")
        if not (len(user_id) == len(item_id) == len(rating)):
            raise ValueError("Input lists are not of the same length")

        # Resolve all IDs up front so nothing is modified if any ID is unknown
        user_pos = self._lookup(self.user_index_, user_id, "user")
        item_pos = self._lookup(self.item_index_, item_id, "item")

        # LIL is cheap to fill entry by entry; converted to CSR before adding
        delta = lil_matrix(self.rating_matrix_.shape)
        for row, col, value in zip(user_pos, item_pos, rating):
            delta[row, col] = value

        self.rating_matrix_ = (self.rating_matrix_ + delta.tocsr()).tocsr()

    def fit(self, ratings, lambda_: float = 50, implicit=True):
        """
        Train the model.

        ratings:  pandas.DataFrame of interactions, passed to
                  ``sparse_ratings``; a ``rating`` column is required only
                  when ``implicit`` is False. The frame is not modified.
        lambda_:  L2-regularisation strength added to the Gram-matrix diagonal.
        implicit: if True, every interaction counts as 1 and any ``rating``
                  column is ignored; if False, ratings are divided by the
                  largest rating before use.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``implicit`` is False and the largest rating is
                        not positive (normalisation would be undefined).
        """
        # Work on a copy: assigning into `ratings` would overwrite the
        # caller's rating column as a side effect.
        if implicit:
            interactions = ratings.assign(rating=1)
        else:
            top_rating = ratings['rating'].max()
            if not top_rating > 0:
                raise ValueError("Cannot normalise ratings: the maximum rating must be positive")
            interactions = ratings.assign(rating=ratings['rating'] / top_rating)

        # Get sparse representation in CSR format
        uir, users, items = sparse_ratings(interactions, scipy=True)

        # Store ratings
        self.rating_matrix_ = uir
        self.user_index_ = users
        self.item_index_ = items

        # Regularised item-item Gram matrix: X^T X + lambda * I
        G = uir.transpose().dot(uir).toarray().astype(np.float64, copy=False)
        G[np.diag_indices_from(G)] += lambda_

        # Closed-form weights: column j of P is divided by -P[j, j], then the
        # diagonal is zeroed so an item never predicts itself
        P = inv(G)
        B = P / (-np.diag(P))
        np.fill_diagonal(B, 0)

        self.B = B
        self.score = uir.dot(B)

        # Reduce candidate space to unseen items
        self.selector.fit(interactions)

        self._logger.debug("Fitted EASE on %d users x %d items", *uir.shape)
        return self

    def recommend(self, user_id, candidates=None, ratings=None):
        """
        Score candidate items for one user.

        user_id:    ID of the user (must be present in ``user_index_``).
        candidates: item IDs to score; when None they are obtained from
                    ``self.selector.candidates(user_id, ratings)``.
        ratings:    forwarded to the selector only.

        Returns:
            pandas.DataFrame indexed by item ID with a single ``score``
            column. The rows are NOT sorted and not truncated.

        Raises:
            ValueError: if ``user_id`` is unknown.

        NOTE(review): this signature has no ``n`` parameter; confirm against
        the LensKit ``Recommender`` interface before using this class with
        LensKit's batch utilities.
        """
        # Reduce candidate space and store candidates with item ID
        if candidates is None:
            candidates = self.selector.candidates(user_id, ratings)

        user_pos = self._lookup(self.user_index_, [user_id], "user")[0]

        # Predict scores for all candidate items
        return self.predict_for_user(user_pos, candidates)

    def predict_for_user(self, user, items):
        """
        Look up cached scores for one user.

        user:  ROW POSITION of the user in ``score`` (not the user ID);
               ``recommend`` performs the ID-to-position translation.
        items: array-like of item IDs to score.

        Returns:
            pandas.DataFrame indexed by ``items`` with a ``score`` column.

        Raises:
            IndexError: if an item ID is unknown, or a position lies outside
                        the score matrix computed by the last ``fit``.

        NOTE(review): taking a row position here looks inconsistent with an
        ID-based ``Predictor`` interface - confirm before calling this
        method through LensKit helpers.
        """
        # One vectorised lookup instead of a linear scan per item
        item_pos = self._lookup(self.item_index_, items, "item", error=IndexError)

        # Grab the score vector for the given user position
        user_scores = self.score[user]

        # Keep only the requested items
        return pd.DataFrame({'score': np.take(user_scores, item_pos)}, index=items)
        

In [64]:
%%time
# Build the model and fit it with the default lambda_ and implicit settings;
# %%time reports how long the closed-form fit takes
algo_ease = EASE()
algo_ease.fit(ratings)

CPU times: user 2.01 s, sys: 1.72 s, total: 3.73 s
Wall time: 708 ms


### Testing the Interface to Add an Item

In [65]:
# Show the number of items and the last ten item IDs before adding one
print("Number of Items: " + str(len(algo_ease.item_index_)))
print(algo_ease.item_index_[-10:])

Number of Items: 1682
Int64Index([1673, 1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682], dtype='int64', name='item')


In [66]:
%%time

# Add item (an ID that is not yet in the item index)
algo_ease.add_item(5089)

# Show the updated number of items; the new ID should appear last
print("Number of Items: " + str(len(algo_ease.item_index_)))
print(algo_ease.item_index_[-10:])
print('\n')

Number of Items: 1683
Int64Index([1674, 1675, 1676, 1677, 1678, 1679, 1680, 1681, 1682, 5089], dtype='int64')


CPU times: user 8.54 ms, sys: 263 µs, total: 8.81 ms
Wall time: 8.07 ms


### Testing the Interface to Add a User

In [67]:
# Show the number of users and the last ten user IDs before adding one
print("Number of Users: " + str(len(algo_ease.user_index_)))
print(algo_ease.user_index_[-10:])

Number of Users: 943
Int64Index([934, 935, 936, 937, 938, 939, 940, 941, 942, 943], dtype='int64', name='user')


In [68]:
%%time

# Add user (an ID that is not yet in the user index)
algo_ease.add_user(1001)

# Show the updated number of users; the new ID should appear last
print("Number of Users: " + str(len(algo_ease.user_index_)))
print(algo_ease.user_index_[-10:])
print('\n')

Number of Users: 944
Int64Index([935, 936, 937, 938, 939, 940, 941, 942, 943, 1001], dtype='int64')


CPU times: user 4.23 ms, sys: 110 µs, total: 4.34 ms
Wall time: 3.73 ms


### Testing the Interface to Add a User-Item Interaction

In [69]:
# Show interaction of user 1 with item 273
# (The user hasn't interacted with that item!)
# NOTE(review): indexing with [id - 1] assumes user and item IDs are
# contiguous and start at 1, so that ID k sits at position k - 1 - confirm
algo_ease.rating_matrix_.toarray()[1-1][273-1]

0.0

In [70]:
%%time 

# Add interaction between user 1 and item 273
# (arguments are parallel lists: user IDs, item IDs, ratings)
algo_ease.add_interactions([1], [273], [3])

# Show updated list of interactions
# (The interaction is updated!)
# NOTE(review): same position assumption as above - ID k at position k - 1
algo_ease.rating_matrix_.toarray()[1-1][273-1]

CPU times: user 17.1 ms, sys: 551 µs, total: 17.7 ms
Wall time: 23.8 ms


3.0

### Testing the Interface to Obtain Recommendations

In [353]:
# Score the candidate items for user 5; recommend() returns them unsorted,
# so rank by score here and keep the top 30
recs = algo_ease.recommend(5)
recs.sort_values(
    by=["score"],
    ascending=False
).head(30)

Unnamed: 0,score
588,0.415662
238,0.408615
82,0.342816
7,0.338387
265,0.326819
202,0.32378
739,0.320713
118,0.31965
195,0.318761
67,0.305614
