# Collaborative Filtering Recommender System based on Autoencoders

### Purpose
Build a working autoencoder-based collaborative filtering model.

### Methodology
This notebook assumes that the model receives a pre-processed dataset of user-item interactions. For simplicity, it uses the [small MovieLens dataset](https://surprise.readthedocs.io/en/stable/dataset.html) (ML-100K).

### Author Information
Nishant Aswani (@niniack)


# Setup

## Library import
We import all the required Python libraries

In [154]:
# Data manipulation
import pandas as pd
import numpy as np
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor, als, basic, user_knn
from lenskit import topn
from scipy.sparse import csr_matrix, diags, linalg
from scipy.linalg import inv
from lenskit.data import sparse_ratings

# Dataset
from lenskit.datasets import ML100K, ML1M

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations and debugging
import plotly.graph_objs as go
from pprintpp import pprint as pp
import logging

# Tensorflow
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

### Data Exploration

The lenskit ML100K dataset provides the following: movies, ratings, users

In [155]:
# Point lenskit's ML100K loader at the dataset directory; the path is relative
# to the notebook's working directory.
movielens = ML100K('../ml-100k')

In [156]:
# Ratings table: one row per (user, item) interaction with its rating and timestamp.
ratings = movielens.ratings
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [40]:
# User metadata, indexed by user id: age, gender, occupation and zip code.
users = movielens.users
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [41]:
# Movie metadata, indexed by item id: title, release dates, IMDb URL and one
# 0/1 indicator column per genre.
movies = movielens.movies
movies.head()

Unnamed: 0_level_0,title,release,vidrelease,imdb,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


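# Building EASE

EASE (Embarrassingly Shallow Autoencoders) is a linear autoencoder whose item-item weight matrix has a closed-form solution: with $P = (X^\top X + \lambda I)^{-1}$, the weights are $B_{ij} = -P_{ij} / P_{jj}$, with the diagonal of $B$ forced to zero so that an item cannot predict itself.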

In [350]:
class EASE(Recommender, Predictor):
    """
    Closed-form item-item model (EASE) exposed through the lenskit
    Recommender / Predictor interfaces.

    Attributes set by ``fit``:
        rating_matrix_: sparse user-item matrix returned by ``sparse_ratings``
        user_index_: user ids, aligned with the rows of ``rating_matrix_``
        item_index_: item ids, aligned with the columns of ``rating_matrix_``
        B: dense item-item weight matrix with a zero diagonal
        score: dense user-by-item score matrix, ``rating_matrix_ . B``
    """

    def __init__(self, selector = None):
        """
        selector: candidate selector used by ``recommend`` when no candidates
                  are passed; defaults to basic.UnratedItemCandidateSelector()
        """
        # Set selector
        if selector is None:
            self.selector = basic.UnratedItemCandidateSelector()
        else:
            self.selector = selector

        # Enable logging (kept on the instance so the other methods can use it)
        self._logger = logging.getLogger(__name__)

    def fit(self, ratings, lambda_: float = 50, implicit=True):

        """
        Fit the item-item weights and pre-compute the scores of every user.

        ratings: pandas.DataFrame with columns user, item and (rating);
                 it is not modified
        lambda_: l2-regularization term added to the diagonal of the item-item
                 Gram matrix
        implicit: if True, ratings are ignored and taken as 1, else the ratings
                  divided by the largest rating are used (this needs a
                  ``rating`` column)

        Returns the fitted object itself so that calls can be chained.
        """

        # Build a derived frame: writing into `ratings` directly would overwrite
        # the caller's rating column.
        if implicit:
            matrix = ratings.assign(rating=1)
        else:
            matrix = ratings.assign(rating=ratings['rating'] / ratings['rating'].max())

        # Get sparse representation in CSR format
        uir, users, items = sparse_ratings(matrix, scipy=True)

        # Store ratings
        self.rating_matrix_ = uir
        self.user_index_ = users
        self.item_index_ = items

        # Regularized item-item Gram matrix, X^T X + lambda * I, as a dense float array
        G = uir.transpose().dot(uir).toarray().astype(np.float64, copy=False)
        G[np.diag_indices_from(G)] += lambda_

        # Closed form: B[i, j] = -P[i, j] / P[j, j], with a zero diagonal
        P = inv(G)
        B = P / (-np.diag(P))
        np.fill_diagonal(B, 0)

        self.B = B
        self.score = uir.dot(B)
        self._logger.debug('fitted EASE on %d users and %d items', len(users), len(items))

        # Reduce candidate space to unseen items
        self.selector.fit(ratings)

        return self

    def recommend(self, user_id, candidates=None, ratings=None):
        """
        Score the candidate items of one user.

        user_id: id of a user that was present in the frame given to ``fit``
        candidates: item ids to score; if None they are taken from the selector
        ratings: forwarded to the selector when candidates is None

        Returns a pandas.DataFrame indexed by item id with a ``score`` column.
        The rows are neither sorted nor truncated.

        Raises ValueError if ``user_id`` is not in the fitted user index.
        """

        # Reduce candidate space and store candidates with item ID
        if candidates is None:
            candidates = self.selector.candidates(user_id, ratings)

        # Translate the user id into its row position in the score matrix
        matches = np.flatnonzero(np.asarray(self.user_index_ == user_id))
        if matches.size != 1:
            raise ValueError(
                f"expected exactly one entry for user {user_id!r} in the fitted "
                f"user index, found {matches.size}"
            )

        # Predict scores for all candidate items
        return self.predict_for_user(matches[0], candidates)

    def predict_for_user(self, user, items):
        """
        Look up pre-computed scores for one user.

        user: row position of the user in ``score`` (an index into
              ``user_index_``, not the user id itself)
        items: item ids to score; may be empty

        Returns a pandas.DataFrame indexed by ``items`` with a ``score`` column.
        Items that are not in ``item_index_`` get NaN.
        """

        # Resolve every item id to its column position in a single pass;
        # ids that are not in the index come back as -1
        positions = pd.Index(self.item_index_).get_indexer(items)

        # Grab the score vector for given user index
        all_scores = np.asarray(self.score[user]).ravel()

        # Fill in the scores of the known items, leave the others as NaN
        known = positions >= 0
        item_scores = np.full(len(positions), np.nan)
        item_scores[known] = all_scores[positions[known]]

        results = {'score': item_scores}
        return pd.DataFrame(results, index=items)
        

In [351]:
%%time
# Fit EASE on the ratings frame with the default lambda_ and implicit settings.
algo_ease = EASE()
algo_ease.fit(ratings)

CPU times: user 1.94 s, sys: 1.4 s, total: 3.34 s
Wall time: 638 ms


In [352]:
# Size of the item index that fit() built from the ratings.
print(algo_ease.item_index_.shape)

(1682,)


In [353]:
# Score the candidate items for user 5, then show the 30 highest-scoring ones.
recs = algo_ease.recommend(5)
recs.sort_values(
    by=["score"],
    ascending=False
).head(30)

Unnamed: 0,score
588,0.415662
238,0.408615
82,0.342816
7,0.338387
265,0.326819
202,0.32378
739,0.320713
118,0.31965
195,0.318761
67,0.305614
