# Collaborative Filtering Recommender System based on "Deep" Autoencoders

### Purpose
To test a vanilla "deep" autoencoder model based off of collaborative filtering using Keras and Tensorflow

### Methodology
This notebook assumes that the model will receive a pre-processed dataset of user-item interactions. For simplification purposes, it uses the [small movielens dataset](https://surprise.readthedocs.io/en/stable/dataset.html)

### Author Information
Nishant Aswani (@niniack)


# Setup

## Library import
We import all the required Python libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor, als, basic, user_knn
from lenskit import topn
from scipy import sparse
from lenskit.data import sparse_ratings

# Dataset
from lenskit.datasets import ML100K, ML1M

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations and debugging
import plotly.graph_objs as go
from pprintpp import pprint as pp
import logging

# Tensorflow
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

### Downloading ML100K Dataset

In [2]:
# %%!
# wget -q -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip

## This unzip method may not work!
# unzip -f ml-100k.zip

### Data Exploration

The lenskit ML100K dataset provides the following: movies, ratings, users

In [3]:
movielens = ML100K('../ml-100k')

In [4]:
ratings = movielens.ratings
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [5]:
len(ratings)

100000

In [6]:
users = movielens.users
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [7]:
len(users)

943

In [8]:
movies = movielens.movies
movies.head()

Unnamed: 0_level_0,title,release,vidrelease,imdb,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [9]:
len(movies)

1682

# Building the Autoencoder


In [31]:
class AutoEncoder():
    
    def build(self, matrix):
                
        encoder_input = Input(shape=(matrix.shape[1]), name='Input', dtype='int8')
        encoder_latent_one = Dense(64, activation='selu', name='Encoder_Latent_One')(encoder_input)
#         encoder_latent_two = Dense(256, activation='selu', name='Encoder_Latent_Two')(encoder_latent_one)
        encoder_latent_three = Dense(32, activation='selu', name='Encoder_Latent_Three')(encoder_latent_one)

        dropout = Dropout(0.35, name='Dropout')(encoder_latent_three)
        
#         decoder_latent_one = Dense(128, activation='selu', name='Decoder_Latent_One')(dropout)
        decoder_latent_two = Dense(64, activation='selu', name='Decoder_Latent_Two')(dropout)
        decoder_output = Dense(matrix.shape[1], activation='softmax', name='Output')(decoder_latent_two)

        model = Model(encoder_input, decoder_output)
        return model
    

In [32]:
uir, _, _= sparse_ratings(ratings, scipy=True)

In [33]:
def rmse(y_true, y_pred):
    mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
    masked_squared_error = mask_true * K.square((y_true - y_pred))
    masked_mse = K.sum(masked_squared_error, axis=-1) / K.sum(mask_true, axis=-1)
    return K.sqrt(masked_mse)

In [34]:
model = AutoEncoder()
model = model.build(uir)
model.compile(optimizer = Adam(lr=0.001), loss=rmse, metrics=['accuracy'])
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 1682)]            0         
_________________________________________________________________
Encoder_Latent_One (Dense)   (None, 64)                107712    
_________________________________________________________________
Encoder_Latent_Three (Dense) (None, 32)                2080      
_________________________________________________________________
Dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
Decoder_Latent_Two (Dense)   (None, 64)                2112      
_________________________________________________________________
Output (Dense)               (None, 1682)              109330    
Total params: 221,234
Trainable params: 221,234
Non-trainable params: 0
_____________________________________________________

In [35]:
X = uir.toarray()
y = uir.toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [36]:
hist = model.fit(x=X_train, y=y_train,
                  epochs=200,
                  batch_size=32,
                  validation_split=0.2)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200


Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200


Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200


Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [37]:
def plot_hist(hist):

    x = np.arange(10)

    fig = go.Figure(data=go.Scatter(y=hist.history['loss']))
    fig.update_layout(title="model loss", 
                    xaxis_title="epoch",
                    yaxis_title="loss",
                    )
    fig.show()

plot_hist(hist)

In [38]:
test_loss = model.evaluate(X_test, X_test)
test_loss



[3.743685007095337, 0.08421052992343903]

In [39]:
new_matrix = model.predict(X_train)

In [40]:
print(X_train[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]]


In [41]:
print(new_matrix[:5])

[[3.4645340e-12 1.2937381e-19 1.5261524e-20 ... 2.4920434e-19
  3.9274649e-19 2.6312877e-20]
 [5.9108607e-10 2.3786411e-12 1.1261865e-12 ... 1.3120129e-12
  1.5463535e-12 1.2609612e-12]
 [2.8969255e-14 7.2423726e-16 5.5400555e-16 ... 2.9711981e-16
  4.7717474e-16 3.8383332e-16]
 [1.9799973e-12 2.3488965e-17 2.7948156e-17 ... 1.7275401e-17
  8.6469772e-18 3.9192802e-18]
 [1.1503034e-15 4.4682132e-25 1.5302045e-26 ... 5.9651645e-25
  7.6550702e-25 3.1352519e-26]]
