In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np

## Read Dataset

In [None]:
# Load the ratings table. Earlier attempts (kept here for reference) read
# '../data/movielens-dataset/ratings.dat', and a '::'-separated variant of this
# file with explicit names userId, movieId, rating, timestamp.
df = pd.read_csv(filepath_or_buffer='../data/movielens100k/ratings.csv')
df.head(n=5)

In [None]:
# Distribution of the raw rating values.
df['rating'].plot.hist()

In [None]:
# Give every distinct userId a contiguous 0-based position (userId_idx):
# the first reset_index renumbers the unique rows, the second turns that
# numbering into a column which is then renamed.
df_user_idx = (
    df[['userId']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'userId_idx'})
)
df_user_idx.shape

In [None]:
# Peek at the userId -> userId_idx lookup table.
df_user_idx.head(n=5)

In [None]:
# Give every distinct movieId a contiguous 0-based position (movieId_idx),
# built the same way as the user lookup.
df_movie_idx = (
    df[['movieId']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'movieId_idx'})
)
df_movie_idx.shape

In [None]:
# Peek at the movieId -> movieId_idx lookup table.
df_movie_idx.head(n=5)

### Prepare Train Dataset

In [None]:
# csr_matrix is taken from the public scipy.sparse namespace: the
# scipy.sparse.csr submodule path is a deprecated private alias in recent SciPy.
from scipy.sparse import csr_matrix, load_npz, save_npz


In [None]:
# Attach the contiguous user and movie positions to every rating row.
# Inner joins keep only ratings whose ids appear in both lookup tables.
df_train = (
    pd.merge(df, df_user_idx, how='inner', on='userId')
      .merge(df_movie_idx, how='inner', on='movieId')
)

df_train.head(n=5)

In [None]:
# Rating distribution after the index merge.
df_train['rating'].plot.hist()

In [None]:
#df.groupby('userId').agg({'movieId': lambda x: list(x), 'rating': lambda x: list(x)})

In [None]:
# Sparse user x movie rating matrix: one row per userId_idx, one column per
# movieId_idx, cell value = rating. (A binarised variant, rating > 0 cast to
# float, was tried here previously and is currently disabled.)
spc_data = csr_matrix(
    (
        df_train['rating'].to_numpy(),
        (df_train['userId_idx'].to_numpy(), df_train['movieId_idx'].to_numpy()),
    ),
    shape=(df_user_idx.shape[0], df_movie_idx.shape[0]),
)

spc_data.toarray()

In [None]:
import gc

# Unbind the raw ratings frame first and only then collect: running the
# collector before the `del` cannot reclaim anything still bound to `df`.
del df
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output

## Model

In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout

In [None]:
class AutoEncRec(Model):
    '''Vanilla autoencoder.

    Two dense encoder layers, dropout on the encoded representation, then two
    dense decoder layers whose output has the same width as the input.

    Args:
        input_size: width of each input (and output) vector.
        n_dims: sequence whose first three entries are the unit counts of the
            first encoder layer, the second encoder layer and the first
            decoder layer. A tuple default is used so the default is immutable.
        dropout_rate: rate passed to the Dropout layer applied between the
            encoder and the decoder.
    '''

    def __init__(self, input_size, n_dims=(64, 32, 64), dropout_rate=0.2):
        super().__init__()
        self.input_size = input_size

        self.enc_1 = Dense(n_dims[0], input_shape=(input_size,), activation='selu')
        self.enc_2 = Dense(n_dims[1], activation='selu')
        self.dec_1 = Dense(n_dims[2], activation='selu')
        # Linear output so reconstructed values are not squashed by an activation.
        self.dec_2 = Dense(input_size, activation='linear')
        self.dropout = Dropout(dropout_rate)

    def encoder(self, x):
        '''Run ``x`` through both encoder layers and return the encoding.'''
        net = self.enc_1(x)
        net = self.enc_2(net)
        return net

    def decoder(self, x):
        '''Run an encoding ``x`` through both decoder layers.'''
        net = self.dec_1(x)
        net = self.dec_2(net)
        return net

    def call(self, inputs, training=None):
        '''Encode, apply dropout, decode.

        ``training`` is forwarded explicitly to the Dropout layer so the
        train/inference switch is visible here; ``None`` leaves the decision
        to Keras, as before.
        '''
        encoded = self.encoder(inputs)
        encoded = self.dropout(encoded, training=training)
        return self.decoder(encoded)

def masked_mse(mask_value):
    """Build a loss that averages squared error over unmasked entries only.

    Entries of ``y_true`` equal to ``mask_value`` are excluded from both the
    squared-error sum and the count used as the denominator. The returned
    function reduces over the last axis.
    """
    def f(y_true, y_pred):
        # 1.0 where the target is a real value, 0.0 where it equals mask_value.
        observed = K.cast(K.not_equal(y_true, mask_value), K.floatx())
        residual = observed * (y_true - y_pred)
        squared_error_sum = K.sum(K.square(residual), axis=-1)
        observed_count = K.sum(observed, axis=-1)
        # Clamp the count to at least 1: a fully masked row has a zero
        # numerator too, so this only avoids 0/0 and leaves other rows unchanged.
        return squared_error_sum / K.maximum(observed_count, 1)

    f.__name__ = 'Masked MSE (mask_value={})'.format(mask_value)
    return f

In [None]:
# One input/output unit per movie column of the rating matrix.
input_size = df_movie_idx.shape[0]

model = AutoEncRec(input_size, [128, 64, 128])
# mask_value 0.0: entries equal to 0.0 are excluded from the loss.
model.compile(loss=masked_mse(0.0), optimizer='adam')

In [None]:
# Materialise the sparse user x movie rating matrix as a dense array so it can
# be handed to the model directly.
X_train    = spc_data.toarray()

# Disabled alternative: feed the model through a batched, shuffled tf.data pipeline.
#train_data = tf.data.Dataset.from_tensor_slices(X_train).batch(128).shuffle(buffer_size = 1024) 

In [None]:
# Autoencoder training: the rating matrix is both the input and the target.
# 20% of the rows are held out for validation.
hist = model.fit(
    x=X_train,
    y=X_train,
    batch_size=10,
    epochs=20,
    validation_split=0.2,
)

In [None]:
def plot_hist(hist):
    """Plot the per-epoch loss curves stored in a training history.

    Args:
        hist: object with a ``history`` mapping that contains a ``'loss'``
            series and, optionally, a ``'val_loss'`` series.

    The curves are drawn before the legend is created: a legend built on an
    empty axes has no artists to describe and would stay empty.
    """
    _, ax = plt.subplots()  # create figure & 1 axis

    ax.plot(hist.history['loss'], label='train')
    # The validation curve only exists when the fit used validation data.
    if 'val_loss' in hist.history:
        ax.plot(hist.history['val_loss'], label='test')

    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(loc='upper left')

# Loss curves for the fit above.
plot_hist(hist=hist)

## Recommender

In [None]:
# Movie metadata keyed by matrix column (movieId_idx). The merge has no explicit
# key, so it joins on the columns the two frames share (presumably movieId; confirm).
# A previous source was '../data/ml-latest-small/movies.csv'.
df_movie = (
    pd.read_csv('../data/movielens100k/movies.csv')
      .merge(df_movie_idx)
      .set_index('movieId_idx')
)
df_movie.head(n=5)

In [None]:
# movieIds of the films the example user has already watched ...
id_view = [
    1, 3114, 87222, 84944, 260, 1196,
    1210, 2628, 79006, 2116, 7153, 5952,
]
# ... translated to their matrix column positions (flat 1-D array).
idx_view = df_movie_idx.set_index("movieId").loc[id_view].to_numpy().ravel()

In [None]:
# Title search helper for finding movieIds.
df_movie.loc[df_movie['title'].str.contains("Lord of the Rings")]

In [None]:
# Metadata rows for the watched movies.
df_movie.loc[df_movie['movieId'].isin(id_view)]

In [None]:
# Single-row profile for the example user: 1.0 in every watched movie's column,
# 0.0 elsewhere.
# NOTE(review): the model is fit on raw rating values, while this profile marks
# watched movies with 1.0. Confirm this is the intended input scale.
data = np.ones_like(idx_view, dtype=float)
row  = np.zeros_like(idx_view, dtype=float)
col  = idx_view

data_input = csr_matrix(
    (data, (row, col)),
    shape=(1, input_size),
).toarray()

In [None]:
# Reconstructed scores for the single input row (one value per movie column).
data_pred = model.predict(x=data_input)[0]

In [None]:
# Candidates are the movies that have metadata in df_movie and were not already
# watched. The labels come from df_movie.index (movieId_idx) rather than
# range(len(df_movie)), so the later df_movie.loc[...] lookup stays valid even
# if the metadata merge dropped some movies.
idx_pred = df_movie.index.difference(idx_view).tolist()

# Score every movie column for the example user, then rank the candidates
# best-first. data_pred ends up as {movieId_idx: score} in descending score order.
scores = model.predict(data_input)[0]
data_pred = dict(
    sorted(
        zip(idx_pred, scores[idx_pred].astype(float).tolist()),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Show only the head of the ranking instead of dumping every movie.
list(data_pred.items())[:10]

In [None]:
# Ten highest-scored unseen movies (data_pred keys are already in ranked order).
df_movie.loc[list(data_pred)].head(10)

In [None]:
# First ten of the movies the profile was built from, for comparison.
df_movie.loc[idx_view].head(10)

In [None]:
# Stray cell: the bare name `s` was undefined and raised NameError on Run All.

In [None]:
# Stray cell: the bare name `d` was undefined and raised NameError on Run All.