In [None]:
%matplotlib inline
import importlib

import numpy as np
import pandas as pd
import scipy
from tqdm import *
from keras_tqdm import TQDMNotebookCallback
import keras
from keras.layers import *
from keras.models import Model, Sequential
from keras import backend as K

In [None]:
# Load the movie catalogue and the ratings log.
# Columns are selected by name, so both files are expected to carry a header row.
# A multi-character separator is interpreted as a regular expression and is only
# supported by the Python parser; requesting it explicitly avoids the
# ParserWarning pandas emits when it has to fall back from the C engine.
movies = pd.read_csv("data/movies.dat", sep="::", engine="python",
                     usecols=['movie_id', 'title'])
ratings = pd.read_csv("data/ratings.dat", sep="::", engine="python",
                      usecols=['user_id', 'movie_id', 'rating', 'timestamp'])

In [None]:
# make movie ids sequential
# Replace every raw movie id with the index label of that movie's row in
# `movies`. The lookup is built once and applied in a single pass, so an id that
# has already been remapped can never be matched again by a later movie (the
# row-by-row in-place rewrite could do that whenever a raw id was smaller than
# a previously assigned label).
movie_index_by_id = pd.Series(movies.index.values, index=movies['movie_id'].values)
# If a raw id is listed more than once, the first listing wins.
movie_index_by_id = movie_index_by_id[~movie_index_by_id.index.duplicated(keep='first')]

remapped_ids = ratings['movie_id'].map(movie_index_by_id)
# Ratings whose movie is absent from `movies` keep their original id.
ratings['movie_id'] = remapped_ids.fillna(ratings['movie_id']).astype(ratings['movie_id'].dtype)
movies['movie_id'] = movies.index.values


In [None]:
#pickle.dump( x_train, open( "data/movies_x_train.pickle", "wb" ) )
#pickle.dump( x_val, open( "data/movies_x_val.pickle", "wb" ) )

#ratings = pickle.load( open( "data/movie_ratings.pickle", "rb" ) )
#movies = pickle.load( open( "data/movies.pickle", "rb" ) )
#x_train = pickle.load(open( "data/movies_x_train.pickle", "rb" ) )
#x_val = pickle.load(open( "data/movies_x_val.pickle", "rb" ) )

In [None]:
# Order the ratings by user, and chronologically within each user.
ratings = ratings.sort_values(by=['user_id', 'timestamp'])

In [None]:
def build_data_array(ratings_subset, num_movies=None):
    """Build a dense user x movie rating matrix from a ratings frame.

    Parameters
    ----------
    ratings_subset : pandas.DataFrame
        Must provide ``user_id``, ``movie_id`` and ``rating`` columns.
        ``movie_id`` is used directly as the column position in the result.
    num_movies : int, optional
        Number of columns of the result. Defaults to ``movies.shape[0]``
        (the module-level movie catalogue).

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_users, num_movies)``. Row ``k`` belongs
        to the k-th smallest user id present in ``ratings_subset``. Cells
        without a rating are 0, which is the value the masked loss in this
        notebook treats as "not rated".

    Notes
    -----
    Only the rows of ``ratings_subset`` are written, so any filtering done by
    the caller is respected. When a (user, movie) pair occurs more than once
    the last occurrence wins. Ratings outside ``[0, 10]`` are still stored,
    but their user id and movie id are printed as a diagnostic.
    """
    if num_movies is None:
        num_movies = movies.shape[0]

    user_ids = np.sort(ratings_subset.user_id.unique())
    # Zero-initialise: unrated cells must be exactly 0 for the loss mask to work.
    arr = np.zeros((user_ids.shape[0], num_movies), 'float32')
    if user_ids.shape[0] == 0:
        return arr

    suspicious = ratings_subset[(ratings_subset.rating > 10) | (ratings_subset.rating < 0)]
    for row in suspicious.itertuples():
        print(row.user_id)
        print(row.movie_id)

    # user_ids is sorted and unique, so searchsorted yields each rating's row.
    row_idx = np.searchsorted(user_ids, ratings_subset.user_id.values)
    col_idx = ratings_subset.movie_id.values.astype(np.intp)
    arr[row_idx, col_idx] = ratings_subset.rating.values

    return arr

In [None]:
def masked_mean_squared_error(y_true, y_pred):
    """Mean squared error computed only over entries where ``y_true`` is non-zero.

    Entries of ``y_true`` equal to 0 are treated as missing and contribute
    neither to the squared-error sum nor to the count it is divided by.

    Parameters
    ----------
    y_true : tensor
        Target values; 0 marks a missing entry.
    y_pred : tensor
        Predictions, same shape as ``y_true``.

    Returns
    -------
    tensor
        Per-sample loss, reduced over the last axis. A sample with no
        non-zero target yields 0 instead of NaN.
    """
    mask_value = 0
    mask_true = K.cast(K.not_equal(y_true, mask_value), K.floatx())
    masked_squared_error = K.square(mask_true * (y_true - y_pred))
    # Clamp the denominator: a fully masked row would otherwise divide 0 by 0.
    observed_count = K.maximum(K.sum(mask_true, axis=-1), 1.0)
    masked_mse = K.sum(masked_squared_error, axis=-1) / observed_count
    return masked_mse

In [None]:
# Split the ratings into training and validation sets by user id, holding out
# the lowest-numbered users until they account for about 20% of all ratings.
x = ratings
num_validation_samples = int(0.2 * x.shape[0])
print(num_validation_samples)

# Keep only movies that have at least 5 ratings.
grouped = ratings.groupby(['movie_id'])['movie_id'].count().sort_values(ascending=True)
x = x[x.movie_id.isin(grouped[grouped >= 5].index)]

# Smallest even user-id threshold such that the users below it contribute at
# least num_validation_samples ratings (counted on the unfiltered ratings).
# The ids are sorted once so each step is a binary search rather than a full
# scan of the ratings frame.
sorted_user_ids = np.sort(ratings['user_id'].values)
user_id = 0
while np.searchsorted(sorted_user_ids, user_id, side='left') < num_validation_samples:
    user_id += 2

train_ratings = x[x.user_id >= user_id]
val_ratings = x[x.user_id < user_id]

x_train = build_data_array(train_ratings)
x_val = build_data_array(val_ratings)

In [None]:
def train(model, inputs, n, refeed = False, batch_size=128, validation_data=None):
    """Fit an autoencoder-style model for ``n`` single-epoch rounds.

    Each round fits ``model`` to reconstruct ``inputs``. With ``refeed`` set,
    the round then predicts on ``inputs`` and fits the model once more using
    that prediction as both input and target.

    Parameters
    ----------
    model : object
        Must expose Keras-style ``fit`` and ``predict`` methods.
    inputs : array-like
        Training data, used as both input and target.
    n : int
        Number of rounds.
    refeed : bool, optional
        Whether to add the extra fit on the model's own output each round.
    batch_size : int, optional
        Batch size passed to every ``fit`` call (default 128).
    validation_data : tuple, optional
        Passed to every ``fit`` call. Defaults to ``(x_val, x_val)`` taken
        from the notebook's module-level ``x_val``.

    Returns
    -------
    None
    """
    if validation_data is None:
        validation_data = (x_val, x_val)

    # One keyword set shared by every fit call; progress is rendered by the
    # tqdm callback, so Keras' own logging is silenced.
    fit_kwargs = {
        'epochs': 1,
        'batch_size': batch_size,
        'validation_data': validation_data,
        'verbose': 0,
        'callbacks': [TQDMNotebookCallback(leave_inner=True)],
    }

    for i in range(n):
        announce = (i % 5 == 0)
        if announce:
            print("Training data ", i)
        model.fit(inputs, inputs, **fit_kwargs)
        if refeed:
            predicted = model.predict(inputs)
            if announce:
                print("Refeed ", i)
            model.fit(predicted, predicted, **fit_kwargs)

In [None]:
#ACTUAL MODEL TRAINING

# Dense autoencoder over a full per-user rating vector: three SELU layers,
# dropout, two more SELU layers, then an output layer without activation that
# has one unit per movie.
inputs = Input(shape=(movies.shape[0],))

encoded = inputs
for layer_units in (100, 128, 128):
    encoded = Dense(layer_units, activation='selu')(encoded)
encoded = Dropout(0.65)(encoded)
for layer_units in (128, 100):
    encoded = Dense(layer_units, activation='selu')(encoded)

decoded = Dense(movies.shape[0])(encoded)
autoencoder = Model(inputs, decoded)

sgd = keras.optimizers.SGD(lr=0.005, momentum=0.9)
autoencoder.compile(optimizer=sgd, loss=masked_mean_squared_error)

params = {'verbose': 0, 'callbacks': [TQDMNotebookCallback(leave_inner=True)]}
#history = autoencoder.fit(x_train, x_train, validation_data=(x_val, x_val), batch_size=128, epochs=100, **params)

# Two rounds of training, each followed by a re-feed fit.
train(autoencoder, x_train, 2, True)

In [None]:
# One further round on the training data, without the re-feed step.
train(autoencoder, x_train, n=1, refeed=False)

In [None]:
# Persist the trained autoencoder.
#autoencoder.save("model_refeed.hf5")
autoencoder.save(filepath="model2.hf5")
#autoencoder.evaluate(x_test, x_test)
# val_loss 0.642