In [21]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os
import random
print('import complete')

# Seed TensorFlow, NumPy and the stdlib RNG so repeated runs draw the same random numbers.
tf.set_random_seed(1); np.random.seed(1); random.seed(1)

# Dataset folder. Set the MOVIELENS_DIR environment variable to point somewhere else;
# when it is unset the original machine-specific location is used, so existing runs are unaffected.
inputDir = os.environ.get('MOVIELENS_DIR', "C:\\Users\\u1189580\\Downloads\\movielens-20m-dataset")
ratingsPath = os.path.join(inputDir, 'rating.csv')
ratingsDF = pd.read_csv(ratingsPath, usecols=['userId', 'movieId', 'rating'])
moviesDf = pd.read_csv(os.path.join(inputDir, 'movie.csv'), usecols=['movieId', 'Title', 'Year'])

# One row per rating, enriched with the movie's metadata and ordered by user.
df = ratingsDF.merge(moviesDf, on='movieId').sort_values(by='userId')

# Regression target: the rating centred on the overall mean. The mean is computed from the
# loaded data instead of being pasted in as a constant, so it stays correct if the input changes.
globalMeanRating = df['rating'].mean()
df['y'] = df['rating'].astype(float) - globalMeanRating
print('DF compiled')

nMovies = df.movieId.nunique()
nUsers = df.userId.nunique()
print("{users:,} distinct users rated {movies:,} distinct movies (total ratings = {total:,})".format(
    users=nUsers, movies=nMovies, total=len(df)))


import complete
DF compiled
138,493 distinct users rated 26,744 distinct movies (total ratings = 20,000,263)


In [None]:
# Hyper-parameters: widths of the hidden Dense layers and the size of each embedding vector.
hidden_units = (32,4)
movie_embedding_size = 8
user_embedding_size = 8

# The embedding tables are indexed by the raw IDs, so each needs max(ID) + 1 rows.
n_user_rows = df.userId.max() + 1
n_movie_rows = df.movieId.max() + 1

# Every instance consists of two inputs: a single user ID and a single movie ID.
user_id_input = keras.Input(shape=(1,), name='user_id')
movie_id_input = keras.Input(shape=(1,), name='movie_id')

user_embedding_layer = keras.layers.Embedding(
    input_dim=n_user_rows,
    output_dim=user_embedding_size,
    input_length=1,
    name='user_embeding',
)
user_embedded = user_embedding_layer(user_id_input)

movie_embedding_layer = keras.layers.Embedding(
    input_dim=n_movie_rows,
    output_dim=movie_embedding_size,
    input_length=1,
    name='movie_embeding',
)
movie_embedded = movie_embedding_layer(movie_id_input)

# Join the two embeddings, then flatten away the length-1 sequence axis.
concatenated = keras.layers.Concatenate()([user_embedded, movie_embedded])
out = keras.layers.Flatten()(concatenated)

# Stack one ReLU Dense layer per entry in hidden_units.
for n_hidden in hidden_units:
    hidden_layer = keras.layers.Dense(n_hidden, activation='relu')
    out = hidden_layer(out)

# Single linear unit: the predicted (mean-centred) rating.
prediction_layer = keras.layers.Dense(1, activation='linear', name='prediction')
out = prediction_layer(out)

model = keras.Model(inputs=[user_id_input, movie_id_input], outputs=out)
model.summary(line_length=88)

# Adam with learning rate 0.005; optimise mean squared error and also track mean absolute error.
model.compile(tf.train.AdamOptimizer(0.005),loss='MSE',metrics=['MAE'])

# df is sorted by userId, and Keras takes the validation_split fraction from the *end* of the
# arrays before any shuffling. Fitting on df directly would therefore validate almost only on
# users that never appear in the training half, whose embeddings are never trained.
# Shuffle the rows (with a fixed seed) so both halves contain a mix of users.
shuffled_df = df[['userId', 'movieId', 'y']].sample(frac=1, random_state=1)

history = model.fit(
    [shuffled_df.userId, shuffled_df.movieId],
    shuffled_df.y,
    batch_size=5000,
    epochs=20,
    verbose=0,
    validation_split=0.5,
);

________________________________________________________________________________________
Layer (type)                 Output Shape       Param #   Connected to                  
user_id (InputLayer)         (None, 1)          0                                       
________________________________________________________________________________________
movie_id (InputLayer)        (None, 1)          0                                       
________________________________________________________________________________________
user_embeding (Embedding)    (None, 1, 8)       1107952   user_id[0][0]                 
________________________________________________________________________________________
movie_embeding (Embedding)   (None, 1, 8)       1050104   movie_id[0][0]                
________________________________________________________________________________________
concatenate_4 (Concatenate)  (None, 1, 16)      0         user_embeding[0][0]           
                     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 5% of the rating rows for validation; the fixed random_state keeps the split repeatable.
df_train, df_val = train_test_split(
    df,
    test_size=0.05,
    random_state=1,
)

def get_metrics(y_true, y_pred):
    """Score predictions against the true values.

    Returns a ``(mean_absolute_error, mean_squared_error)`` tuple, in that order,
    computed with ``sklearn.metrics``.
    """
    abs_error = metrics.mean_absolute_error(y_true, y_pred)
    sq_error = metrics.mean_squared_error(y_true, y_pred)
    return abs_error, sq_error

# Baseline 1: predict the training-set mean rating for every validation row.
mean_rating = df_train['rating'].mean()
avg_msg = "Average rating in training set is {:.2f} stars".format(mean_rating)
print(avg_msg)

y_true = df_val['rating'].values
# Constant prediction vector with one entry per validation rating.
always_mean = np.full(y_true.shape, mean_rating)

mae, mse = get_metrics(y_true, always_mean)
global_msg = "Always predicting global average rating results in Mean Absolute Error={:.2f}, Mean Squared Error={:.2f}"
print(global_msg.format(mae, mse))

# Baseline 2: predict each movie's mean training rating.
# The movie metadata frame is named `moviesDf` where it is loaded; the previous reference to
# `movies_df` was an undefined name and raised NameError on a fresh kernel.
movies = moviesDf.copy().set_index('movieId')
mean_per_movie = df_train.groupby('movieId')['rating'].mean()
# Both assignments align on the movieId index; movies with no training ratings get NaN.
movies['mean_rating'] = mean_per_movie
ratings_per_movie = df_train.groupby('movieId').size()
movies['n_ratings'] = ratings_per_movie
# Movies in the validation set that are missing from the training set have no per-movie mean,
# so fall back to the global mean rating for them.
# The joined column is called 'ratingmean' because df_val already has a 'rating' column
# and rsuffix='mean' is appended to the clashing name from the right-hand side.
y_movie_mean = df_val.join(mean_per_movie, on='movieId', rsuffix='mean')['ratingmean'].fillna(mean_rating).values

mae, mse = get_metrics(y_true, y_movie_mean)
print("Predicting mean per movie results in Mean Absolute Error={:.2f}, Mean Squared Error={:.2f}".format(mae, mse))

# Learning curves: validation and training MAE per epoch, with a dashed baseline for reference.
fig, ax = plt.subplots(figsize=(15, 6))

mae_curves = (
    ('val_mean_absolute_error', 'Validation MAE'),
    ('mean_absolute_error', 'Training MAE'),
)
for history_key, curve_label in mae_curves:
    ax.plot(history.epoch, history.history[history_key], label=curve_label)

ax.set(xlabel='Epoch', ylabel='Mean Absolute Error')
last_epoch = history.epoch[-1]
ax.set_xlim(left=0, right=last_epoch)

# NOTE(review): hard-coded reference value; presumably one of the baseline MAEs printed
# by the baseline cell. Confirm it still matches this dataset.
baseline_mae = 0.73
ax.axhline(baseline_mae, color='#002255', alpha=.5, ls='--', label='Baseline')
ax.grid()
fig.legend();