In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import keras
from keras import layers
from zipfile import ZipFile
from pathlib import Path

2023-11-26 11:50:08.174124: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-26 11:50:08.211758: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-26 11:50:08.211801: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-26 11:50:08.213002: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-26 11:50:08.219242: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-26 11:50:08.219763: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [None]:
# Tunable settings shared by the cells below.
TRAINING_SPLIT_RATIO: float = 0.9  # fraction of the shuffled ratings used for training
EMBEDDING_SIZE: int = 50  # embedding_size handed to RecommenderNet
TOP_K: int = 10  # how many top-scoring predictions are kept as recommendations

In [None]:
# Fetch the MovieLens "latest small" archive over HTTPS into the Keras cache.
movielens_dataset_url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Download only (extract=False). Where get_file unpacks an archive, and which
# path it returns when extract=True, has differed between Keras releases, so
# the extraction is done explicitly below to keep `movielens_dir` predictable.
movielens_zipped_file = keras.utils.get_file(
    fname="ml-latest-small.zip",
    origin=movielens_dataset_url,
    extract=False,
)

# The archive is expected to unpack into a top-level "ml-latest-small" folder
# placed next to the downloaded zip file.
movielens_dir = Path(movielens_zipped_file).parent / "ml-latest-small"

# Extract once; later runs reuse the already-extracted directory.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as archive:
        archive.extractall(path=movielens_dir.parent)

In [None]:
# Load the raw ratings table; the userId, movieId and rating columns are used below.
ratings_file = movielens_dir / "ratings.csv"
rating_df = pd.read_csv(ratings_file)

# Give every distinct raw user id a contiguous 0-based index (order of first
# appearance) so it can address a row of an embedding table, and keep the
# reverse lookup as well.
user_id = rating_df.userId.unique().tolist()
user_to_index = {user: index for index, user in enumerate(user_id)}
index_to_user = dict(enumerate(user_id))

# Same encoding for movies.
movie_id = rating_df.movieId.unique().tolist()
movie_to_index = {movie: index for index, movie in enumerate(movie_id)}
index_to_movie = dict(enumerate(movie_id))

rating_df["user"] = rating_df.userId.map(user_to_index)
# Encode with movie_to_index (raw id -> index). Mapping through index_to_movie
# would treat raw ids as if they were indices and yield NaN or unrelated raw
# ids instead of valid embedding indices.
rating_df["movie"] = rating_df.movieId.map(movie_to_index)
rating_df["rating"] = rating_df.rating.values.astype(np.float32)

num_users, num_movies = len(user_id), len(movie_id)
# Vectorised min/max; float() keeps the plain Python float type that the
# builtin min()/max() over the Series produced.
min_rating = float(rating_df.rating.min())
max_rating = float(rating_df.rating.max())

print(f"num_users:{num_users}, num_movies:{num_movies}, min_rating:{min_rating}, max_rating:{max_rating}")

In [None]:
def normalization_func(x):
    """Linearly rescale a rating so that min_rating maps to 0 and max_rating to 1."""
    return (x - min_rating) / (max_rating - min_rating)


# Shuffle all rows with a fixed seed, then separate features from targets.
df = rating_df.sample(frac=1, random_state=42)
x = df[["user", "movie"]].values
y = df.rating.apply(normalization_func).values

# The first TRAINING_SPLIT_RATIO share of the shuffled rows is used for
# training; the remainder is held out for validation.
train_split = int(TRAINING_SPLIT_RATIO * len(df))

x_train, x_val = x[:train_split], x[train_split:]
y_train, y_val = y[:train_split], y[train_split:]

In [None]:
class RecommenderNet(keras.Model):
    """Embedding-based collaborative-filtering model.

    Each user and each movie gets an embedding vector plus a scalar bias. The
    score for a (user, movie) pair is
    ``sigmoid(dot(user_vec, movie_vec) + user_bias + movie_bias)``.

    Args:
        num_users: Number of distinct user indices (rows of the user tables).
        num_movies: Number of distinct movie indices (rows of the movie tables).
        embedding_size: Output dimension of the user and movie embeddings.
        **kwargs: Forwarded to ``keras.Model`` (e.g. ``name``).
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)

        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size

        # Per-user / per-movie scalar offsets.
        self.user_bias = layers.Embedding(input_dim=num_users, output_dim=1)
        self.movie_bias = layers.Embedding(input_dim=num_movies, output_dim=1)

        self.user_embedding = layers.Embedding(
            input_dim=num_users,
            output_dim=embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_embedding = layers.Embedding(
            input_dim=num_movies,
            output_dim=embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )

    def call(self, inputs):
        """Score each (user index, movie index) row of ``inputs``.

        Args:
            inputs: 2-D tensor; column 0 is looked up in the user tables and
                column 1 in the movie tables.

        Returns:
            Tensor with one sigmoid-squashed score per input row, shape
            ``(batch, 1)``.
        """
        user_indices = inputs[:, 0]
        movie_indices = inputs[:, 1]

        user_vector = self.user_embedding(user_indices)
        user_bias = self.user_bias(user_indices)
        movie_vector = self.movie_embedding(movie_indices)
        movie_bias = self.movie_bias(movie_indices)

        # Per-sample dot product, kept as (batch, 1) so it lines up with the
        # bias terms. tf.tensordot(..., 2) would contract over the batch axis
        # as well and collapse the whole batch into one shared scalar.
        dot_user_movie = tf.reduce_sum(
            user_vector * movie_vector, axis=1, keepdims=True
        )

        x = dot_user_movie + user_bias + movie_bias
        return tf.nn.sigmoid(x)


# Instantiate the recommender with the vocabulary sizes computed from the ratings.
model = RecommenderNet(
    num_users,
    num_movies,
    EMBEDDING_SIZE,
    name="recommender-net",
)

# Targets are ratings rescaled to [0, 1] and the model ends in a sigmoid, so
# binary cross-entropy is used as the loss. The loss is taken from the same
# `keras` namespace as the model and the optimizer, so all three objects come
# from one Keras implementation rather than mixing `keras` and `tf.keras`.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
# Train on the training slice and evaluate on the held-out slice after every
# epoch; the returned History object is kept for plotting the loss curves.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=5,
    verbose=1,
)

In [None]:
# Per-epoch losses recorded by model.fit.
val_loss = history.history["val_loss"]
loss = history.history["loss"]

fig, ax = plt.subplots()

# Label each curve where it is drawn so the legend cannot drift out of sync
# with the plotting order. The second curve is the validation loss, not a
# test-set loss.
ax.plot(loss, label="train")
ax.plot(val_loss, label="validation")

ax.set(title=f"{model.name} loss", xlabel="epoch", ylabel="loss")
ax.legend(loc="upper right")
plt.show()

In [None]:
movie_df = pd.read_csv(movielens_dir / "movies.csv")

# Pick one user at random and collect the rows they rated.
# NOTE: this rebinds `user_id`, which earlier held the list of all raw user ids.
user_id = df.userId.sample(1).iloc[0]
movies_watched = df[df.userId == user_id]

# Candidates are catalogue movies this user has not rated, restricted to
# movies that have an entry in movie_to_index.
not_watched_mask = ~movie_df.movieId.isin(movies_watched.movieId.values)
candidate_movie_ids = set(movie_df.loc[not_watched_mask, "movieId"]) & set(movie_to_index)
# One single-element row per candidate so it can be stacked beside the user column.
movies_not_watched = [[movie_to_index[movie]] for movie in candidate_movie_ids]

user_encoder = user_to_index.get(user_id)

# Two-column array of [user index, movie index] pairs, one row per candidate.
inputs = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched))

# Positions (into movies_not_watched) of the TOP_K highest predictions, best first.
ratings = model.predict(inputs).flatten()
ratings = ratings.argsort()[-TOP_K:][::-1]

# Translate those positions back to raw movie ids, preserving the ranking.
recommended_movie_ids = [
    index_to_movie.get(movies_not_watched[position][0]) for position in ratings
]

print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)

movie_df_rows = movie_df[movie_df.movieId.isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

print("----" * 8)
print(f"Top {TOP_K} movie recommendations")
print("----" * 8)
# Look the ids up by label so rows come out in predicted-score order; an
# isin() filter would silently fall back to catalogue order.
recommended_movies = (
    movie_df.set_index("movieId").loc[recommended_movie_ids].reset_index()
)
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)