<a href="https://colab.research.google.com/github/nomomon/anime-recommendations/blob/main/User_Anime_Rating_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Kaggle API
username = str(input("username"))
key = str(input("key"))

f = open("kaggle.json", "w")
f.write('{"username":"'+username+'","key":"'+key+'"}')
f.close()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d hernan4444/anime-recommendation-database-2020

import os, re
from zipfile import ZipFile

# Unpack every member of the downloaded archive into the current working directory.
archive = ZipFile('/content/anime-recommendation-database-2020.zip', 'r')
with archive:
    archive.extractall()

def purge(dir, pattern):
    """Delete the files directly inside ``dir`` whose name contains ``pattern``.

    Args:
        dir: Path of the directory to clean. (The name shadows the builtin
            but is kept so existing keyword callers continue to work.)
        pattern: Plain substring (not a glob or regex) looked for in each
            entry name.

    Sub-directories are never removed, even when their name matches.
    """
    for name in os.listdir(dir):
        if pattern not in name:
            continue
        path = os.path.join(dir, name)
        # os.remove raises on a real directory, which would abort the whole
        # cleanup; symlinks are still removed like ordinary files.
        if os.path.isdir(path) and not os.path.islink(path):
            continue
        os.remove(path)

# Remove the downloaded archive and the credentials file from the working directory.
for leftover in (".zip", ".json"):
    purge("/content/", leftover)

Downloading anime-recommendation-database-2020.zip to /content
 99% 657M/661M [00:07<00:00, 96.2MB/s]
100% 661M/661M [00:07<00:00, 88.5MB/s]


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Data Preprocessing

In [None]:
# Load the anime table and the per-user list entries from the extracted CSV files.
anime, ratings = map(pd.read_csv, ("/content/anime.csv", "/content/animelist.csv"))

In [None]:
# Preview the first five rows of the ratings table.
ratings.head(5)

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4
3,0,4898,0,1,1
4,0,21,10,1,0


In [None]:
# Count how many rows carry each distinct rating value.
ratings["rating"].value_counts()

0     46827035
8     15422150
7     14244633
9     10235934
6      7543377
10     7144392
5      4029645
4      1845854
3       905700
2       545339
1       480688
Name: rating, dtype: int64

In the dataset, an anime that a user has not rated is recorded with a rating of zero. Let's remove those zeros, as they would only introduce noise later on. We might as well drop the `watching_status` and `watched_episodes` columns too: they are useful pieces of data, but we won't use them in this tutorial.

In [None]:
# Keep only rows with a rating above zero, then discard the two columns that are not used.
ratings = ratings.loc[ratings["rating"].gt(0)].drop(
    columns=["watching_status", "watched_episodes"]
)

In [None]:
# Number of distinct user and anime ids left in the filtered ratings.
num_users, num_anime = (ratings[column].nunique() for column in ("user_id", "anime_id"))

print(f"There are a total of {num_users} users found.")
print(f"There are a total of {num_anime} anime found.")

# Largest raw id of each kind, for comparison with the distinct counts above.
for label, column in (("Max user id:", "user_id"), ("Max anime id:", "anime_id")):
    print(label, ratings[column].max())

There are a total of 313670 users found.
There are a total of 17172 anime found.
Max user id: 353404
Max anime id: 48456


If we take a look at the anime and user ids, we can notice that some values are missing from the id ranges. Let's build two pairs of lookup tables for easy conversion between dataset ids and embedding ids.

In [None]:
# Position i of each list holds the raw dataset id assigned to embedding index i
# (raw ids in ascending order).
embId2user = list(np.sort(ratings["user_id"].unique()))
embId2anime = list(np.sort(ratings["anime_id"].unique()))

In [None]:
# Inverse lookups: raw dataset id -> position in the corresponding embId2* list.
user2embId = dict(zip(embId2user, range(len(embId2user))))
anime2embId = dict(zip(embId2anime, range(len(embId2anime))))

Now, let's make the dataset. It will consist of tuples of `(user_embId, anime_embId, rating)`. We'll make 3 sets: `train`, `test` and `val`.

In [None]:
np.random.seed(seed = 42)

def makeDataSet(df, split=0.95, user_map=None, anime_map=None):
    """Shuffle a ratings table and split it into train, test and validation arrays.

    The first two columns of ``df`` are treated as raw user and anime ids and
    the third column as the rating. Raw ids are replaced by embedding indices.
    Rows are shuffled with NumPy's global random state, so call
    ``np.random.seed`` beforehand for a reproducible split.

    Args:
        df: DataFrame whose first three columns are user id, anime id, rating.
        split: Fraction of the rows assigned to the training set.
        user_map: Mapping from raw user id to embedding index. Defaults to
            the module-level ``user2embId``.
        anime_map: Mapping from raw anime id to embedding index. Defaults to
            the module-level ``anime2embId``.

    Returns:
        ``(x_train, y_train, x_test, y_test, x_val, y_val)``. Every ``x`` has
        two columns (user index, anime index) and every ``y`` holds the
        ratings. The test set receives one tenth of the held-out rows and the
        validation set the remainder.

    Raises:
        KeyError: If an id present in ``df`` is missing from its mapping.
    """
    if user_map is None:
        user_map = user2embId
    if anime_map is None:
        anime_map = anime2embId

    n = np.random.permutation(df.to_numpy())

    x = n[:, :2]
    y = n[:, 2]

    # Translate each distinct id once and broadcast the result back to every
    # row; a per-row Python loop is prohibitively slow on tens of millions of
    # ratings.
    for column, mapping in enumerate((user_map, anime_map)):
        raw_ids, inverse = np.unique(x[:, column], return_inverse=True)
        codes = np.array([mapping[raw_id] for raw_id in raw_ids])
        x[:, column] = codes[inverse]

    s1 = int(split * n.shape[0])
    s2 = s1 + int((1 - split) * n.shape[0] / 10)

    return (x[:s1], y[:s1], x[s1:s2], y[s1:s2], x[s2:], y[s2:])

In [None]:
# Unpacking order follows makeDataSet's return value: train, then test, then validation.
x_train, y_train, x_test, y_test, x_val, y_val = makeDataSet(ratings)

# Models

In [None]:
# Report how many GPU devices TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

Num GPUs Available:  1


## Matrix Factorization

In [None]:
class MatrixFactorizationModel(tf.keras.Model):
    """Rating predictor built from user and item embedding tables.

    Each id is looked up in an ``embedding_dim``-wide table and in a width-1
    bias table, and the bias is broadcast-added onto the embedding vector.
    The prediction is the dot product of the two resulting vectors plus one
    trainable scalar offset.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        """Create the lookup tables.

        Args:
            num_users: Number of rows in the user tables.
            num_items: Number of rows in the item tables.
            embedding_dim: Width of the user and item embedding vectors.
        """
        super().__init__()

        self.embedding_dim = embedding_dim

        # Latent vectors.
        self.user_embeddings = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embeddings = tf.keras.layers.Embedding(num_items, embedding_dim)

        # One scalar per user / item.
        self.user_biases = tf.keras.layers.Embedding(num_users, 1)
        self.item_biases = tf.keras.layers.Embedding(num_items, 1)

        # Single offset shared by every prediction.
        self.bias = tf.Variable(tf.zeros([1]))

        self.dropout = tf.keras.layers.Dropout(.5)

    def call(self, inputs, training=False):
        """Predict ratings for a batch of ``(user index, item index)`` rows.

        Args:
            inputs: 2-D batch; column 0 is read as the user index and
                column 1 as the item index.
            training: When truthy, dropout is applied to both vectors.

        Returns:
            The dot product of the user and item vectors plus the shared bias.
        """
        users = inputs[:, 0]
        items = inputs[:, 1]

        user_vec = self.user_embeddings(users)
        user_vec = user_vec + self.user_biases(users)
        item_vec = self.item_embeddings(items)
        item_vec = item_vec + self.item_biases(items)

        if training:
            user_vec = self.dropout(user_vec, training=training)
            item_vec = self.dropout(item_vec, training=training)

        flat_shape = [-1, self.embedding_dim]
        user_vec = tf.reshape(user_vec, flat_shape)
        item_vec = tf.reshape(item_vec, flat_shape)

        return tf.keras.layers.Dot(axes=1)([user_vec, item_vec]) + self.bias

In [None]:
# Matrix-factorization model with 64-dimensional embeddings.
mf_model = MatrixFactorizationModel(
    num_users=num_users, num_items=num_anime, embedding_dim=64
)

# Mean squared error is the training loss; RMSE is reported as a metric.
# run_eagerly=True makes Keras run the model eagerly instead of as a compiled graph.
mf_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")],
    run_eagerly=True,
)

In [None]:
# Stop once the training loss has gone three epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Every epoch is capped at 1000 batches of 64 samples.
# Validation on (x_test, y_test) is currently disabled.
history = mf_model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    steps_per_epoch=1000,
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [None]:
# mf_model.save_weights(
#     "/content/drive/MyDrive/AnimeRecommendations/MatrixFactorizationModel/model", overwrite=True
# )

In [None]:
# Load the weights stored at this Drive checkpoint path into mf_model.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive — no mount call is visible in this file; confirm.
mf_model.load_weights("/content/drive/MyDrive/AnimeRecommendations/MatrixFactorizationModel/model")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8394447f90>

## Neural Network

In [None]:
class NeuralNetworkModel(tf.keras.Model):
    """Rating predictor that feeds concatenated embeddings through two dense layers.

    The user and item vectors are concatenated, passed through a 64-unit ReLU
    layer and then through a single-unit ReLU output layer.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        """Create the embedding tables and the dense layers.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embedding_dim: Width of the user and item embedding vectors.
        """
        super().__init__()

        self.embedding_dim = embedding_dim

        self.user_embeddings = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embeddings = tf.keras.layers.Embedding(num_items, embedding_dim)

        # Hidden layer followed by the one-unit output layer.
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='relu')

        self.concat = tf.keras.layers.Concatenate()
        self.dropout = tf.keras.layers.Dropout(.5)

    def call(self, inputs, training=False):
        """Predict ratings for a batch of ``(user index, item index)`` rows.

        Args:
            inputs: 2-D batch; column 0 is read as the user index and
                column 1 as the item index.
            training: When truthy, dropout is applied to both embeddings.

        Returns:
            The output of the final dense layer, one value per row.
        """
        users, items = inputs[:, 0], inputs[:, 1]

        user_vec = self.user_embeddings(users)
        item_vec = self.item_embeddings(items)

        if training:
            user_vec = self.dropout(user_vec, training=training)
            item_vec = self.dropout(item_vec, training=training)

        flat_shape = [-1, self.embedding_dim]
        features = self.concat([
            tf.reshape(user_vec, flat_shape),
            tf.reshape(item_vec, flat_shape),
        ])

        return self.dense2(self.dense1(features))

In [None]:
# Neural-network model with 64-dimensional embeddings.
nn_model = NeuralNetworkModel(
    num_users=num_users, num_items=num_anime, embedding_dim=64
)

# Mean squared error is the training loss; RMSE is reported as a metric.
# run_eagerly=True makes Keras run the model eagerly instead of as a compiled graph.
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")],
    run_eagerly=True,
)

In [None]:
# Stop once the training loss has gone three epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Every epoch is capped at 1000 batches of 64 samples.
# Validation on (x_test, y_test) is currently disabled.
history = nn_model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    steps_per_epoch=1000,
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100


In [None]:
# nn_model.save_weights(
#     "/content/drive/MyDrive/AnimeRecommendations/NeuralNetworkModel/model", overwrite=True
# )

In [None]:
# Load the weights stored at this Drive checkpoint path into nn_model.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive — no mount call is visible in this file; confirm.
nn_model.load_weights("/content/drive/MyDrive/AnimeRecommendations/NeuralNetworkModel/model")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8347e4ebd0>

## Comparing the models

In [None]:
# Score the matrix-factorization model on the validation split (compiled with MSE loss and an RMSE metric).
mf_model.evaluate(x = x_val, y = y_val)



[3.059866428375244, 1.749247431755066]

In [None]:
# Score the neural-network model on the validation split (compiled with MSE loss and an RMSE metric).
nn_model.evaluate(x = x_val, y = y_val)



[1.9271608591079712, 1.3882222175598145]