In [None]:
#@title Kaggle API

from IPython.display import clear_output

username = str(input("username: "))
key = str(input("key: "))

clear_output()

f = open("kaggle.json", "w")
f.write('{"username":"'+username+'","key":"'+key+'"}')
f.close()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d hernan4444/anime-recommendation-database-2020

import os, re
from zipfile import ZipFile

with ZipFile('/content/anime-recommendation-database-2020.zip', 'r') as zipObj:
   zipObj.extractall()

def purge(dir, pattern):
    for f in os.listdir(dir):
        if pattern in f:
            os.remove(os.path.join(dir, f))

purge("/content/", ".zip")
purge("/content/", ".json")

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Data Preprocessing

In [None]:
anime = pd.read_csv("/content/anime.csv")
interactions = pd.read_csv("/content/animelist.csv")

In [None]:
interactions = interactions.drop(columns = ["rating", "watched_episodes"])
interactions.head(5)

Unnamed: 0,user_id,anime_id,watching_status
0,0,67,1
1,0,6702,1
2,0,242,1
3,0,4898,1
4,0,21,1


Let's add a new feature `interaction`, which will be a 1 if the user had a positive interaction, 0.5 if we don't know, and 0 otherwise. We will model the interaction values using the `watching_status`.

```
1: Currently Watching
2: Completed
3: On Hold
4: Dropped
6: Plan to Watch
```

First, let's remove incorrect data.

In [None]:
interactions = interactions[interactions["watching_status"] >= 1] 
interactions = interactions[interactions["watching_status"] <= 6]
interactions = interactions[interactions["watching_status"] != 5]

We'll model interactions using the following function

In [None]:
def transform(df):
    df.loc[df["watching_status"] == 1, ["watching_status"]] = 0.5
    df.loc[df["watching_status"] == 2, ["watching_status"]] = 1.0
    df.loc[df["watching_status"] == 3, ["watching_status"]] = 0.5
    df.loc[df["watching_status"] == 4, ["watching_status"]] = 0.0
    df.loc[df["watching_status"] == 6, ["watching_status"]] = 1.0

    df.rename(columns = {'watching_status': 'interaction'}, inplace = True)

In [None]:
transform(interactions)

In [None]:
interactions["interaction"].value_counts()

1.0    96028444
0.5     8929172
0.0     4266591
Name: interaction, dtype: int64

In [None]:
num_users = interactions["user_id"].nunique()
num_anime = interactions["anime_id"].nunique()

print(f"There are a total of {num_users} users found.")
print(f"There are a total of {num_anime} anime found.")

print("Max user id:", interactions["user_id"].max())
print("Max anime id:", interactions["anime_id"].max())

There are a total of 325770 users found.
There are a total of 17562 anime found.
Max user id: 353404
Max anime id: 48492


If we take a look at the anime and user ids, we can notice that some values there are missing. Let's make a two helper functions, for easy conversion between dataset ids and embedding ids.

In [None]:
embId2user = sorted(interactions["user_id"].unique())
embId2anime = sorted(interactions["anime_id"].unique())

In [None]:
user2embId = {v: k for k, v in enumerate(embId2user)}
anime2embId = {v: k for k, v in enumerate(embId2anime)}

Now, let's make the dataset. It will look like a tuples of `(user_id, anime_embId, rating)`. We'll make 3 sets: `train`, `test` and `val`.

In [None]:
np.random.seed(seed = 42)

def makeDataSet(df, split=0.95):
    n = df.to_numpy()
    
    n = np.random.permutation(n)[:10000000]

    x = n[:, :2]
    y = n[:, 2]

    for i in range(x.shape[0]):
        x[i][0] = user2embId[x[i][0]]
        x[i][1] = anime2embId[x[i][1]]

    s1 = int(split * n.shape[0])
    s2 = s1 + int((1 - split) * n.shape[0] / 10)
    
    return (x[:s1], y[:s1], x[s1:s2], y[s1:s2], x[s2:], y[s2:])

In [None]:
x_train, y_train, x_test, y_test, x_val, y_val = makeDataSet(interactions)

# Models

In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


## Matrix Factorization

In [None]:
class MatrixFactorizationModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim):
        super(MatrixFactorizationModel, self).__init__()
        
        self.embedding_dim = embedding_dim
        
        self.user_embeddings = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embeddings = tf.keras.layers.Embedding(num_items, embedding_dim)

        self.user_biases = tf.keras.layers.Embedding(num_users, 1)
        self.item_biases = tf.keras.layers.Embedding(num_items, 1)

        self.bias = tf.Variable(tf.zeros([1]))

        self.dropout = tf.keras.layers.Dropout(.5)

    def call(self, inputs, training = False):
        user_ids = inputs[:, 0]
        item_ids = inputs[:, 1]

        user_embedding = self.user_embeddings(user_ids) + self.user_biases(user_ids)
        item_embedding = self.item_embeddings(item_ids) + self.item_biases(item_ids)

        if training:
            user_embedding = self.dropout(user_embedding, training = training)
            item_embedding = self.dropout(item_embedding, training = training)

        user_embedding = tf.reshape(user_embedding, [-1, self.embedding_dim])
        item_embedding = tf.reshape(item_embedding, [-1, self.embedding_dim])

        dot = tf.keras.layers.Dot(axes=1)([user_embedding, item_embedding]) + self.bias

        return tf.math.sigmoid(dot)

In [None]:
mf_model = MatrixFactorizationModel(num_users = num_users, 
                                    num_items = num_anime, 
                                    embedding_dim = 64)

mf_model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics = [
        tf.keras.metrics.RootMeanSquaredError("RMSE")
    ],
    run_eagerly = True
)

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor = 'loss', patience = 3)

history = mf_model.fit(x = x_train, y = y_train, 
                    batch_size = 64, 
                    epochs = 100, 
                    steps_per_epoch = 1000,
                    callbacks = [callback],
                    # validation_data = (x_test, y_test),
                    # validation_steps = 10,
                    # validation_batch_size = 64
                    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [None]:
# mf_model.save_weights(
#     "/content/drive/MyDrive/AnimeRecommendations/Interactions/MatrixFactorizationModel/model", overwrite=True
# )

In [None]:
mf_model.load_weights("/content/drive/MyDrive/AnimeRecommendations/Interactions/MatrixFactorizationModel/model")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0f504e6dd0>

## Neural Network

In [None]:
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self, num_users, num_items, embedding_dim):
        super(NeuralNetworkModel, self).__init__()
        
        self.embedding_dim = embedding_dim
        
        self.user_embeddings = tf.keras.layers.Embedding(num_users, embedding_dim)
        self.item_embeddings = tf.keras.layers.Embedding(num_items, embedding_dim)

        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid')

        self.concat = tf.keras.layers.Concatenate()
        self.dropout = tf.keras.layers.Dropout(.5)

    def call(self, inputs, training = False):
        user_ids = inputs[:, 0]
        item_ids = inputs[:, 1]

        user_embedding = self.user_embeddings(user_ids)
        item_embedding = self.item_embeddings(item_ids)

        if training:
            user_embedding = self.dropout(user_embedding, training = training)
            item_embedding = self.dropout(item_embedding, training = training)

        user_embedding = tf.reshape(user_embedding, [-1, self.embedding_dim])
        item_embedding = tf.reshape(item_embedding, [-1, self.embedding_dim])

        x = self.concat([user_embedding, item_embedding])
        x = self.dense1(x)
        x = self.dense2(x)

        return x

In [None]:
nn_model = NeuralNetworkModel(num_users = num_users, 
                              num_items = num_anime, 
                              embedding_dim = 64)

nn_model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics = [
        tf.keras.metrics.RootMeanSquaredError("RMSE")
    ],
    run_eagerly = True
)

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor = 'loss', patience = 3)

history = nn_model.fit(x = x_train, y = y_train, 
                       batch_size = 64, 
                       epochs = 100, 
                       steps_per_epoch = 1000,
                       callbacks = [callback],
                       
                     # validation_data = (x_test, y_test),
                     # validation_steps = 10,
                     # validation_batch_size = 64
                    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [None]:
# nn_model.save_weights(
#     "/content/drive/MyDrive/AnimeRecommendations/Interactions/NeuralNetworkModel/model", overwrite=True
# )

In [None]:
nn_model.load_weights("/content/drive/MyDrive/AnimeRecommendations/Interactions/NeuralNetworkModel/model")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0f504d6210>

## Comparing the models

In [33]:
mf_model.evaluate(x = x_val, y = y_val)



[0.2795208990573883, 0.23061399161815643]

In [34]:
nn_model.evaluate(x = x_val, y = y_val)



[0.25021466612815857, 0.21893906593322754]