In [None]:
# To be used on Google Colaboratory for TPU accelerator
%load_ext tensorboard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# If used on google colab, run on TPU
TPU_INIT = True

if TPU_INIT:
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
  except ValueError:
    raise BaseException('ERROR: Not connected to a TPU runtime!')
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  tpu_strategy = tf.distribute.TPUStrategy(tpu)
else:
  !nvidia-smi
print("Tensorflow version " + tf.__version__)

In [None]:
# Load the preprocessed anime metadata and user ratings tables.
# Later cells read the columns 'MAL_ID', 'Name', 'Genres' and 'Description'
# from `anime`, and 'user_id', 'anime_id' and 'rating' from `ratings`.
# NOTE(review): both paths point into /content/drive/MyDrive, so Google Drive
# presumably has to be mounted first -- no mount call is visible in this notebook.
anime = pd.read_csv("/content/drive/MyDrive/anime.csv")
ratings = pd.read_csv("/content/drive/MyDrive/ratings.csv")

# Quick sanity check on the size of the ratings table.
print(f"There are {len(ratings)} ratings")

In [None]:
# Embedding layers are indexed by contiguous integers, so every raw user and
# anime id is mapped to a dense 0-based code (in order of first appearance in
# `ratings`); the inverse dictionaries allow decoding back to the raw ids.

user_ids = ratings['user_id'].unique().tolist()
num_users = len(user_ids)
encoded2user = dict(enumerate(user_ids))
user2encoded = {raw_id: code for code, raw_id in encoded2user.items()}

anime_ids = ratings['anime_id'].unique().tolist()
num_anime = len(anime_ids)
encoded2anime = dict(enumerate(anime_ids))
anime2encoded = {raw_id: code for code, raw_id in encoded2anime.items()}

# Encoded columns that are fed to the model.
ratings['user'] = ratings['user_id'].map(user2encoded)
ratings['anime'] = ratings['anime_id'].map(anime2encoded)

# Rating bounds, used later for min-max normalisation of the target.
min_rating, max_rating = ratings['rating'].min(), ratings['rating'].max()

print(f"{max_rating} max rating, {min_rating} min rating, {num_users} users, {num_anime} anime")

In [None]:
# normalizing data and splitting into train/validation

# Shuffle into a NEW frame instead of rebinding `ratings`: re-running this cell
# then always shuffles the same input and reproduces the same split, rather
# than reshuffling an already shuffled frame.
ratings_shuffled = ratings.sample(frac=1, random_state=69)
x = ratings_shuffled[['user', 'anime']].values

# Min-max scale the ratings to [0, 1]; fail loudly if the range is degenerate
# instead of silently producing NaN/inf targets.
rating_range = max_rating - min_rating
if rating_range == 0:
    raise ValueError("Cannot normalize ratings: max_rating equals min_rating")
# Vectorised form of the per-row (rating - min) / (max - min) computation.
y = ((ratings_shuffled['rating'] - min_rating) / rating_range).values

# Number of rows that go into the training part (first 90% after shuffling).
train_indices = int(0.9 * ratings_shuffled.shape[0])

x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has not improved for two epochs in a
# row (the model converges quickly) and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=2,
    restore_best_weights=True,
)

# Base batch size and learning-rate bounds; scaled below when running on TPU.
batch_size = 8192
start_lr = 1e-5
min_lr = 1e-5
max_lr = 5e-5

# Shape of the learning-rate schedule consumed by lrfn().
rampup_epochs = 2
sustain_epochs = 0
exp_decay = 0.8

if TPU_INIT:
  # Scale the peak learning rate and the batch size with the replica count.
  replicas = tpu_strategy.num_replicas_in_sync
  max_lr *= replicas
  batch_size *= replicas

# Learning rate scheduler for a better training pace.
# The schedule parameters are free to tune; these values worked best for the author.
def lrfn(epoch):
  """Return the learning rate to use for the given epoch index.

  The schedule has three phases, driven by the module-level settings
  `start_lr`, `max_lr`, `min_lr`, `rampup_epochs`, `sustain_epochs` and
  `exp_decay`:

  1. linear ramp-up from `start_lr` towards `max_lr` over `rampup_epochs`,
  2. constant `max_lr` for `sustain_epochs`,
  3. exponential decay from `max_lr` towards `min_lr` by factor `exp_decay`.
  """
  if epoch < rampup_epochs:
    slope = (max_lr - start_lr)/rampup_epochs
    return slope * epoch + start_lr
  if epoch < rampup_epochs + sustain_epochs:
    return max_lr
  # Epochs elapsed since the decay phase started.
  epochs_decayed = epoch - rampup_epochs - sustain_epochs
  return (max_lr - min_lr) * exp_decay**epochs_decayed + min_lr

# Per-epoch learning-rate callback; the lambda looks `lrfn` up at call time.
lr_callback = tf.keras.callbacks.LearningRateScheduler(
    schedule=lambda epoch: lrfn(epoch),
    verbose=True,
)

# Callbacks handed to model.fit(), in the order they are listed here.
my_callbacks = [lr_callback, early_stopping]

In [None]:

import tensorflow as tf
from tensorflow import keras
from keras import layers, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dot, Add, Activation, BatchNormalization, Flatten, Input, Dense

def RecommenderNet():
    """Build and compile the embedding-based rating model.

    The model takes two inputs of shape [1] named 'user' and 'anime' (encoded
    ids). Each id is looked up in a 32-dimensional embedding and in a
    1-dimensional bias embedding. The L2-normalised dot product of the two
    embeddings is summed with both biases, flattened, and passed through
    Dense(1) -> BatchNormalization -> sigmoid.

    Reads the module-level `num_users` and `num_anime` for the embedding sizes.

    Returns:
        A Keras Model compiled with binary cross-entropy loss, the 'Adam'
        optimizer and the "mae" and "mse" metrics.
    """
    embedding_size = 32

    # --- user branch -----------------------------------------------------
    user_input = Input(shape=[1], name='user')
    user_vector = Embedding(
        input_dim=num_users,
        output_dim=embedding_size,
        embeddings_initializer="he_normal",
        embeddings_regularizer=keras.regularizers.l2(1e-6),
        name='user_embedding',
    )(user_input)
    user_offset = Embedding(
        input_dim=num_users,
        output_dim=1,
        name='user_bias',
    )(user_input)

    # --- anime branch ----------------------------------------------------
    anime_input = Input(shape=[1], name='anime')
    anime_vector = Embedding(
        input_dim=num_anime,
        output_dim=embedding_size,
        embeddings_initializer="he_normal",
        embeddings_regularizer=keras.regularizers.l2(1e-6),
        name='anime_embedding',
    )(anime_input)
    anime_offset = Embedding(
        input_dim=num_anime,
        output_dim=1,
        name='anime_bias',
    )(anime_input)

    # --- interaction and prediction head ---------------------------------
    similarity = Dot(axes=2, normalize=True, name='dot_product')([user_vector, anime_vector])
    score = Add(name='added')([similarity, anime_offset, user_offset])
    score = Flatten()(score)

    score = Dense(1, kernel_initializer='he_normal')(score)
    score = BatchNormalization()(score)
    prediction = Activation("sigmoid")(score)

    model = Model(inputs=[user_input, anime_input], outputs=prediction)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=["mae", "mse"])
    return model
# Instantiate the model. On TPU it is created inside the TPUStrategy scope,
# which means the model will be trained on the TPU.
if not TPU_INIT:
  model = RecommenderNet()
else:
  with tpu_strategy.scope():
    model = RecommenderNet()
model.summary()

In [None]:
# The model has two inputs (user, anime); feed it one column of the encoded
# id pairs per input.
train_inputs = [x_train[:, 0], x_train[:, 1]]
val_inputs = [x_val[:, 0], x_val[:, 1]]

history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    epochs=20,
    batch_size=batch_size,
    callbacks=my_callbacks,
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch. The second curve is `val_loss`,
# computed on the validation split passed to model.fit(), so it is labelled
# "validation" rather than "test".
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="train")
ax.plot(history.history["val_loss"], label="validation")
ax.set(title="model loss", xlabel="epoch", ylabel="loss")
ax.legend(loc="upper right")
plt.show()

In [None]:
# Fetch the anime embedding matrix by layer name. A positional lookup into
# model.get_weights() depends on how Keras orders the layers and silently
# breaks if the architecture changes.
anime_weights = model.get_layer('anime_embedding').get_weights()[0]

# Re-encode anime['MAL_ID'] into the embedding row index. The raw ids are kept
# in 'MAL_ID_raw' and always used as the mapping source, so re-running this
# cell yields the same result instead of re-mapping already-encoded ids.
if 'MAL_ID_raw' not in anime.columns:
    anime['MAL_ID_raw'] = anime['MAL_ID']
encoded_ids = anime['MAL_ID_raw'].map(anime2encoded)

# Titles that never occur in `ratings` have no embedding row; drop them
# rather than failing with a KeyError on the first such title.
anime = anime.loc[encoded_ids.notna()].assign(
    MAL_ID=encoded_ids.dropna().astype(int)
)

In [None]:
import keras.backend as K

def euclidean_distance_loss(y_true, y_pred):
    """
    Euclidean (L2) distance between targets and predictions.
    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.
    # Returns
        Square root of the squared differences summed over the last axis.
    """
    squared_diff = K.square(y_pred - y_true)
    sum_of_squares = K.sum(squared_diff, axis=-1)
    return K.sqrt(sum_of_squares)

def angle(y_true, y_pred):
  """Return tf.keras.losses.cosine_similarity of the inputs along the last axis.

  NOTE(review): per the TensorFlow documentation this loss is the *negative*
  cosine similarity, so smaller values mean more similar vectors -- confirm
  against the installed TensorFlow version.
  """
  similarity = tf.keras.losses.cosine_similarity(y_true=y_true, y_pred=y_pred, axis=-1)
  return similarity

def find_anime(name):
    """Return the 'MAL_ID' value of the first row in `anime` whose 'Name' equals `name`.

    Args:
        name: exact title to look up in the 'Name' column.

    Returns:
        The 'MAL_ID' of the first matching row.

    Raises:
        IndexError: if no row has that name (same exception type as the
            previous bare `[0]` lookup, but with a readable message).
    """
    matches = anime.loc[anime['Name'] == name, 'MAL_ID'].values
    if len(matches) == 0:
        raise IndexError(f"No anime named {name!r} in the anime table")
    return matches[0]

def similar(anime_list):
    """Recommend the ten anime whose embeddings best match the given titles.

    For each title in `anime_list`, the score returned by `angle` between every
    row of `anime_weights` and that title's embedding is accumulated. The ten
    lowest accumulated scores (excluding the query titles) are selected.

    Args:
        anime_list: iterable of exact anime names, resolved via `find_anime`.

    Returns:
        DataFrame with columns 'Name', 'Genders' and 'Description', one row
        per recommendation, best match first. The 'Genders' column holds the
        values of anime['Genres']; the label is kept for compatibility.

    Side effects:
        Prints the selected ids together with their accumulated scores.
    """
    in_list = [find_anime(anime_name) for anime_name in anime_list]

    # Float accumulator: the scores are floats, and an integer array cannot
    # take an in-place add of float values under NumPy's casting rules.
    distance = np.zeros(num_anime, dtype=float)
    for anime_id in in_list:
        # np.asarray turns a TensorFlow tensor result into a plain array so
        # the accumulator stays a NumPy array.
        distance += np.asarray(angle(anime_weights, anime_weights[anime_id]))

    # Ascending sort: lowest accumulated score first; query titles are removed.
    top10 = pd.Series(distance).sort_values().drop(labels=in_list).head(10)
    print(top10)

    # Collect the rows first and build the frame once, instead of growing a
    # DataFrame row by row.
    records = []
    for anime_id in top10.index.values:
        row = anime.loc[anime['MAL_ID'] == anime_id]
        records.append({
            "Name": row['Name'].values[0],
            "Genders": row['Genres'].values[0],
            "Description": row['Description'].values[0],
        })
    return pd.DataFrame(records, columns=['Name', 'Genders', 'Description'])


In [None]:
# Example query: show up to ten recommendations for the title 'Naruto'.
# The name must match a value in anime['Name'] exactly.
similar(['Naruto'])