In [None]:
! Y | apt-get install libblas-dev

In [None]:
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the anime metadata table and the user/anime rating table from the
# dataset directory (paths are relative to the notebook's working directory).
df_anime, df_rating = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/anime-recommendations-database/anime.csv",
        "../input/anime-recommendations-database/rating.csv",
    )
)

In [None]:
# Preview the first rows of the anime table loaded from anime.csv.
df_anime.head()

In [None]:
# Preview the first rows of the rating table loaded from rating.csv.
df_rating.head()

In [None]:
def _min_max_scale(values):
    """Rescale a numeric Series to the [0, 1] range; a constant Series maps to zeros."""
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Every value is identical: avoid the 0/0 -> NaN of a naive min-max.
        return values * 0.0
    return (values - low) / span


def _make_dense_one_hot_encoder():
    """Create a OneHotEncoder that yields dense arrays on old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 spells the option `sparse_output`.
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only know the `sparse` keyword.
        return OneHotEncoder(sparse=False)


def get_anime_feature_map(df_anime):
    """Build one numeric feature vector per anime.

    Each vector is the concatenation of:
      * a multi-hot encoding of the comma-separated ``genre`` string
        (a missing genre becomes the extra ``''`` class),
      * a one-hot encoding of ``type`` (a missing type becomes the extra ``''`` class),
      * ``rating`` with missing values replaced by the column mean, min-max scaled,
      * ``members``, min-max scaled.

    The input frame is NOT modified, so the function can be called repeatedly
    on the same ``df_anime``. The ``episodes`` column is not read: it was never
    part of the feature vector.

    Parameters
    ----------
    df_anime : pandas.DataFrame
        Must contain the columns ``anime_id``, ``genre``, ``type``, ``rating``
        and ``members``.

    Returns
    -------
    anime_feature_map : dict
        Maps ``anime_id`` to a list of floats (the feature vector).
    mlb : MultiLabelBinarizer
        Encoder fitted on the genre lists.
    ohe : OneHotEncoder
        Encoder fitted on the type values.
    """
    ## Imputing genre with extra '' class
    genre_lists = [
        [g.strip() for g in value.split(',')] if isinstance(value, str) else ['']
        for value in df_anime['genre']
    ]
    mlb = MultiLabelBinarizer()
    # Fit and encode the whole column in one call instead of one transform per row.
    genre_matrix = mlb.fit_transform(genre_lists)

    ## Imputing type column with extra '' class
    type_values = np.asarray(df_anime['type'].fillna(''), dtype=str).reshape(-1, 1)
    ohe = _make_dense_one_hot_encoder()
    type_matrix = ohe.fit_transform(type_values)

    ## Imputing rating with the mean rating, then normalize ratings and members
    rating_filled = df_anime['rating'].fillna(df_anime['rating'].mean())
    rating_scaled = _min_max_scale(rating_filled)
    members_scaled = _min_max_scale(df_anime['members'])

    ## generating feature_map
    # Column order matches the documented layout: genres, types, rating, members.
    features = np.column_stack([
        genre_matrix,
        type_matrix,
        rating_scaled.to_numpy(),
        members_scaled.to_numpy(),
    ]).astype(float)
    anime_feature_map = dict(zip(df_anime['anime_id'].tolist(), features.tolist()))

    return anime_feature_map, mlb, ohe

In [None]:
# Build the anime_id -> feature-vector lookup and keep the fitted genre / type encoders.
anime_feature_map, mlb, ohe = get_anime_feature_map(df_anime)

In [None]:
# df_rating.head()
# Look up the feature vector of every rated anime (None when the id is not in the map).
df_rating['anime_features'] = df_rating['anime_id'].map(anime_feature_map.get)
# Keep only rows that have a feature vector and whose rating is not -1
# (presumably a "no score given" sentinel -- confirm against the dataset description).
df_rating = df_rating[df_rating['anime_features'].notna() & (df_rating['rating'] != -1)]

In [None]:
# Preview the rating table after attaching anime features and dropping -1 ratings.
df_rating.head()

In [None]:
# Keep only users with a moderate number of ratings (bounds are inclusive).
MIN_RATINGS_PER_USER = 10
MAX_RATINGS_PER_USER = 30

# Number of ratings per user, indexed by user_id.
user_count = df_rating.groupby('user_id')['rating'].count()
# Vectorized lookup + range test instead of one Python-level Series lookup per row.
df_rating = df_rating[
    df_rating['user_id'].map(user_count).between(MIN_RATINGS_PER_USER, MAX_RATINGS_PER_USER)
]

In [None]:
# Assign every remaining user / anime a dense 0-based index, in order of first appearance.
user_idx_map = {
    user_id: position
    for position, user_id in enumerate(df_rating["user_id"].unique())
}
anime_idx_map = {
    anime_id: position
    for position, anime_id in enumerate(df_rating["anime_id"].unique())
}

In [None]:
# Add the dense embedding indices as new columns.
# `assign` returns a fresh frame, which avoids pandas' SettingWithCopyWarning
# (df_rating is a filtered slice at this point), and `map` does the dictionary
# lookup in a vectorized way.
df_rating = df_rating.assign(
    user_idx=df_rating["user_id"].map(user_idx_map),
    anime_idx=df_rating["anime_id"].map(anime_idx_map),
)

In [None]:
# Largest user index and largest anime index, one per line.
print(df_rating["user_idx"].max(), df_rating["anime_idx"].max(), sep="\n")

In [None]:
# Preview the rating table with the user_idx / anime_idx columns added.
df_rating.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the ratings for testing. Stratifying on user_id keeps each
# user's ratings split in the same proportion; the fixed seed makes the split repeatable.
df_rating_train, df_rating_test = train_test_split(
    df_rating,
    test_size=0.1,
    stratify=df_rating["user_id"],
    random_state=93,
)

In [None]:
def build_model_inputs(ratings):
    """Convert a ratings frame into the three arrays the model consumes.

    Returns ``[user_idx, anime_idx, anime_features]`` where the two index
    arrays have shape (n, 1) and the feature array has shape (n, feature_len).
    Keras expects one NumPy array per model input rather than nested Python lists.
    """
    return [
        ratings['user_idx'].to_numpy(dtype='int32').reshape(-1, 1),
        ratings['anime_idx'].to_numpy(dtype='int32').reshape(-1, 1),
        np.asarray(ratings['anime_features'].tolist(), dtype='float32'),
    ]


X_train = build_model_inputs(df_rating_train)
y_train = df_rating_train['rating'].to_numpy(dtype='float32')

X_test = build_model_inputs(df_rating_test)
y_test = df_rating_test['rating'].to_numpy(dtype='float32')

In [None]:
import tensorflow as tf
# Switch TensorFlow to its TF1-compatible behaviour; this runs before any Keras object is created below.
tf.compat.v1.disable_v2_behavior()
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Embedding, Dot, Concatenate, Add, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
def create_model(n_users, user_embed_size_dot, user_embed_size_concat, n_items, item_embed_size, item_feature_len, regularization=1e-4):
    """Build a hybrid rating-prediction model.

    Two branches are combined:
      * a matrix-factorisation branch: dot product of a user embedding and an
        item embedding, plus a learned per-user and per-item bias;
      * a content branch: a second user embedding concatenated with the item's
        feature vector, passed through a small dense layer (ReLU, batch norm,
        dropout).
    The two branch outputs are concatenated and mapped to a single value.

    Parameters
    ----------
    n_users : int
        Number of rows of the user embedding tables; must be greater than the
        largest user index fed to the model.
    user_embed_size_dot : int
        Width of the user embedding used in the dot product; must equal
        ``item_embed_size``.
    user_embed_size_concat : int
        Width of the user embedding used in the content branch.
    n_items : int
        Number of rows of the item embedding tables; must be greater than the
        largest item index fed to the model.
    item_embed_size : int
        Width of the item embedding used in the dot product.
    item_feature_len : int
        Length of the item feature vector.
    regularization : float, optional
        L2 factor applied to the (non-bias) embedding tables.

    Returns
    -------
    tensorflow.keras.models.Model
        Uncompiled model taking ``[user index, item index, item features]``.

    Raises
    ------
    ValueError
        If ``user_embed_size_dot`` and ``item_embed_size`` differ.
    """
    if user_embed_size_dot != item_embed_size:
        raise ValueError(
            "user_embed_size_dot ({}) must equal item_embed_size ({}) for the dot product".format(
                user_embed_size_dot, item_embed_size))

    item_features = Input(shape=(item_feature_len, ), name="item_features")

    # Index inputs use int32: int8 only holds values up to 127, so larger
    # user / item indices would overflow before reaching the embedding lookup.
    user_inp = Input(shape=(1, ), dtype='int32', name="user_embed")
    user_embed = Embedding(n_users,
                           user_embed_size_dot,
                           name='user_embed_mat',
                           embeddings_initializer="glorot_uniform",
                           embeddings_regularizer=keras.regularizers.l2(regularization))(user_inp)
    user_embed_bias = Embedding(n_users,
                                1,
                                name='user_embed_bias_mat',
                                embeddings_initializer="glorot_uniform")(user_inp)
    user_embed_c = Embedding(n_users,
                             user_embed_size_concat,
                             name='user_embed_c_mat',
                             embeddings_initializer="glorot_uniform",
                             embeddings_regularizer=keras.regularizers.l2(regularization))(user_inp)

    item_inp = Input(shape=(1, ), dtype='int32', name="item_embed")
    item_embed = Embedding(n_items,
                           item_embed_size,
                           name='item_embed_mat',
                           embeddings_initializer="glorot_uniform",
                           embeddings_regularizer=keras.regularizers.l2(regularization))(item_inp)
    item_embed_bias = Embedding(n_items,
                                1,
                                name='item_embed_bias_mat',
                                embeddings_initializer="glorot_uniform")(item_inp)

    # Matrix-factorisation branch: <user, item> + user bias + item bias.
    user_item_dot = Dot(axes=2, name='user_item_dot')([user_embed, item_embed])
    user_item_dot = Add()([user_item_dot, user_embed_bias, item_embed_bias])
    user_item_dot = Flatten()(user_item_dot)

    # Content branch: user embedding next to the item's feature vector.
    user_embed_c = Flatten()(user_embed_c)
    user_item_concat = Concatenate(axis=1)([user_embed_c, item_features])

    hidden1 = Dense(8, activation="relu")(user_item_concat)
    hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.2)(hidden1)

    dot_hidden1_concat = Concatenate(axis=1)([hidden1, user_item_dot])

    # NOTE(review): a ReLU on the final regression unit clamps predictions at 0
    # and stops learning if its pre-activation goes negative; consider a linear output.
    output = Dense(1, activation="relu")(dot_hidden1_concat)

    model = Model([user_inp, item_inp, item_features], output)

    return model
    

In [None]:
# An embedding table needs one row per index, i.e. (largest index + 1) rows;
# using the largest index itself leaves the last user / anime out of range.
N_USERS = int(df_rating.user_idx.max()) + 1
N_ITEMS = int(df_rating.anime_idx.max()) + 1
USER_EMBEDDING_SIZE_DOT = 20
USER_EMBEDDING_SIZE_CONCAT = 20
ITEM_EMBEDDING_SIZE = 20
# Read the feature length from the data rather than hard-coding it, so the
# model stays in sync if the genre / type vocabularies change.
ITEM_FEATURE_LEN = len(df_rating["anime_features"].iloc[0])

model = create_model(N_USERS, USER_EMBEDDING_SIZE_DOT, USER_EMBEDDING_SIZE_CONCAT, N_ITEMS, ITEM_EMBEDDING_SIZE, ITEM_FEATURE_LEN)
model.summary()

In [None]:
# Train as a regression: Adam at 1e-3, squared-error loss, MAE reported as a metric.
model.compile(optimizer=Adam(1e-3), loss="mse", metrics=["mae"])

In [None]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    """Return the learning rate to use for the given (0-based) epoch.

    The rate starts from 0.001 and is halved for every 5 epochs elapsed.
    The exponent is not rounded down, so despite the name the decay is
    smooth (it shrinks a little every epoch) rather than a staircase.
    """
    base_lr, halving_factor, halving_period = 0.001, 0.5, 5
    exponent = (1 + epoch) / halving_period
    return base_lr * (halving_factor ** exponent)

lrate_scheduler = LearningRateScheduler(step_decay)
# restore_best_weights=True puts the best validation-loss weights back into
# `model` when training stops; without it the in-memory model keeps the
# weights of the last (worse) epoch and the saved checkpoint is never used.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5, restore_best_weights=True)
# Also persist the best model to disk for later reuse.
model_chkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

# model fitting
# validation_split=0.1 holds out the last 10% of the training arrays for validation.
model.fit(X_train, y_train, batch_size=32, epochs=50, validation_split=0.1, callbacks=[early_stop, model_chkpoint, lrate_scheduler])

In [None]:
# Display the raw model outputs for the test inputs (one single-value row per sample).
model.predict(X_test)

In [None]:
# Store the predicted rating next to the true one.
# The model emits one value per row, so column 0 is the prediction; `assign`
# returns a new frame instead of writing into the slice produced by
# train_test_split (which triggers pandas' SettingWithCopyWarning).
df_rating_test = df_rating_test.assign(prediction=model.predict(X_test)[:, 0])

In [None]:
# Preview the test rows together with their predicted rating.
df_rating_test.head()

In [None]:
# Mean absolute error on the held-out ratings, computed with vectorized pandas
# operations instead of Python's element-by-element sum()/abs().
# skipna=False keeps a NaN prediction visible in the result rather than silently ignoring it.
test_abs_error = (df_rating_test["rating"] - df_rating_test["prediction"]).abs()
print("Test MAE: {}".format(test_abs_error.mean(skipna=False)))