In [2]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Load the anime catalogue and the user ratings from the Kaggle input folder.
df_anime, df_rating = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/anime-recommendations-database/anime.csv",
        "../input/anime-recommendations-database/rating.csv",
    )
)

In [12]:
# Preview the first five catalogue rows.
df_anime.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [13]:
# Preview the first five raw rating rows.
df_rating.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [14]:
def _make_dense_one_hot_encoder():
    """Return a OneHotEncoder that emits dense arrays on old and new scikit-learn."""
    try:
        # Newer scikit-learn releases spell the option `sparse_output`.
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False)


def _min_max_scale(values):
    """Scale a numeric Series to [0, 1]; a constant Series maps to all zeros."""
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Avoid a 0/0 division that would turn the whole column into NaN.
        return values - lowest
    return (values - lowest) / span


def get_anime_feature_map(df_anime):
    """Build a content-feature vector for every anime.

    Each vector is the concatenation of
    ``multi-hot(genre) + one-hot(type) + [scaled rating, scaled members]``.

    Missing genres and missing types are each encoded as an extra ``''`` class,
    and missing ratings are replaced by the mean rating before scaling.

    The input frame is NOT modified: the function only reads the ``anime_id``,
    ``genre``, ``type``, ``rating`` and ``members`` columns, so the cell can be
    re-run safely. The ``episodes`` column is not part of the feature vector
    and is therefore not touched.

    Parameters
    ----------
    df_anime : pandas.DataFrame
        Anime catalogue with the columns listed above.

    Returns
    -------
    anime_feature_map : dict
        Maps ``anime_id`` to its feature vector (a plain list).
    mlb : MultiLabelBinarizer
        Binarizer fitted on the genre lists.
    ohe : OneHotEncoder
        Encoder fitted on the type column.
    """
    ## Imputing rating with the mean rating
    rating = df_anime['rating'].fillna(df_anime['rating'].mean())

    ## Imputing genre with extra '' class
    genre_lists = [
        [g.strip() for g in x.split(',')] if isinstance(x, str) else ['']
        for x in df_anime['genre']
    ]
    mlb = MultiLabelBinarizer()
    mlb.fit(genre_lists)
    # Encode the whole column in one call instead of once per row.
    genre_matrix = mlb.transform(genre_lists)

    ## Imputing type column with extra '' class
    type_values = np.array(df_anime['type'].fillna('').tolist()).reshape(-1, 1)
    ohe = _make_dense_one_hot_encoder()
    ohe.fit(type_values)
    type_matrix = ohe.transform(type_values)

    ## normalize ratings and members
    rating_scaled = _min_max_scale(rating)
    members_scaled = _min_max_scale(df_anime['members'])

    ## generating feature_map
    anime_feature_map = {
        anime_id: list(genre_row) + list(type_row) + [rating_value, members_value]
        for anime_id, genre_row, type_row, rating_value, members_value in zip(
            df_anime['anime_id'], genre_matrix, type_matrix, rating_scaled, members_scaled
        )
    }

    return anime_feature_map, mlb, ohe

In [15]:
# Build the per-anime feature vectors and keep the fitted encoders.
anime_feature_map, mlb, ohe = get_anime_feature_map(df_anime=df_anime)

12294it [00:01, 8303.78it/s]


In [16]:
# Attach each rating's anime feature vector, then keep only rows that have a
# feature vector and a rating other than -1.
# NOTE(review): -1 presumably marks "watched but not rated" - confirm against the dataset docs.
df_rating['anime_features'] = df_rating['anime_id'].apply(anime_feature_map.get)
has_features = df_rating['anime_features'].notna()
has_score = df_rating['rating'] != -1
df_rating = df_rating[has_features & has_score]

In [17]:
# Preview the filtered ratings with their attached feature vectors.
df_rating.head(n=5)

Unnamed: 0,user_id,anime_id,rating,anime_features
47,1,8074,10,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
81,1,11617,10,"[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, ..."
83,1,11757,10,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
101,1,15451,10,"[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, ..."
153,2,11771,10,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
# Number of ratings per user, indexed by user_id.
user_count = df_rating.groupby('user_id')['rating'].count()

# Keep users with 5 to 100 ratings (inclusive). The per-row count is looked up
# with a vectorised map instead of a Python lambda per rating, and the result is
# copied so later column assignments write to a real frame rather than a slice.
ratings_per_row = df_rating['user_id'].map(user_count)
df_rating = df_rating[ratings_per_row.between(5, 100)].copy()

In [19]:
# Map raw ids to contiguous 0-based indices, in order of first appearance.
unique_user_ids = df_rating["user_id"].unique()
unique_anime_ids = df_rating["anime_id"].unique()
user_idx_map = dict(zip(unique_user_ids, range(len(unique_user_ids))))
anime_idx_map = dict(zip(unique_anime_ids, range(len(unique_anime_ids))))

In [20]:
# Translate raw ids into the contiguous indices; an id missing from a map raises KeyError.
df_rating["user_idx"] = df_rating["user_id"].map(user_idx_map.__getitem__)
df_rating["anime_idx"] = df_rating["anime_id"].map(anime_idx_map.__getitem__)

In [21]:
# Largest assigned user index, then largest assigned anime index.
print(df_rating["user_idx"].max(), df_rating["anime_idx"].max(), sep="\n")

41171
7185


In [22]:
# Preview the ratings with their user/anime index columns.
df_rating.head(n=5)

Unnamed: 0,user_id,anime_id,rating,anime_features,user_idx,anime_idx
156,3,20,8,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
157,3,154,6,"[0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,1
158,3,170,9,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,2
159,3,199,10,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,3
160,3,225,9,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,4


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the ratings for testing. The split is stratified on user_id so
# each user's ratings are divided in roughly the same proportion.
df_rating_train, df_rating_test = train_test_split(
    df_rating,
    test_size=0.1,
    random_state=93,
    stratify=df_rating["user_id"],
)

In [59]:
def _as_model_inputs(ratings):
    """Return [user indices, anime indices, stacked anime feature vectors] for the model."""
    feature_matrix = np.array([np.array(features) for features in ratings['anime_features']])
    return [ratings['user_idx'].values, ratings['anime_idx'].values, feature_matrix]


X_train = _as_model_inputs(df_rating_train)
y_train = df_rating_train['rating'].values

X_test = _as_model_inputs(df_rating_test)
y_test = df_rating_test['rating'].values

In [61]:
import tensorflow as tf
# Global, process-wide switch: turns off TensorFlow 2.x behaviour before any Keras
# object is created.
# NOTE(review): the reason is not recorded in this notebook - confirm it is still needed.
tf.compat.v1.disable_v2_behavior()
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, Embedding, Dot, Concatenate, Add, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [79]:
def create_model(n_users, user_embed_size_dot, user_embed_size_concat, n_items, item_embed_size, item_feature_len, regularization=1e-4, hidden_units=8, dropout_rate=0.2):
    """Build the hybrid rating model.

    The network has two branches that are concatenated before a single output unit:

    * a matrix-factorisation branch: ``dot(user_embed, item_embed)`` plus a
      per-user and a per-item bias embedding;
    * a content branch: a second user embedding concatenated with the item
      feature vector, passed through Dense -> BatchNormalization -> Dropout.

    Parameters
    ----------
    n_users, n_items : int
        Sizes of the user and item embedding tables.
    user_embed_size_dot : int
        Width of the user embedding used in the dot product.
    user_embed_size_concat : int
        Width of the user embedding concatenated with the item features.
    item_embed_size : int
        Width of the item embedding; must equal ``user_embed_size_dot``.
    item_feature_len : int
        Length of the item feature vector input.
    regularization : float
        L2 factor applied to the non-bias embedding tables.
    hidden_units : int
        Width of the hidden Dense layer in the content branch.
    dropout_rate : float
        Dropout rate applied after the hidden layer.

    Returns
    -------
    An uncompiled Keras ``Model`` taking ``[user index, item index, item features]``.

    Raises
    ------
    ValueError
        If ``user_embed_size_dot`` and ``item_embed_size`` differ.
    """
    # Fail early with a readable message instead of a shape error from the Dot layer.
    if user_embed_size_dot != item_embed_size:
        raise ValueError(
            "user_embed_size_dot ({}) must equal item_embed_size ({}) "
            "because the two embeddings are combined with a dot product".format(
                user_embed_size_dot, item_embed_size))

    item_features = Input(shape=(item_feature_len, ), name="item_features")
    user_inp = Input(shape=(1, ), dtype='int32', name="user_embed")
    user_embed = Embedding(n_users,
                           user_embed_size_dot,
                           name='user_embed_mat',
                           embeddings_initializer="glorot_uniform",
                           embeddings_regularizer=keras.regularizers.l2(regularization))(user_inp)
    user_embed_bias = Embedding(n_users,
                                1,
                                name='user_embed_bias_mat',
                                embeddings_initializer="glorot_uniform")(user_inp)
    user_embed_c = Embedding(n_users,
                             user_embed_size_concat,
                             name='user_embed_c_mat',
                             embeddings_initializer="glorot_uniform",
                             embeddings_regularizer=keras.regularizers.l2(regularization))(user_inp)

    item_inp = Input(shape=(1, ), dtype='int32', name="item_embed")
    item_embed = Embedding(n_items,
                           item_embed_size,
                           name='item_embed_mat',
                           embeddings_initializer="glorot_uniform",
                           embeddings_regularizer=keras.regularizers.l2(regularization))(item_inp)
    item_embed_bias = Embedding(n_items,
                                1,
                                name='item_embed_bias_mat',
                                embeddings_initializer="glorot_uniform")(item_inp)

    # Matrix-factorisation branch: user . item + user bias + item bias.
    user_item_dot = Dot(axes=2, name='user_item_dot')([user_embed, item_embed])
    user_item_dot = Add()([user_item_dot, user_embed_bias, item_embed_bias])
    user_item_dot = Flatten()(user_item_dot)
    user_embed_c = Flatten()(user_embed_c)

    # Content branch: user embedding alongside the item feature vector.
    user_item_concat = Concatenate(axis=1)([user_embed_c, item_features])

    hidden1 = Dense(hidden_units, activation="relu")(user_item_concat)
    hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(dropout_rate)(hidden1)

    dot_hidden1_concat = Concatenate(axis=1)([hidden1, user_item_dot])

    output = Dense(1, activation="relu")(dot_hidden1_concat)

    model = Model([user_inp, item_inp, item_features], output)

    return model
    

In [80]:
# Embedding table sizes: the indices are 0-based, so size = max index + 1.
N_USERS = df_rating.user_idx.max() + 1
N_ITEMS = df_rating.anime_idx.max() + 1
USER_EMBEDDING_SIZE_DOT = 20
USER_EMBEDDING_SIZE_CONCAT = 20
# Must equal USER_EMBEDDING_SIZE_DOT: the two embeddings are combined with a dot product.
ITEM_EMBEDDING_SIZE = 20
# Taken from the feature matrix that is fed to the model, so it keeps tracking
# the encoders if the set of genres or types changes.
ITEM_FEATURE_LEN = int(X_train[2].shape[1])

model = create_model(N_USERS, USER_EMBEDDING_SIZE_DOT, USER_EMBEDDING_SIZE_CONCAT, N_ITEMS, ITEM_EMBEDDING_SIZE, ITEM_FEATURE_LEN)
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_embed (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embed_c_mat (Embedding)    (None, 1, 20)        823440      user_embed[0][0]                 
__________________________________________________________________________________________________
flatten_15 (Flatten)            (None, 20)           0           user_embed_c_mat[0][0]           
__________________________________________________________________________________________________
item_features (InputLayer)      [(None, 53)]         0                                            
____________________________________________________________________________________________

In [81]:
# Regress the rating with mean squared error; report mean absolute error alongside.
model.compile(optimizer=Adam(1e-3), loss="mse", metrics=["mae"])

In [82]:
# callbacks defined

# learning rate schedule
def step_decay(epoch):
    """Return the learning rate for the given 0-based epoch.

    Despite the name, the rate decays smoothly rather than in steps: it starts
    from 0.001 and is multiplied by 0.5 for every 5 epochs elapsed, with
    fractional progress applied at every epoch.
    """
    initial_lrate = 0.001
    drop = 0.5
    epochs_drop = 5
    exponent = (1 + epoch) / epochs_drop
    return initial_lrate * (drop ** exponent)

# Callbacks: apply the learning-rate schedule, stop once val_loss has not improved
# for 5 epochs, and write a checkpoint whenever val_loss reaches a new best.
lrate_scheduler = LearningRateScheduler(schedule=step_decay)
early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1)
model_chkpoint = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)

# model fitting
training_callbacks = [early_stop, model_chkpoint, lrate_scheduler]
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.1,
    callbacks=training_callbacks,
)

Train on 1272119 samples, validate on 141347 samples
Epoch 1/50




Epoch 00001: val_loss improved from inf to 1.64769, saving model to best_model.h5
Epoch 2/50
Epoch 00002: val_loss improved from 1.64769 to 1.60568, saving model to best_model.h5
Epoch 3/50
Epoch 00003: val_loss improved from 1.60568 to 1.58443, saving model to best_model.h5
Epoch 4/50
Epoch 00004: val_loss improved from 1.58443 to 1.57598, saving model to best_model.h5
Epoch 5/50
Epoch 00005: val_loss improved from 1.57598 to 1.56905, saving model to best_model.h5
Epoch 6/50
Epoch 00006: val_loss improved from 1.56905 to 1.56626, saving model to best_model.h5
Epoch 7/50
Epoch 00007: val_loss improved from 1.56626 to 1.56256, saving model to best_model.h5
Epoch 8/50
Epoch 00008: val_loss improved from 1.56256 to 1.56150, saving model to best_model.h5
Epoch 9/50
Epoch 00009: val_loss improved from 1.56150 to 1.56019, saving model to best_model.h5
Epoch 10/50
Epoch 00010: val_loss improved from 1.56019 to 1.55877, saving model to best_model.h5
Epoch 11/50
Epoch 00011: val_loss improved 

<tensorflow.python.keras.callbacks.History at 0x7f56844d0850>

In [None]:
# Display the raw predictions for the held-out ratings.
model.predict(x=X_test)

In [None]:
# Each prediction row holds a single value; store it next to the true rating.
test_predictions = model.predict(X_test)
df_rating_test['prediction'] = [prediction_row[0] for prediction_row in test_predictions]

In [None]:
# Preview the test ratings alongside their predictions.
df_rating_test.head(n=5)

In [None]:
# Mean absolute error between the true and predicted ratings on the test split.
absolute_errors = abs(df_rating_test["rating"] - df_rating_test["prediction"])
test_mae = sum(absolute_errors) / len(df_rating_test)
print("Test MAE: {}".format(test_mae))

### Upvote if you liked the approach.