# Imports

In [1]:
import pydot
import graphviz
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

In [2]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, merge, concatenate, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import model_to_dot
from IPython.display import SVG
from keras.layers import dot
from tensorflow.keras.utils import plot_model

2021-08-31 14:27:08.679019: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-31 14:27:08.679126: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [89]:
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error,mean_squared_error

# Dataset

In [5]:
# dataset = pd.read_csv('../data/processed_data/active_users_df_10PlusRatings_partial.csv')
# anime_df = pd.read_csv('../data/raw_data/anime.csv')
# dataset = dataset.sort_values(["user_id", "anime_id"], ascending=(True, True))

In [6]:
# 5M rows from dataset
dataset_full = pd.read_csv('../data/processed_data/active_users_df.csv')

In [None]:
dataset_full = dataset_full.sort_values(["user_id", "anime_id"], ascending=(True, True))
dataset = dataset_full.sample(frac=0.1, random_state=42)

## Normalize rating - divide by 10 before splitting into train/test


In [None]:
norm_rating = dataset.rating/10
norm_rating

In [None]:
dataset['rating'] = norm_rating


In [None]:
dataset.head()

### Reduce size of data inside df

In [154]:
def df_optimized(df, verbose=True, **kwargs):
    """
    Reduces size of dataframe by downcasting numerical columns
    :param df: input dataframe
    :param verbose: print size reduction if set to True
    :param kwargs:
    :return:
    """
    in_size = df.memory_usage(index=True).sum()
    for type in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=type))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=type)
            if type == "float":
                df[col] = pd.to_numeric(df[col], downcast="integer")
    out_size = df.memory_usage(index=True).sum()
    ratio = (1 - round(out_size / in_size, 2)) * 100
    GB = out_size / 1000000000
    if verbose:
        print("optimized size by {} % | {} GB".format(ratio, GB))
    return df

In [166]:
dataset = df_optimized(dataset_full)

optimized size by 6.000000000000005 % | 3.525976768 GB


In [169]:
dataset.dtypes

user_id        int32
anime_id       int32
rating       float32
mean_pred    float32
dtype: object

In [168]:
dataset.shape

(56738161, 4)

In [9]:
dataset.head()

Unnamed: 0,user_id,anime_id,rating
9980816,62125,38062,3
3744929,23493,298,5
54195347,337438,10628,7
21400486,133375,4052,7
13647532,84974,4246,8


In [10]:
dataset.anime_id.nunique()

16082

In [11]:
dataset.user_id.nunique()

177913

# Order dataset by user_id and anime_id

In [12]:
anime_id_to_new_id = dict()
id = 1

In [13]:
%%time
for index, row in dataset.iterrows():
    if anime_id_to_new_id.get(row['anime_id']) is None:
        anime_id_to_new_id[row['anime_id']] = id
        dataset.at[index, 'anime_id'] = id
        id += 1
    else:
        dataset.at[index, 'anime_id'] = anime_id_to_new_id.get(row['anime_id'])


CPU times: user 7min 29s, sys: 114 ms, total: 7min 29s
Wall time: 7min 29s


In [14]:
%%time
user_id_to_new_id = dict()
id = 1
for index, row in dataset.iterrows():
    if user_id_to_new_id.get(row['user_id']) is None:
        user_id_to_new_id[row['user_id']] = id
        dataset.at[index, 'user_id'] = id
        id += 1
    else:
        dataset.at[index, 'user_id'] = user_id_to_new_id.get(row['user_id'])


CPU times: user 7min, sys: 78.4 ms, total: 7min
Wall time: 7min


In [122]:
num_users = len(dataset.user_id.unique())
num_animes = len(dataset.anime_id.unique())
train, test = train_test_split(dataset, test_size=0.2)

print('Number of movies', num_animes)
print('Number of users', num_users)

Number of movies 16082
Number of users 177913


In [123]:
print('train shape: ', train.shape)
print('test shape: ', test.shape)

train shape:  (4539052, 3)
test shape:  (1134764, 3)


In [153]:
dataset.dtypes

user_id        int64
anime_id       int64
rating       float64
mean_pred    float64
dtype: object

## Baseline on mean

In [146]:
dataset.rating.mean()

0.7374688216889657

In [147]:
y_base = dataset.rating.mean()

In [148]:
dataset['mean_pred'] = dataset['rating']

In [149]:
dataset['mean_pred'] = y_base

In [150]:
y_true = dataset.rating
y_true.shape
y_mean_pred = dataset.mean_pred

In [151]:
mean_absolute_error(y_true,y_mean_pred)

0.13761398921096596

In [152]:
mean_squared_error(y_true,y_mean_pred)

0.031107109965243282

In [115]:
dataset.drop(columns='mean_pred', inplace=True)

In [116]:
dataset

Unnamed: 0,user_id,anime_id,rating,norm_rating
9980816,1,1,3,0.3
3744929,2,2,5,0.5
54195347,3,3,7,0.7
21400486,4,4,7,0.7
13647532,5,5,8,0.8
...,...,...,...,...
9538738,11362,589,8,0.8
19745972,7815,1371,8,0.8
8665987,31957,7382,7,0.7
7066940,149216,1354,6,0.6


# Multi-layer Perceptron

In [125]:
from keras.layers import  BatchNormalization

latent_dim = 2

# Define inputs
anime_input = Input(shape=[1],name='anime-input')
user_input = Input(shape=[1], name='user-input')

# MLP Embeddings
anime_embedding_mlp = Embedding(num_animes + 1, latent_dim, name='anime-embedding-mlp')(anime_input)
anime_vec_mlp = Flatten(name='flatten-anime-mlp')(anime_embedding_mlp)

user_embedding_mlp = Embedding(num_users + 1, latent_dim, name='user-embedding-mlp')(user_input)
user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

# MF Embeddings
anime_embedding_mf = Embedding(num_animes + 1, latent_dim, name='anime-embedding-mf')(anime_input)
anime_vec_mf = Flatten(name='flatten-anime-mf')(anime_embedding_mf)

user_embedding_mf = Embedding(num_users + 1, latent_dim, name='user-embedding-mf')(user_input)
user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)


In [137]:
# MLP layers
concat = concatenate([anime_vec_mlp, user_vec_mlp], axis=1, name='concat')
concat_dropout = Dropout(0.2)(concat)
fc_1 = Dense(50, name='fc-1', activation='relu')(concat_dropout)
fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
fc_1_dropout = Dropout(0.2)(fc_1_bn)
fc_2 = Dense(20, name='fc-2', activation='relu')(fc_1_dropout)
fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
fc_2_dropout = Dropout(0.2)(fc_2_bn)

# Prediction from both layers
pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
pred_mf = dot([anime_vec_mf, user_vec_mf], axes=1, normalize=False, name='pred-mf')
combine_mlp_mf = concatenate([pred_mf, pred_mlp], axis=1, name='combine-mlp-mf')

# Final prediction
result = Dense(1, name='result', activation='linear')(combine_mlp_mf)

model = Model([user_input, anime_input], result)
model.compile(Adam(learning_rate=0.1), loss='mae',metrics=['mae'])

In [138]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=2, restore_best_weights=True)

In [139]:
history = model.fit([train.user_id, train.anime_id], train.rating, epochs=4,validation_split=0.3,callbacks=[es])

Epoch 1/4
  747/99292 [..............................] - ETA: 21:44 - loss: 0.1797 - mae: 0.1797

KeyboardInterrupt: 

In [129]:
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.show()

y_hat = np.round(model.predict([test.user_id, test.anime_id]), decimals=2)
y_true = test.rating
mean_absolute_error(y_true, y_hat)

NameError: name 'history' is not defined

In [130]:
def plot_loss_mae(history, title=None):
    fig, ax = plt.subplots(1,2, figsize=(13,5))
    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])
    ax[0].set_title('Model loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylim((0,3))
    ax[0].legend(['Train', 'Val'], loc='best')
    
    ax[1].plot(history.history['mae'])
    ax[1].plot(history.history['val_mae'])
    ax[1].set_title('Model mae')
    ax[1].set_ylabel('mae')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Val'], loc='best')
    ax[1].set_ylim((0,3))
    if title:
        fig.suptitle(title)

In [131]:
plot_loss_mae(history)

NameError: name 'history' is not defined

In [144]:
# MLP layers
concat = concatenate([anime_vec_mlp, user_vec_mlp], axis=1, name='concat')
concat_dropout = Dropout(0.2)(concat)
fc_1 = Dense(50, name='fc-1', activation='relu')(concat_dropout)
fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
fc_1_dropout = Dropout(0.2)(fc_1_bn)
fc_2 = Dense(20, name='fc-2', activation='relu')(fc_1_dropout)
fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
fc_2_dropout = Dropout(0.2)(fc_2_bn)

# Prediction from both layers
pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
pred_mf = dot([anime_vec_mf, user_vec_mf], axes=1, normalize=False, name='pred-mf')
combine_mlp_mf = concatenate([pred_mf, pred_mlp], axis=1, name='combine-mlp-mf')

# Final prediction
result = Dense(1, name='result', activation='linear')(combine_mlp_mf)

model = Model([user_input, anime_input], result)
model.compile(Adam(learning_rate=0.1), loss='mse',metrics=['mse'])



In [145]:
history = model.fit([train.user_id, train.anime_id], train.rating, epochs=4,validation_split=0.3,callbacks=[es])

Epoch 1/4
  591/99292 [..............................] - ETA: 19:54 - loss: 0.0672 - mse: 0.0672

KeyboardInterrupt: 

In [None]:
pd.Series(history.history['loss']).plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.show()

y_hat = np.round(model.predict([test.user_id, test.anime_id]), decimals=2)
y_true = test.rating
mean_absolute_error(y_true, y_hat)

In [None]:
history.__dict__

In [None]:
def plot_loss_mse(history, title=None):
    fig, ax = plt.subplots(1,2, figsize=(13,5))
    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])
    ax[0].set_title('Model loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylim((0,3))
    ax[0].legend(['Train', 'Val'], loc='best')
    
    ax[1].plot(history.history['mse'])
    ax[1].plot(history.history['val_mse'])
    ax[1].set_title('Model mse')
    ax[1].set_ylabel('mse')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Val'], loc='best')
    ax[1].set_ylim((0,3))
    if title:
        fig.suptitle(title)

In [None]:
plot_loss_mse(history)