In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation,BatchNormalization,Input,Embedding,Dot,Dense,Flatten
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler,TensorBoard,EarlyStopping

from wordcloud import WordCloud
%matplotlib inline     

#### READING ANIMELIST.CSV

In [4]:
import os

In [6]:
INPUT_DIR = os.path.join("..","artifacts","raw")     # Relative path: ".." goes up one level to the parent directory, then down into artifacts/raw

In [7]:
# Load only the three columns the notebook uses from animelist.csv.
# The path is built with os.path.join, the same way INPUT_DIR itself is built,
# instead of concatenating a hard-coded "/" separator.
rating_df = pd.read_csv(
    os.path.join(INPUT_DIR, "animelist.csv"),
    low_memory=True,
    usecols=["user_id", "anime_id", "rating"],
)

In [8]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [9]:
len(rating_df)

5000000

#### DATA PROCESSING

In [14]:
# Keep only active/frequent users (at least 400 ratings each); these are the
# users the notebook relies on for making predictions.
n_ratings = rating_df["user_id"].value_counts()   # ratings per user_id
rating_df = rating_df.loc[
    rating_df["user_id"].isin(n_ratings.index[n_ratings.to_numpy() >= 400])
].copy()   # .copy() so later column assignments do not touch a view of the original frame

In [15]:
len(rating_df)

3246641

In [32]:
# Smallest value currently in the rating column (lower bound for min-max scaling).
min_rating = rating_df["rating"].min()

In [33]:
min_rating

np.float64(0.0)

In [34]:
# Largest value currently in the rating column (upper bound for min-max scaling).
# NOTE(review): the saved output below (1.0) comes from a run made after the
# scaling cell had already executed; run top-to-bottom this is the raw maximum.
max_rating = rating_df["rating"].max()

In [35]:
max_rating

np.float64(1.0)

In [30]:
# Mean of the rating column as it stands when this cell runs.
# NOTE(review): this cell sits before the min-max scaling cell, but its execution
# count (In [30], after the scaling cell's In [29]) and saved output (~0.41) show
# it was last run on already-scaled ratings. A top-to-bottom run computes the mean
# on the unscaled column instead.
avg_rating = rating_df["rating"].mean()

In [31]:
avg_rating

np.float64(0.4122732695114729)

In [29]:
# Feature scaling: MinMax -> (rating - min) / (max - min)
# Done as vectorised Series arithmetic instead of a per-row .apply(lambda ...);
# the resulting values are the same, without a Python-level call per row.
# NOTE: uses min_rating / max_rating from the cells above, so those must hold the
# bounds of the unscaled column when this cell runs.
rating_df["rating"] = (rating_df["rating"] - min_rating) / (max_rating - min_rating)

In [37]:
rating_df.duplicated().sum()

np.int64(0)

In [40]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [46]:
# Map raw user ids to consecutive integer indices (0, 1, 2, ...), numbered in the
# order the ids are returned by unique().
user_ids = rating_df["user_id"].unique().tolist()   # distinct raw user ids, no repetition
user2user_decoded = dict(enumerate(user_ids))        # encoded index -> raw user id
user2user_encoded = {raw_id: idx for idx, raw_id in user2user_decoded.items()}   # raw user id -> encoded index
rating_df["user"] = rating_df["user_id"].map(user2user_encoded)   # encoded user index column


In [None]:
user2user_encoded     #These are the mappings of user id to user encoded id

{2: 0,
 6: 1,
 12: 2,
 16: 3,
 17: 4,
 19: 5,
 21: 6,
 41: 7,
 42: 8,
 44: 9,
 47: 10,
 53: 11,
 55: 12,
 60: 13,
 66: 14,
 73: 15,
 74: 16,
 85: 17,
 89: 18,
 90: 19,
 94: 20,
 98: 21,
 102: 22,
 108: 23,
 111: 24,
 112: 25,
 120: 26,
 121: 27,
 122: 28,
 135: 29,
 145: 30,
 146: 31,
 147: 32,
 153: 33,
 155: 34,
 156: 35,
 172: 36,
 174: 37,
 184: 38,
 190: 39,
 193: 40,
 194: 41,
 198: 42,
 204: 43,
 205: 44,
 209: 45,
 214: 46,
 219: 47,
 222: 48,
 227: 49,
 228: 50,
 235: 51,
 238: 52,
 240: 53,
 243: 54,
 248: 55,
 251: 56,
 252: 57,
 257: 58,
 264: 59,
 267: 60,
 272: 61,
 274: 62,
 275: 63,
 284: 64,
 285: 65,
 286: 66,
 290: 67,
 291: 68,
 293: 69,
 300: 70,
 301: 71,
 306: 72,
 308: 73,
 310: 74,
 313: 75,
 314: 76,
 316: 77,
 320: 78,
 321: 79,
 324: 80,
 325: 81,
 326: 82,
 327: 83,
 330: 84,
 336: 85,
 340: 86,
 345: 87,
 346: 88,
 349: 89,
 350: 90,
 366: 91,
 367: 92,
 371: 93,
 372: 94,
 375: 95,
 381: 96,
 382: 97,
 386: 98,
 389: 99,
 398: 100,
 405: 101,
 406: 102,
 

In [45]:
user2user_decoded

{0: 2,
 1: 6,
 2: 12,
 3: 16,
 4: 17,
 5: 19,
 6: 21,
 7: 41,
 8: 42,
 9: 44,
 10: 47,
 11: 53,
 12: 55,
 13: 60,
 14: 66,
 15: 73,
 16: 74,
 17: 85,
 18: 89,
 19: 90,
 20: 94,
 21: 98,
 22: 102,
 23: 108,
 24: 111,
 25: 112,
 26: 120,
 27: 121,
 28: 122,
 29: 135,
 30: 145,
 31: 146,
 32: 147,
 33: 153,
 34: 155,
 35: 156,
 36: 172,
 37: 174,
 38: 184,
 39: 190,
 40: 193,
 41: 194,
 42: 198,
 43: 204,
 44: 205,
 45: 209,
 46: 214,
 47: 219,
 48: 222,
 49: 227,
 50: 228,
 51: 235,
 52: 238,
 53: 240,
 54: 243,
 55: 248,
 56: 251,
 57: 252,
 58: 257,
 59: 264,
 60: 267,
 61: 272,
 62: 274,
 63: 275,
 64: 284,
 65: 285,
 66: 286,
 67: 290,
 68: 291,
 69: 293,
 70: 300,
 71: 301,
 72: 306,
 73: 308,
 74: 310,
 75: 313,
 76: 314,
 77: 316,
 78: 320,
 79: 321,
 80: 324,
 81: 325,
 82: 326,
 83: 327,
 84: 330,
 85: 336,
 86: 340,
 87: 345,
 88: 346,
 89: 349,
 90: 350,
 91: 366,
 92: 367,
 93: 371,
 94: 372,
 95: 375,
 96: 381,
 97: 382,
 98: 386,
 99: 389,
 100: 398,
 101: 405,
 102: 406,
 

In [47]:
# Number of distinct encoded users; RecommenderNet uses it as input_dim of the user embedding.
n_users = len(user2user_encoded)

In [48]:
n_users

4203

In [49]:
# Map raw anime ids to consecutive integer indices (0, 1, 2, ...), numbered in the
# order the ids are returned by unique().
anime_ids = rating_df["anime_id"].unique().tolist()   # distinct raw anime ids, no repetition
anime2anime_decoded = dict(enumerate(anime_ids))       # encoded index -> raw anime id
anime2anime_encoded = {raw_id: idx for idx, raw_id in anime2anime_decoded.items()}   # raw anime id -> encoded index
rating_df["anime"] = rating_df["anime_id"].map(anime2anime_encoded)   # encoded anime index column


In [68]:
# Number of distinct encoded anime; RecommenderNet uses it as input_dim of the anime embedding.
n_anime = len(anime2anime_encoded)

In [69]:
n_anime

17149

In [54]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
213,2,24833,0.0,0,0
214,2,235,1.0,0,1
215,2,36721,0.0,0,2
216,2,40956,0.0,0,3
217,2,31933,0.0,0,4


In [56]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then replace the index with a fresh 0..n-1 range.
rating_df = (
    rating_df
    .sample(frac=1, random_state=43)
    .reset_index(drop=True)
)


In [57]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,5045,35778,0.5,1226,9179
1,1731,12031,0.0,446,1892
2,12454,12231,0.8,3155,4744
3,4108,33581,0.0,1011,2078
4,3284,10156,0.9,809,1098


In [60]:
# Features: one row per rating holding the encoded (user, anime) index pair, as a NumPy array.
X = rating_df[["user", "anime"]].to_numpy()
# Target: the rating column (kept as a pandas Series).
y = rating_df["rating"]

In [61]:
# Hold out the last `test_size` rows for testing; `train_indices` is the row
# position where the training part ends and the test part begins.
test_size = 1000
train_indices = len(rating_df) - test_size

In [62]:
# Positional split: everything before `train_indices` is training data,
# everything from `train_indices` onward is test data.
X_train, X_test = X[:train_indices], X[train_indices:]
y_train, y_test = y[:train_indices], y[train_indices:]

In [63]:
len(X_train)

3245641

In [64]:
len(X_test)

1000

In [None]:
type(X_train)   # Check that the data is a NumPy array before feeding it to the model

numpy.ndarray

In [66]:
# Split the two-column index matrix into one array per column:
# element 0 is the "user" column, element 1 is the "anime" column.
X_train_array = [X_train[:, col] for col in (0, 1)]
X_test_array = [X_test[:, col] for col in (0, 1)]

#### MODEL ARCHITECTURE

In [None]:
def RecommenderNet():
    """Build and compile the user/anime embedding recommender model.

    The model takes two integer inputs (an encoded user index and an encoded
    anime index), looks each up in its own 128-dimensional embedding table,
    combines the two embeddings with a normalized dot product, and passes the
    result through Dense(1) -> BatchNormalization -> sigmoid.

    Reads the notebook-level globals ``n_users`` and ``n_anime`` to size the
    embedding tables.

    Returns:
        A compiled ``tensorflow.keras`` ``Model`` with inputs ``[user, anime]``
        and a single sigmoid output.
    """
    embedding_size = 128

    # User branch: one encoded user index -> embedding vector.
    user = Input(name="user", shape=[1])
    user_embedding = Embedding(
        name="user_embedding", input_dim=n_users, output_dim=embedding_size
    )(user)

    # Anime branch: one encoded anime index -> embedding vector.
    anime = Input(name="anime", shape=[1])
    anime_embedding = Embedding(
        name="anime_embedding", input_dim=n_anime, output_dim=embedding_size
    )(anime)

    # Dot product over the embedding axis; with normalize=True the embeddings are
    # L2-normalized first, so the result is a similarity score between user and anime.
    x = Dot(name="dot_product", normalize=True, axes=2)([user_embedding, anime_embedding])

    x = Flatten()(x)
    x = Dense(1, kernel_initializer="he_normal")(x)
    x = BatchNormalization()(x)
    x = Activation("sigmoid")(x)   # squash the output into (0, 1)

    model = Model(inputs=[user, anime], outputs=x)
    # Fixes: compile() needs an optimizer *instance* (or a string name), not the
    # Adam class itself, and the metric name was misspelled as "acuracy".
    # NOTE(review): if the targets are continuous scaled ratings, "accuracy" may
    # not be an informative metric - consider relying on mae/mse only.
    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(),
        metrics=["mae", "mse", "accuracy"],
    )
    return model

