In [1]:
from sklearn.preprocessing import LabelEncoder
import sklearn
import tensorflow as tf
from tensorflow import keras as k
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import json
import ast
import joblib

  from .autonotebook import tqdm as notebook_tqdm


Importing and organizing the raw data 

In [8]:
# import os
# import json
# data = list()
# for file in os.listdir("./data"):
##     if file != "results_20000-24861.json":
#     with open("./data/"+file, 'rb') as f:
#         print(f"Read {file}")
#         data_ = json.load(f)
#         data += data_
##     else:
##         continue

In [9]:
# records = [{"User": record.get('user_id'), 
#            "ISBN": record.get('book_title'),
#            "RatingOf5": record.get('stars'),
#            "Genres": record.get('genres')
#           } for record in data]
# df = pd.DataFrame(records)
# df.to_csv("USER_ISBN_RATING_GENRES.csv")

In [493]:
df = pd.read_csv("../../USER_ISBN_RATING_GENRES.csv")
df.drop(columns=["Unnamed: 0"], inplace = True)

In [494]:
print("Total numer of unique books: ", len(df["ISBN"].unique()))
print("Total numer of unique users: ", len(df["User"].unique()))
print("Total numer of reviews: ", len(df))
print("Total numer of positive reviews: ", len(df[df["RatingOf5"] > 3]))
print("Total numer of non-positive reviews: ", len(df[df["RatingOf5"] < 3]))
print("Total numer of moderate reviews: ", len(df[df["RatingOf5"] == 3]))
print("Total numer of  non-reviews: ", sum(df["RatingOf5"].isna()))

Total numer of unique books:  57198
Total numer of unique users:  706880
Total numer of reviews:  1733561
Total numer of positive reviews:  936796
Total numer of non-positive reviews:  201628
Total numer of moderate reviews:  372374
Total numer of  non-reviews:  222763


Although I'll use a TF stringlookup object to encode users, to keep users anonymous for the data exploration stage, I'll label encode them as IDs

In [495]:
le = LabelEncoder()
le.fit(df["User"])
transformed = le.transform(df["User"])
len(transformed), len(np.unique(transformed, return_counts=False))

(1733561, 706880)

In [496]:
joblib.dump(le, "ranker_fitted_label_encoder.pkl")

['ranker_fitted_label_encoder.pkl']

In [497]:
le = joblib.load("ranker_fitted_label_encoder.pkl")
transformed = le.transform(df["User"])

In [498]:
df["User"] = transformed
df["User"].head()

0    423114
1    241063
2    659623
3    557019
4     44930
Name: User, dtype: int32

In [499]:
df2 = df.copy()

**I'm going to reserve a random sample of 10 observations to test recommendations as observations that the model has not seen before when developing the production package.**

In [500]:
df.dropna(inplace=True)

In [501]:
sample = df.groupby("ISBN").count()[df.groupby("ISBN").count()["User"] > 5]["User"]. \
    sort_values(ascending=False).sample(20).index
loop = True
while True:
    new_sample=df[df["ISBN"].isin(sample)].sample(10)['ISBN']
    if len(new_sample.unique()) == 10:
        sample_to_keep = new_sample
        break

In [502]:
sample = df[df.index.isin(sample_to_keep.index)][['User', 'ISBN', 'RatingOf5']]
index_to_drop = sample.index
sample.to_csv('ranker_test.csv', index=False)
df.drop(index=index_to_drop, axis=0, inplace=True)

In [503]:
df[df.index.isin(index_to_drop)]

Unnamed: 0,User,ISBN,RatingOf5,Genres


In [504]:
pd.read_csv('ranker_test.csv')[['User', 'ISBN']]

Unnamed: 0,User,ISBN
0,320727,What Went Wrong at Enron: Everyone's Guide to ...
1,625088,Guerrilla Learning: How to Give Your Kids a Re...
2,618800,Eastern Sun Winter Moon: An Autobiographical O...
3,411288,Raspberry Crush
4,198921,Pooh: Just Be Nice AND NOT TOO ROUGH SPECIAL E...
5,258203,The Life and Adventures of Nicholas Nickleby V...
6,336651,Silver Flame
7,684707,Nothing That Meets the Eye: The Uncollected St...
8,597291,Das Tor ins Nichts Der Magier 2
9,404510,Berserk: Motiveless Random Massacres


In [62]:
df[["User", "ISBN", "RatingOf5", "Genres"]].to_csv('ranker_train_data.csv', index=False)

In [63]:
pd.read_csv('ranker_train_data.csv', nrows=5)[["User", "ISBN"]]

Unnamed: 0,User,ISBN
0,423114,Kiss Hollywood Goodbye
1,241063,Kiss Hollywood Goodbye
2,659623,Kiss Hollywood Goodbye
3,557019,Kiss Hollywood Goodbye
4,44930,Kiss Hollywood Goodbye


In [505]:
df.isna().sum()

User         0
ISBN         0
RatingOf5    0
Genres       0
dtype: int64

#### Reducing dimentionality

With ~ 700k users and 60k books, the embeddings for the two embedding "towers" that the recommender model is going to be based on will generately an extremely large model. Practically, I would like to demonstrate the end-to-end data science pipeline by being able to package my model and make it available publicly for my API to access as well as for those interested which is why reduce the recommendations to those users who given more than 2 review.

In [506]:
grouped_data = df.groupby("User").count()
grouped_data[grouped_data["ISBN"] > 2]['ISBN']

User
0         41
7          3
8          4
25         3
31         3
          ..
706863    12
706867     6
706870     6
706871     4
706879     4
Name: ISBN, Length: 115057, dtype: int64

In [507]:
idx_to_keep = grouped_data[grouped_data["ISBN"] > 2].index
set(idx_to_keep) == set(df[df["User"].isin(idx_to_keep)]["User"])

True

In [508]:
df = df[df.User.isin(idx_to_keep)]
df.sample(5)[['User', 'ISBN']]

Unnamed: 0,User,ISBN
234345,244283,Lovers Crossing Roscoe Brinker 1
129545,418961,Great Russian Short Stories
1267580,360211,Calendar: Humanity's Epic Struggle To Determin...
348615,39372,Moss Rose
601934,91590,The Lonely Ships: The Life and Death of the US...


At this point we have an index of user that have more than 2 reviews (of different books). Our new DataFrame object will only comprise those users and their respectively reviewed books

In [509]:
# also ensure that the grouping and querying was sane
print("# of unique users remaining: ", len(df.groupby('User').count()))

# also check out how many unique books were lost in the process of trimming 
# down the dataset (only about 500)

print("# of unique books remaining: ", len(df.groupby('ISBN').count()))

    

# of unique users remaining:  115057
# of unique books remaining:  54466


In [510]:
# we can label encode the book ID's if required, otherwise, a TF stringlookup object can handle 
# a variety of string encodings well - as will follow

le_isbn = LabelEncoder()
df["ISBN"] = le_isbn.fit_transform(df["ISBN"])

In [511]:
df.User = df.User.astype(np.int64)
df.ISBN = df.ISBN.astype(np.int64)
df.RatingOf5 = df.RatingOf5.astype(np.int64)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924369 entries, 0 to 1733560
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   User       924369 non-null  int64 
 1   ISBN       924369 non-null  int64 
 2   RatingOf5  924369 non-null  int64 
 3   Genres     924369 non-null  object
dtypes: int64(3), object(1)
memory usage: 35.3+ MB


Setting up tensors to batch for purposes of training the embeddings 

In [512]:
books = tf.data.Dataset.from_tensor_slices(df['ISBN'].astype('str').values)
ratings = tf.data.Dataset.from_tensor_slices(df[['User', 'ISBN', 'RatingOf5']].astype('str').values)

In [513]:
train_size = int(len(df)*0.8)
train = tf.data.Dataset.from_tensor_slices(
    df[["ISBN", "User", "RatingOf5"]][:train_size].astype('str').values).shuffle(100000)
test = tf.data.Dataset.from_tensor_slices(
    df[["ISBN", "User", "RatingOf5"]][train_size:].astype('str').values).shuffle(10000)

In [514]:
for i in test.take(1):
    print(i)
    break

tf.Tensor([b'33888' b'75806' b'3'], shape=(3,), dtype=string)


String look up creates a vocabulary dictionary of string-value pairs since I'll be embedding my ISBN's and users ID's

In [82]:
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda x: x[0]))

book_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
book_titles_vocabulary.adapt(ratings.map(lambda x: x[1]))

In [83]:
np.save("ranker_user_ids_vocabulary", user_ids_vocabulary.get_weights())
np.save("ranker_book_titles_vocabulary", book_titles_vocabulary.get_weights())

In [516]:
new_user_id_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_user_id_stringlookup.set_weights(np.load("ranker_user_ids_vocabulary.npy", allow_pickle=True))

new_book_title_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_book_title_stringlookup.set_weights(np.load("ranker_book_titles_vocabulary.npy", allow_pickle=True))

In [594]:
# compare vocab against original data as a sanity check
print("Unique User ID counts: ", len(df["User"].unique()), new_user_id_stringlookup.vocabulary_size())
print("Unique ISBN counts: ", len(df["ISBN"].unique()), new_book_title_stringlookup.vocabulary_size())

Unique User ID counts:  115057 115059
Unique ISBN counts:  54466 54467


The difference in "vocab" counts results from the [UNK] (i.e. unknown) token. There is also a duplicate User in the string lookup for some reason that can be removed if required, otherwise, it shouldn't affect the lookup.

Several user_ids have a string lookup of 0 which stands for unknown meaning they were likely dropped during the attemp at dimentionality reduction (i.e. these users had only ever reviewed one/two books)

In [518]:
idx = [0, 3, 8]

# # this is a bit of  confusing one-liner that simply attempts to search for the indices of users that 
# # had the string lookup assign them a value of 0 and thus are the suspicions confirmed that
# # they've been dropped during the dimentionality reduction step since they're users with less than 
# # 2 total reviews.

assert len(df[df.index.isin(sample[sample['User'].reset_index().index.isin(idx)].index)]['User'].unique()) > 0

AssertionError: 



I'll be doing tests for label encoders and string lookups on the books columns and not the users columns before deployment 
since their info is confidential, otherwise, for testing purposes, the sample taken isn't valid and would fail the tests

#### Modelling

The following class enherits from the TF Model class and sets up the embedding layers for users and books that will fed into a DNN with a resultant rank. The goal is to reduce the MSE between this rank and actual user rating for a book.

In [634]:
class RankingModel(tf.keras.Model):

    def __init__(self, book_title_stringlookup, user_id_stringlookup):
        super().__init__()
        embedding_dimension = 32

        # Compute embeddings for users.
        self.user_embeddings = tf.keras.Sequential([user_id_stringlookup,
          tf.keras.layers.Embedding(user_id_stringlookup.vocabulary_size()+1, embedding_dimension)
        ])

        # Compute embeddings for books.
        self.book_embeddings = tf.keras.Sequential([book_title_stringlookup,
          tf.keras.layers.Embedding(book_title_stringlookup.vocabulary_size()+1, embedding_dimension)
        ])
        
        # Compute predictions.
        self.ratings = tf.keras.Sequential([
          tf.keras.layers.Dense(32, activation="relu"),
          tf.keras.layers.Dense(16, activation="relu"),
          tf.keras.layers.Dense(8, activation="relu"),
          tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):

        user_id, book_title = inputs

        user_embedding = self.user_embeddings(user_id)
        book_embedding = self.book_embeddings(book_title)

        return self.ratings(tf.concat([user_embedding, book_embedding], axis=1))

In [643]:
class BookRecommenderModel(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel(new_book_title_stringlookup, new_user_id_stringlookup)
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(),
          metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features: dict) -> tf.Tensor:
        return self.ranking_model(
            (features[:, 1], features[:, 0]))

    def compute_loss(self, features: dict, training=False) -> tf.Tensor:

        labels =  tf.strings.to_number(features[:,2])

        rating_predictions = self(features[:, :2])

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

In [644]:
model = BookRecommenderModel()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))

In [645]:
for i in train.take(1):
    print('book {}, user_id {}, rating {}'.format(i[0], i[1], i[2]))
    break

book b'14361', user_id b'704826', rating b'5'


In [646]:
for i in test.take(1):
    print('book {}, user_id {}, rating {}'.format(i[0], i[1], i[2]))
    break

book b'2044', user_id b'609952', rating b'5'


In [647]:
cached_train = train.batch(8000).cache()
cached_test = test.batch(4000).cache()

In [648]:
for i in cached_train.take(1):
    print(i[0])
    break
    

tf.Tensor([b'42803' b'545379' b'1'], shape=(3,), dtype=string)


In [649]:
for i in cached_train.take(1):
    print(i[:,0])

tf.Tensor([b'20988' b'46890' b'35762' ... b'24217' b'7349' b'37794'], shape=(8000,), dtype=string)


In [650]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1db22930580>

In [651]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 1.005478024482727,
 'loss': 0.5586927533149719,
 'regularization_loss': 0,
 'total_loss': 0.5586927533149719}

In [595]:
user = int(df2.User.sample(1))
samples = df.sample(20)[['ISBN']].values
samples = np.concatenate((np.array([[user]*20]), samples.reshape(1,-1)), axis=0).T.astype(np.str_)
# samples

In [695]:
for i,j in zip(samples[:, 1], # samples ISBNs
               model(tf.convert_to_tensor(samples, dtype=tf.string)) # calling the model in eval mode on sample ISBNs 
              ): 
    print( "Book title: {} | Rating: {:.3f}".format(
        le_isbn.inverse_transform(
            np.array(new_book_title_stringlookup.call(tf.constant(i))).reshape(-1))[0], # retrieving book title using stringlook up
        j[0])) # rating

Book title: Gathering Storm: America's Militia Threat | Rating: 2.816
Book title: A Very Young Dancer | Rating: 2.816
Book title: Cat Fear Street 45 | Rating: 3.170
Book title: The Man Who Stole the Mona Lisa | Rating: 2.816
Book title: Girl in Hyacinth Blue | Rating: 2.816
Book title: Isaac Asimov's Werewolves | Rating: 3.961
Book title: Cliges | Rating: 2.816
Book title: How To Control Your Anxiety Before It Controls You | Rating: 2.816
Book title: The Bookshop | Rating: 2.816
Book title: Bones of the Earth | Rating: 2.816
Book title: Der Lavendelgarten | Rating: 2.816
Book title: Starman Jones | Rating: 2.816
Book title: Baby wann heiratest du mich Ein Roman aus dem Beziehungsdschungel | Rating: 2.816
Book title: The Seventh Plague Sigma Force 12 | Rating: 2.816
Book title: Back to the Stone Age Pellucidar 5 | Rating: 2.519
Book title: Children of the Ruins | Rating: 2.816
Book title: Delaney's Desert Sheikh The Westmorelands 1 | Rating: 2.816
Book title: Crime School Kathleen Mallo

Given the relative sparsity of data, most books will have similar scores, with a fewer number having unique scores. To tackle this problem one can create richer ranking models using richer features, such as the user's favourite genres and book genres.

In [106]:
# first genre in the list indicates the dominant genre of the book
# for example: ISBN 46246 is generally mystery, then fiction, then more specifically, a spy thriller
df.groupby('ISBN', as_index=False).first()[['ISBN', 'Genres']].sample(5)

Unnamed: 0,ISBN,Genres
46330,46330,"['Fiction', 'Poetry', 'Short Stories', 'Sports..."
30068,30068,"['Science Fiction', 'Fiction', 'Science Fictio..."
49574,49574,"['Nonfiction', 'Self Help', 'Psychology', 'Neu..."
36904,36904,"['Fiction', 'Novels']"
20708,20708,"['Fantasy', 'Science Fiction', 'Fiction', 'Sci..."


In [107]:
df2 = df.copy()

In [109]:
import ast
from collections import defaultdict, Counter

user_genres_mapping = defaultdict()
for user in df2.User.unique():
    records = []
    for record in df[df2.User == user].Genres:
        records += ast.literal_eval(record)
    for val, col_name in zip(Counter(records).most_common(4), ['User_First_Cat', 
                                                               'User_Second_Cat',
                                                               'User_Third_Cat',
                                                               'User_Fourth_Cat']):
        for element in df2[df2.User==user].index:
            df2.at[element,col_name] = val[0]

In [114]:
df2.sample(5).iloc[:, 4:8]

Unnamed: 0,User_First_Cat,User_Second_Cat,User_Third_Cat,User_Fourth_Cat
1405400,Sequential Art,Comics,Graphic Novels,Graphic Novels Comics
1370968,Historical,Fantasy,Historical Fiction,Fiction
1038182,Literature,Historical,Fiction,Classics
1226401,Mystery,Historical,Fiction,Historical Fiction
685159,Romance,Mystery,Womens Fiction,Fiction


In [111]:
df2.to_csv('TrainDataWithUserGenres.csv', index=False)

In [112]:
for book in df2.ISBN.unique():
    genres = ast.literal_eval([record for record in df2[df2.ISBN==book]['Genres'][:1]][0])
    for genre, col_name in zip(genres[:4], ['Book_First_Cat', 
                               'Book_Second_Cat',
                               'Book_Third_Cat',
                               'Book_Fourth_Cat']):
        if genre:
            for element in df2[df2.ISBN==book].index:
                df2.at[element,col_name] = genre

df2.fillna('None', inplace=True)

In [115]:
df2.sample(5).iloc[:, 8:]

Unnamed: 0,Book_First_Cat,Book_Second_Cat,Book_Third_Cat,Book_Fourth_Cat
1481061,Mystery,Fiction,Short Stories,Mystery
1195313,Fantasy,Young Adult,Mystery,Fiction
26353,Fiction,Classics,Cultural,France
222866,Romance,Romance,Historical Romance,Historical
248337,Childrens,Childrens,Picture Books,Animals


In [116]:
df2.drop(columns='Genres', inplace=True)
df2.columns

Index(['User', 'ISBN', 'RatingOf5', 'User_First_Cat', 'User_Second_Cat',
       'User_Third_Cat', 'User_Fourth_Cat', 'Book_First_Cat',
       'Book_Second_Cat', 'Book_Third_Cat', 'Book_Fourth_Cat'],
      dtype='object')

In [117]:
user_ratings = df2.pop('RatingOf5')
df2.insert(0, 'RatingOf5', user_ratings)

In [118]:
df2.columns

Index(['RatingOf5', 'User', 'ISBN', 'User_First_Cat', 'User_Second_Cat',
       'User_Third_Cat', 'User_Fourth_Cat', 'Book_First_Cat',
       'Book_Second_Cat', 'Book_Third_Cat', 'Book_Fourth_Cat'],
      dtype='object')

In [129]:
df2.to_csv('Final_Ranker_Data.csv', index=False)

In [571]:
df2 = pd.read_csv('Final_Ranker_Data.csv')

In [572]:
ratings2 = tf.data.Dataset.from_tensor_slices(df2.astype('str').values)

In [573]:
train_size = int(len(df2)*0.8)
train2 = tf.data.Dataset.from_tensor_slices(
    df2[df2.columns][:train_size].astype('str').values).shuffle(100000)

test2 = tf.data.Dataset.from_tensor_slices(
    df2[df2.columns][train_size:].astype('str').values).shuffle(10000)

In [574]:
for i in train2.take(1):
    print(i[1:]) # features, excluding label
    break

tf.Tensor(
[b'153225' b'31111' b'Fiction' b'Gothic' b'Short Stories' b'Mystery'
 b'Gothic' b'Fiction' b'None' b'None'], shape=(10,), dtype=string)


In [575]:
unique_genres = []
for i in df2.columns[3:]:
    unique_genres += list(df2[f'{i}'].unique() )
unique_genres = np.unique(unique_genres.copy())

In [124]:
genre_lookup = tf.keras.layers.StringLookup(mask_token=None)
genre_lookup.adapt(unique_genres)s

In [125]:
genre_lookup.vocabulary_size()

732

In [126]:
for column in df2.columns[3:]:
    for i in df2.sample(1000)[column]: # samples 1000 rows from each columns to ensure the stringlookup isn't missing 
                                       # any categories
        genre_lookup.call(tf.constant(i))
#         print(i, genre_lookup.call(tf.constant(i)))

In [127]:
np.save("ranker_genre_lookup", genre_lookup.get_weights())

In [576]:
new_user_id_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_user_id_stringlookup.set_weights(np.load("ranker_user_ids_vocabulary.npy", allow_pickle=True))

new_book_title_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_book_title_stringlookup.set_weights(np.load("ranker_book_titles_vocabulary.npy", allow_pickle=True))

genre_lookup = tf.keras.layers.StringLookup(mask_token=None)
genre_lookup.set_weights(np.load("ranker_genre_lookup.npy", allow_pickle=True))

In [577]:
# compare vocab against original data as a sanity check; the incremental value contains the 'UNK' token
print("Unique User ID counts: ", len(df2["User"].unique()), new_user_id_stringlookup.vocabulary_size())
print("Unique ISBN counts: ", len(df2["ISBN"].unique()), new_book_title_stringlookup.vocabulary_size())
print("Unique genre counts: ", len(unique_genres), genre_lookup.vocabulary_size())

Unique User ID counts:  115058 115059
Unique ISBN counts:  54466 54467
Unique genre counts:  731 732


In [578]:
class RankingModel_(tf.keras.Model):

    def __init__(self, book_title_stringlookup, user_id_stringlookup, genre_lookup):
        super().__init__()
        embedding_dimension = 32
        genre_emb_dim = 4

        # Compute embeddings for users.
        self.user_embeddings = tf.keras.Sequential([user_id_stringlookup,
          tf.keras.layers.Embedding(user_id_stringlookup.vocabulary_size()+1, embedding_dimension)
        ])

        # Compute embeddings for books.
        self.book_embeddings = tf.keras.Sequential([book_title_stringlookup,
          tf.keras.layers.Embedding(book_title_stringlookup.vocabulary_size()+1, embedding_dimension)
        ])
        
        self.user_genre_1_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.user_genre_2_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.user_genre_3_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.user_genre_4_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.book_genre_1_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.book_genre_2_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.book_genre_3_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        self.book_genre_4_embeddings = tf.keras.Sequential([genre_lookup,
          tf.keras.layers.Embedding(genre_lookup.vocabulary_size()+1, genre_emb_dim)
        ])
        
        
        # Compute predictions.
        self.ratings = tf.keras.Sequential([
          tf.keras.layers.Dense(32, activation="relu"),
          tf.keras.layers.Dense(16, activation="relu"),
          tf.keras.layers.Dense(8, activation="relu"),
          tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):

        user_id, book_title, user_genre_cat_1, user_genre_cat_2, user_genre_cat_3, user_genre_cat_4, \
        book_genre_cat_1, book_genre_cat_2, book_genre_cat_3, book_genre_cat_4 = inputs

        user_embedding = self.user_embeddings(user_id)
        book_embedding = self.book_embeddings(book_title)
        user_genre_1_embeddings = self.user_genre_1_embeddings(user_genre_cat_1)
        user_genre_2_embeddings = self.user_genre_2_embeddings(user_genre_cat_2)
        user_genre_3_embeddings = self.user_genre_3_embeddings(user_genre_cat_3)
        user_genre_4_embeddings = self.user_genre_4_embeddings(user_genre_cat_4)
        book_genre_1_embeddings = self.book_genre_1_embeddings(book_genre_cat_1)
        book_genre_2_embeddings = self.book_genre_2_embeddings(book_genre_cat_2)
        book_genre_3_embeddings = self.book_genre_3_embeddings(book_genre_cat_3)
        book_genre_4_embeddings = self.book_genre_4_embeddings(book_genre_cat_4)

        return self.ratings(tf.concat([user_embedding, book_embedding, user_genre_1_embeddings, 
                                       user_genre_2_embeddings, user_genre_3_embeddings, user_genre_4_embeddings,
                                      book_genre_1_embeddings, book_genre_2_embeddings, 
                                      book_genre_3_embeddings, book_genre_4_embeddings], axis=1))

In [579]:
class BookRecommenderModel_(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel_(new_book_title_stringlookup, 
                                                          new_user_id_stringlookup, 
                                                          genre_lookup)
        
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(),
          metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features) -> tf.Tensor:       
        temp = tuple()
        for i in range(0,10):        
            temp += (features[:, i],)
        return self.ranking_model(temp)

    def compute_loss(self, features, training=False) -> tf.Tensor:

        labels =  tf.strings.to_number(features[:,0])
        rating_predictions = self(features[:, 1:])

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

In [580]:
model2 = BookRecommenderModel_()
model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))

In [581]:
cached_train2 = train2.batch(8000).cache()
cached_test2 = test2.batch(4000).cache()

In [582]:
model2.fit(cached_train2, epochs=2, verbose=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1dae12074f0>

In [584]:
model2.evaluate(cached_test2)



[0.9679808616638184, 0.5815986394882202, 0, 0.5815986394882202]

In [585]:
df2.columns

Index(['RatingOf5', 'User', 'ISBN', 'User_First_Cat', 'User_Second_Cat',
       'User_Third_Cat', 'User_Fourth_Cat', 'Book_First_Cat',
       'Book_Second_Cat', 'Book_Third_Cat', 'Book_Fourth_Cat'],
      dtype='object')

In [683]:
# user = int(df2.User.sample(1))
print(f"User: {user}")

genres = df2[df2.User==user].iloc[0, 3:7].to_list()
test_sample = {'RatingOf5': None,
               'User': user,
               'ISBN': None}

for i, genre in enumerate(genres):
    test_sample[df2.columns[i+3]] = genre
for i in range(4):
    test_sample[df2.columns[i+7]] = None
    
test_sample = pd.DataFrame(data=test_sample, index=range(20))

book_samples = df2.sample(20)    
for i in range(11):
    curr_col = df2.columns[i]
    if i == 2 or i > 6:
        test_sample[curr_col] = book_samples[curr_col].values

User: 358987


In [690]:
rankings = list()
for i,j in zip(test_sample.values[:, 2], # samples ISBNs
               model2(tf.convert_to_tensor(test_sample.values.astype(np.str_), dtype=tf.string)) # calling the model in eval mode on sample ISBNs 
              ):
    book_title = le_isbn.inverse_transform(
            np.array(new_book_title_stringlookup.call(tf.constant(str(i)))).reshape(-1))[0]
    rankings.append((j[0].numpy(), # book title
                    book_title) # predicted book ranking
                   )

for i in sorted(rankings)[::-1]:
    print(i[1], i[0])
    

A Cafe on the Nile 3.8767078
Chinhominey's Secret 3.6498048
365 Meditations for Teachers 3.6498046
A Little Zit On The Side 3.647118
A Stranger in the Mirror 3.610251
2024: A Graphic Novel 3.5916674
Forensic Science: An Introduction to Criminalistics 3.581554
A Tour of the Calculus 3.581554
Hollywood Wives  The New Generation Hollywood Series 4 3.5593615
The Happiest Baby on the Block: The New Way to Calm Crying and Help Your Newborn Baby Sleep Longer 3.5385542
Love and Friendship 3.5350723
A Bitter Peace 3.520727
My Indecision Is Final: The Spectacular Rise and Fall of Goldcrest Films the Independent Studio That Challenged Hollywood 3.5139375
The Absent City 3.5108807
Doom of the Darksword The Darksword Trilogy 2 3.4930618
Clockers 3.457056
The End of Tragedy 3.4450378
The Enchanted Wood The Faraway Tree 1 3.4417443
Sextopia 3.40513
Noninterference 3.3981323


In [691]:
test_sample.head(2)

Unnamed: 0,RatingOf5,User,ISBN,User_First_Cat,User_Second_Cat,User_Third_Cat,User_Fourth_Cat,Book_First_Cat,Book_Second_Cat,Book_Third_Cat,Book_Fourth_Cat
0,,358987,43405,Historical,Fiction,Christian,Nonfiction,Historical,Historical Fiction,Classics,Childrens
1,,358987,14061,Historical,Fiction,Christian,Nonfiction,Horror,Classics,Mystery,Fiction


In [692]:
test_sample.RatingOf5 = '0' # reassining RatingOf5 to 0 since the initial model takes in string literals

In [693]:
ratings = test_sample.pop('RatingOf5')
books = test_sample.pop('ISBN')
test_sample.insert(1, 'ISBN', books)
test_sample.insert(2, 'RatingOf5', ratings)
print(test_sample.head(0))
test_sample = test_sample.values.astype(np.str_)

Empty DataFrame
Columns: [User, ISBN, RatingOf5, User_First_Cat, User_Second_Cat, User_Third_Cat, User_Fourth_Cat, Book_First_Cat, Book_Second_Cat, Book_Third_Cat, Book_Fourth_Cat]
Index: []


In [694]:
# Comparing the original model with the same sample and user

for i,j in zip(test_sample[:, 1], # samples ISBNs
               model(tf.convert_to_tensor(test_sample[:, :3], dtype=tf.string)) # calling the model in eval mode on sample ISBNs 
              ): 
    print( "Book title: {} | Rating: {:.3f}".format(
        le_isbn.inverse_transform(
            np.array(new_book_title_stringlookup.call(tf.constant(i))).reshape(-1))[0], # retrieving book title using stringlook up
        j[0])) # rating

Book title: A Little Zit On The Side | Rating: 2.816
Book title: The Happiest Baby on the Block: The New Way to Calm Crying and Help Your Newborn Baby Sleep Longer | Rating: 2.909
Book title: Doom of the Darksword The Darksword Trilogy 2 | Rating: 2.816
Book title: A Stranger in the Mirror | Rating: 2.816
Book title: The End of Tragedy | Rating: 3.638
Book title: Chinhominey's Secret | Rating: 2.816
Book title: 2024: A Graphic Novel | Rating: 2.816
Book title: A Cafe on the Nile | Rating: 2.816
Book title: Clockers | Rating: 2.816
Book title: My Indecision Is Final: The Spectacular Rise and Fall of Goldcrest Films the Independent Studio That Challenged Hollywood | Rating: 2.816
Book title: Noninterference | Rating: 2.816
Book title: The Enchanted Wood The Faraway Tree 1 | Rating: 2.816
Book title: A Bitter Peace | Rating: 2.816
Book title: Love and Friendship | Rating: 2.715
Book title: The Absent City | Rating: 2.816
Book title: Hollywood Wives  The New Generation Hollywood Series 4 |

Egineering extra features did provide a richer representation of users and books and thus rankings for the target user with exactly the number of distinct ranks as te number of book samples. The original model still fails to provide a unique rank for the majority of observations except 4-5 unique scores.