In [1]:
from sklearn.preprocessing import LabelEncoder
import sklearn
import tensorflow as tf
from tensorflow import keras as k
import tensorflow_recommenders as tfrs
import pandas as pd
import numpy as np
import json
import ast
import joblib

Importing and organizing the raw data 

In [8]:
# import os
# import json
# data = list()
# for file in os.listdir("./data"):
##     if file != "results_20000-24861.json":
#     with open("./data/"+file, 'rb') as f:
#         print(f"Read {file}")
#         data_ = json.load(f)
#         data += data_
##     else:
##         continue

In [9]:
# records = [{"User": record.get('user_id'), 
#            "ISBN": record.get('book_title'),
#            "RatingOf5": record.get('stars'),
#            "Genres": record.get('genres')
#           } for record in data]
# df = pd.DataFrame(records)
# df.to_csv("USER_ISBN_RATING_GENRES.csv")

In [53]:
df = pd.read_csv("USER_ISBN_RATING_GENRES.csv")
df.drop(columns=["Unnamed: 0"], inplace = True)

In [54]:
print("Total numer of unique books: ", len(df["ISBN"].unique()))
print("Total numer of unique users: ", len(df["User"].unique()))
print("Total numer of reviews: ", len(df))
print("Total numer of positive reviews: ", len(df[df["RatingOf5"] > 3]))
print("Total numer of non-positive reviews: ", len(df[df["RatingOf5"] < 3]))
print("Total numer of moderate reviews: ", len(df[df["RatingOf5"] == 3]))
print("Total numer of  non-reviews: ", sum(df["RatingOf5"].isna()))

Total numer of unique books:  57198
Total numer of unique users:  706880
Total numer of reviews:  1733561
Total numer of positive reviews:  936796
Total numer of non-positive reviews:  201628
Total numer of moderate reviews:  372374
Total numer of  non-reviews:  222763


To keep users anonymous, I'll label encode them as IDs

In [55]:
le = LabelEncoder()
le.fit(df["User"])
transformed = le.transform(df["User"])
len(transformed), len(np.unique(transformed, return_counts=False))

(1733561, 706880)

In [56]:
joblib.dump(le, "fitted_label_encoder.pkl")

['fitted_label_encoder.pkl']

In [57]:
le = joblib.load("fitted_label_encoder.pkl")
transformed = le.transform(df["User"])

In [58]:
df["User"] = transformed
df["User"].head()

0    423114
1    241063
2    659623
3    557019
4     44930
Name: User, dtype: int32

In [59]:
df2 = df.copy()

**I'm going to reserve a random sample of 10 observations to test recommendations as observations that the model has not seen before when developing the production package.**

In [60]:
sample = df.groupby("ISBN").count()[df.groupby("ISBN").count()["User"] > 5]["User"].sort_values(ascending=False).sample(20).index
loop = True
while True:
    new_sample=df[df["ISBN"].isin(sample)].sample(10)['ISBN']
    if len(new_sample.unique()) == 10:
        sample_to_keep = new_sample
        break

In [61]:
sample = df[df.index.isin(sample_to_keep.index)][['User', 'ISBN']]
index_to_drop = sample.index
sample.to_csv('test.csv', index=False)
df.drop(index=index_to_drop, axis=0, inplace=True)

In [62]:
df[df.index.isin(index_to_drop)]

Unnamed: 0,User,ISBN,RatingOf5,Genres


In [63]:
pd.read_csv('test.csv')

Unnamed: 0,User,ISBN
0,505923,Veggie Lovers Cook Book
1,458816,Candyland: A Novel in Two Parts
2,188134,Urban Horrors
3,678813,Trieste and The Meaning of Nowhere
4,237343,The Silver Stallion: A Comedy of Redemption
5,627567,In meinem Himmel
6,97299,Kipper's Game
7,30625,Amy Girl
8,312942,Mom Among the Liars Mom 4
9,408986,Jamberry


In [64]:
df[["User", "ISBN"]].to_csv('train_data.csv', index=False)

In [65]:
pd.read_csv('train_data.csv', nrows=5)

Unnamed: 0,User,ISBN
0,423114,Kiss Hollywood Goodbye
1,241063,Kiss Hollywood Goodbye
2,659623,Kiss Hollywood Goodbye
3,557019,Kiss Hollywood Goodbye
4,44930,Kiss Hollywood Goodbye


#### Reducing dimentionality

With ~ 700k users and 60k books, the embeddings for the two embedding "towers" that the recommender model is going to be based on will generately an extremely large model. Practically, I would like to demonstrate the end-to-end data science pipeline by being able to package my model and make it available publicly for my API to access as well as for those interested which is why reduce the recommendations to those users who given more than 2 review.

In [68]:
grouped_data = df.groupby("User").count()
grouped_data[grouped_data["ISBN"] > 2]['ISBN']

User
0         42
7          3
8          4
25         3
31         3
          ..
706863    12
706867     6
706870     6
706871     4
706879     4
Name: ISBN, Length: 130888, dtype: int64

In [69]:
len(df)

1733551

In [70]:
idx_to_keep = grouped_data[grouped_data["ISBN"] > 2].index
set(idx_to_keep) == set(df[df["User"].isin(idx_to_keep)]["User"])

True

At this point we have an index of user that have more than 2 reviews (of different books). Our new DataFrame object will only comprise those users and their respectively reviewed books

In [72]:
df = df[df.User.isin(idx_to_keep)]
df.sample(5)[['User', 'ISBN']]

Unnamed: 0,User,ISBN
732896,214798,The Ooze Ghosts of Fear Street 8
134638,184939,The Search Cooper's Corner 13
1701311,102597,Crossing Antarctica
1469864,124818,Rules of the Hunt
1716250,334105,Mrs Dalloway


In [73]:
# also ensure that the grouping and querying was sane
print("# of unique users remaining: ", len(df.groupby('User').count()))

# also check out how many unique books were lost in the process of trimming 
# down the dataset (only about 500)

print("# of unique books remaining: ", len(df.groupby('ISBN').count()))

    

# of unique users remaining:  130888
# of unique books remaining:  56669


In [74]:
# we can label encode the book ID's if we so wish
le_isbn = LabelEncoder()
df["ISBN"] = le_isbn.fit_transform(df["ISBN"])

Setting up tensors to batch for purposes of training the embeddings 

In [75]:
books = tf.data.Dataset.from_tensor_slices(
    df["ISBN"].astype('str').values).shuffle(128) # TODO: config (shuffle seed + size)

ratings = tf.data.Dataset.from_tensor_slices(
    df[["ISBN", "User"]].astype('str').values).shuffle(128) # TODO: config (shuffle seed + size)

In [76]:
for batch in ratings.batch(batch_size=10).take(1):
    break

# [[Book title, User ID],]
[[batch[i].numpy()[j].decode() for j in range(2)] for i in range(10)]

[['25879', '339182'],
 ['51897', '638932'],
 ['22378', '569243'],
 ['50409', '445633'],
 ['25879', '487228'],
 ['22378', '423114'],
 ['14058', '197397'],
 ['22378', '16953'],
 ['32867', '111877'],
 ['46288', '667545']]

String look up creates a vocabulary dictionary of string-value pairs since I'll be embedding my ISBN's and users ID's

In [87]:
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda x: x[1]))

book_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
book_titles_vocabulary.adapt(books)

In [88]:
np.save("user_ids_vocabulary", user_ids_vocabulary.get_weights())
np.save("book_titles_vocabulary", book_titles_vocabulary.get_weights())

In [94]:
new_book_title_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_book_title_stringlookup.set_weights(np.load("book_titles_vocabulary.npy", allow_pickle=True))
new_user_id_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_user_id_stringlookup.set_weights(np.load("user_ids_vocabulary.npy", allow_pickle=True))

In [95]:
# compare vocab against original data as a sanity check
print("Unique User ID counts: ", len(df["User"].unique()), new_user_id_stringlookup.vocabulary_size())
print("Unique ISBN counts: ", len(df["ISBN"].unique()), new_book_title_stringlookup.vocabulary_size())

Unique User ID counts:  130888 130889
Unique ISBN counts:  56669 56670


The difference in "vocab" counts results from the [UNK] (i.e. unknown) token. 

Testing stringlookup on the saved sample:

In [96]:
len(new_book_title_stringlookup.get_vocabulary())

56670

In [99]:
new_book_title_stringlookup.call(tf.constant(sample['ISBN'].astype(str))), sample['ISBN']

(<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)>,
 59625                          Veggie Lovers Cook Book
 292819                 Candyland: A Novel in Two Parts
 356947                                   Urban Horrors
 481544              Trieste and The Meaning of Nowhere
 695979     The Silver Stallion: A Comedy of Redemption
 805528                                In meinem Himmel
 1152083                                  Kipper's Game
 1340451                                       Amy Girl
 1465684                      Mom Among the Liars Mom 4
 1527060                                       Jamberry
 Name: ISBN, dtype: object)

In [100]:
new_user_id_stringlookup.call(tf.constant(sample['User'].astype(str))), sample['User']

(<tf.Tensor: shape=(10,), dtype=int64, numpy=
 array([ 99278, 102845,    181,  45465,  28074,   2431,   6463, 114522,
           231,  19681], dtype=int64)>,
 59625      505923
 292819     458816
 356947     188134
 481544     678813
 695979     237343
 805528     627567
 1152083     97299
 1340451     30625
 1465684    312942
 1527060    408986
 Name: User, dtype: int32)

Several user_ids have a string lookup of 0 which stands for unknown meaning they were likely dropped during the attemp at dimentionality reduction (i.e. these users had only ever reviewed one/two book)

In [101]:
idx = [0, 3, 8]

# this is a bit of  confusing one-liner that simply attempts to search for the indices of users that 
# had the string lookup assign them a value of 0 and thus are the suspicions confirmed that
# they've been dropped during the dimentionality reduction step since they're users with less than 
# 2 total reviews.

assert len(df[df.index.isin(sample[sample['User'].reset_index().index.isin(idx)].index)]['User'].unique()) > 0

AssertionError: 



I'll be doing tests for label encoders and string lookups on the books columns and not the users columns before deployment 
since their info is confidential, otherwise, for testing purposes, the sample taken isn't valid and would fail the tests

#### Modelling

The following class enherits from the TensorFlow Recommenders class and sets up the embedding models for users and books that will be fit to reduce loss. This is also referred to as the two tower model since the embeddings are very long along one dimension (that of the users/books) and shorter on the other (the features dimension)

In [102]:
BATCH_SIZE = 1024*8
EPOCHS = 3

In [103]:
class TwoTowerModel(tfrs.Model):
    def __init__(
      self,
      user_model: tf.keras.Model,
      book_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
        super(TwoTowerModel, self).__init__()
    
        # embedding representations
        self.user_model = user_model
        self.book_model = book_model
        # this will receive the computed losses by using the retrieval class to calculate the FactorizedTopK
        self.task = task 
    
    def compute_loss(self, features, training=False) -> tf.Tensor:
        # Define how the loss is computed.

        user_embeddings = self.user_model(features[1])
        book_embeddings = self.book_model(features[0])

        return self.task(user_embeddings, book_embeddings, compute_metrics=False)

In [104]:
user_model = tf.keras.Sequential([
    new_user_id_stringlookup,
    tf.keras.layers.Embedding(new_user_id_stringlookup.vocabulary_size()+1 , 32)
])

book_model = tf.keras.Sequential([
    new_book_title_stringlookup,
    tf.keras.layers.Embedding(new_book_title_stringlookup.vocabulary_size()+1, 32)
])

task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    books.batch(256).map(book_model)
  )
)

In [105]:
tt_model = TwoTowerModel(user_model, book_model, task)
tt_model.compile(optimizer=tf.keras.optimizers.Adam(0.1))

In [106]:
tt_model.fit(ratings.batch(BATCH_SIZE), epochs=EPOCHS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x21c91580a30>

The brute force approach goes through every index to get the top K recommendation. This is considered tedious and the alternative scalable nearest neighbours approach provides a viable alternative since the latter utilizes compressed vectors. 

In [107]:
indexer = tfrs.layers.factorized_top_k.BruteForce(tt_model.user_model, k = 100)
indexer.index_from_dataset(
    books.batch(64).map(lambda title: (title, tt_model.book_model(title))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x21c91583610>

Let's get some recommendations.

In [108]:
print(idx) # users that the string lookupand hence model doesn't know and should give then random recommendations
print(sample['User'].values)

[0, 3, 8]
[505923 458816 188134 678813 237343 627567  97299  30625 312942 408986]


In [109]:
%timeit _, titles = indexer(tf.constant(["703730"]))
_, titles = indexer(tf.constant(["703730"]))
set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))])

9.72 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


{'18684', '28325', '32678', '44154', '5252'}

In [110]:
_, titles = indexer(tf.constant(["3"]))
set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))])

{'52727', '53615'}

The time taken is quite slow for a live model which is why a ScaNN alogorithm can be used which approximates recommendations user a nearest-neighbours approach.

As a sanity check, I'll get a couple of unknow users' recommendations

In [111]:
assert ("683095" or "47561") not in new_user_id_stringlookup.get_vocabulary()

In [112]:
for user in ['683095', '47561 ']:
    _, titles = indexer(tf.constant([user]))
    print(user, ": ", set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))]))

683095 :  {'52727', '53615'}
47561  :  {'52727', '53615'}


The similarity in recommendations to unknown indicates that all unknown users are given the same random recommendation.

**Making sense of the recommendations requires analyzing the genres of book user "100023" read and liked.**

In [113]:
read_genres = set()
user_read_genres = df2[df2["User"]==55862]["Genres"].unique()
user_read_genres = [ast.literal_eval(user_read_genres[i]) for i in range(len(user_read_genres))];
[[read_genres.add(i) for i in element] for element in user_read_genres];
list(read_genres)

['Gothic',
 'Dogs',
 'Adventure',
 'Suspense',
 'Young Adult',
 'Womens',
 'Banned Books',
 'How To',
 'Nonfiction',
 'Crime',
 'Mystery Thriller',
 'Family',
 'Memoir',
 'Short Story Collection',
 'Womens Fiction',
 'Ireland',
 'Poetry',
 'Philosophy',
 'Adult',
 'Health',
 'Outdoors',
 'The United States Of America',
 'European Literature',
 '20th Century',
 'Supernatural',
 'Detective',
 'France',
 'Personal Development',
 'Nature',
 'Witches',
 'Fiction',
 'Kids',
 'Scotland',
 'Book Club',
 'Cultural',
 'Realistic Fiction',
 'Young Readers',
 'Canada',
 'Paranormal',
 'Modern Classics',
 'Animals',
 'Academic',
 'Travel',
 'Coming Of Age',
 'Female Authors',
 'Chapter Books',
 'School',
 'Gothic Romance',
 'Short Stories',
 'Psychological Thriller',
 'Juvenile',
 'Cozy Mystery',
 'Emergency Services',
 'Audiobook',
 'Spirituality',
 'Self Help',
 'Childrens',
 'Historical',
 'Autobiography',
 'Diets',
 'Psychology',
 'Mystery',
 'Biography Memoir',
 'Romantic Suspense',
 'Biograph

In [114]:
user_recommended_books = ['Brennen muss Salem',
 'How to Know God: The Yoga Aphorisms of Patanjali',
 "It's Not Mean If It's True: More Trials from My Queer Life",
 'Night Judgement At Sinos A Novel',
 'The House on the Point: A Tribute to Franklin W Dixon and The Hardy Boys',
 'Tularosa Kevin Kerney 1']

In [115]:
recommendations = df2[df2["ISBN"].isin(user_recommended_books)]['Genres'].unique()
recommendations = [ast.literal_eval(recommendations[i]) for i in range(len(recommendations))]

In [116]:
rec_genres = set()
[[rec_genres.add(i) for i in element] for element in recommendations];
rec_genres

{'Action',
 'Adult',
 'Adventure',
 'Audiobook',
 'Autobiography',
 'Buddhism',
 'Classics',
 'Crime',
 'Cultural',
 'Eastern Philosophy',
 'Emergency Services',
 'Espionage',
 'Essays',
 'Fantasy',
 'Fiction',
 'Gay',
 'Hinduism',
 'Horror',
 'Humor',
 'India',
 'LGBT',
 'Memoir',
 'Mystery',
 'Mystery Thriller',
 'Nonfiction',
 'Novels',
 'Paranormal',
 'Philosophy',
 'Police',
 'Psychology',
 'Queer',
 'Religion',
 'Spirituality',
 'Spy Thriller',
 'Supernatural',
 'Suspense',
 'Thriller',
 'Vampires',
 'Westerns',
 'Writing'}

In [117]:
read_genres.intersection(rec_genres)

{'Adult',
 'Adventure',
 'Audiobook',
 'Autobiography',
 'Buddhism',
 'Classics',
 'Crime',
 'Cultural',
 'Emergency Services',
 'Fantasy',
 'Fiction',
 'Humor',
 'Memoir',
 'Mystery',
 'Mystery Thriller',
 'Nonfiction',
 'Novels',
 'Paranormal',
 'Philosophy',
 'Police',
 'Psychology',
 'Religion',
 'Spirituality',
 'Supernatural',
 'Suspense',
 'Thriller'}

The model was able to pickup on a lot of user 55862's preferences such as, however it does miss others. A more detailed look at book ratings for the recommended and missed genres is required. Further, users with a lesser number of read books require analyzing.

##### Packing the model

In [118]:
model_name = "model"
version =  "_v0.0.1"
zipped_model_name = "zipped_model"

In [119]:
import os
import shutil
# The use of the nested dirctory below is intended to ease zipping and unizpping later on
tf.saved_model.save(indexer, os.path.join(f"./{model_name}{version}", f"{model_name}{version}"))

path = os.path.join(
            f"{model_name}{version}",
            f"{model_name}{version}",
            "assets",
            "placeholder")
open(path, 'a').close()



INFO:tensorflow:Assets written to: ./model_v0.0.1\model_v0.0.1\assets


INFO:tensorflow:Assets written to: ./model_v0.0.1\model_v0.0.1\assets


In [120]:
loaded = tf.saved_model.load(os.path.join(f"./{model_name}{version}", f"{model_name}{version}"))

In [121]:
type(indexer), type(loaded)

(tensorflow_recommenders.layers.factorized_top_k.BruteForce,
 tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject)

In [122]:
_, titles = loaded(tf.constant(["55862"]))
set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))])

{'10753', '12683', '12921', '42062', '47722', '5963'}

In [123]:
_, titles = indexer(tf.constant(["703730"]))
set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))])

{'18684', '28325', '32678', '44154', '5252'}

Given the large model size, it's relevant to compress it for purposes of packaging since pruning using keras pruning tools wouldn't work on this custom tfrs Model. The target's to get it under 50 MiB to maintain on GitHub. Unzipping and loading the model takes a long time (500 ms) and does not make it suitable for live prediction meaning the model should be unzipped ideally as soon as the application goes live. Contanier orchastraion can facilitate the takedown and setup of pods where time delay in unzipping is no longer an encumbrance>

In [172]:
import os
import zipfile
import shutil
def zip_directory(folder_path: str, zip_path:str, zip: bool = True, test: bool = False):
    if zip:
        with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
            len_dir_path = len(folder_path)
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, file_path[len_dir_path:])
            zipf.close()
            shutil.rmtree(folder_path)
            if test:
                result = (os.path.isdir(folder_path) and 
                          os.path.isfile(zip_path)) # false and true
                return result == False and True
            
    else:
        if test:
            extract_path = os.mkdir(folder_path)
        else:
            extract_path = None
        with zipfile.ZipFile(file=zip_path, mode="r") as f:
            f.extractall()
            f.close()
            os.remove(zip_path)
            if test:
                result = (os.path.isfile(zip_path) and 
                            os.path.isdir(folder_path)) # false and true
                return result == False and True

Test zipping

In [173]:
zip_directory(folder_path=f"./{model_name}{version}", 
              zip_path=f"{zipped_model_name}{version}.zip",
              zip=True,
              test = True)

True

Test unzipping

In [174]:
zip_directory(folder_path=f"./{model_name}{version}", 
              zip_path=f"{zipped_model_name}{version}.zip",
              zip=False,
              test = True)

True

Test the unzipped model

In [126]:
loaded = tf.saved_model.load(f"{model_name}{version}")

In [128]:
_, titles = loaded(tf.constant(["1"]))
set([titles.numpy()[0][i].decode() for i in range(len(titles.numpy()[0]))])

{'52727', '53615'}