In [3]:
from sklearn.preprocessing import LabelEncoder
import sklearn
import tensorflow as tf
from tensorflow import keras as k
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import pandas as pd
import numpy as np
import json
import ast
import joblib

Importing and organizing the raw data 

In [None]:
# import os
# import json
# data = list()
# for file in os.listdir("./data"):
##     if file != "results_20000-24861.json":
#     with open("./data/"+file, 'rb') as f:
#         print(f"Read {file}")
#         data_ = json.load(f)
#         data += data_
##     else:
##         continue

In [None]:
# records = [{"User": record.get('user_id'), 
#            "ISBN": record.get('book_title'),
#            "RatingOf5": record.get('stars'),
#            "Genres": record.get('genres')
#           } for record in data]
# df = pd.DataFrame(records)
# df.to_csv("USER_ISBN_RATING_GENRES.csv")

In [None]:
df = pd.read_csv("../../USER_ISBN_RATING_GENRES.csv")
df.drop(columns=["Unnamed: 0"], inplace = True)

In [None]:
print("Total numer of unique books: ", len(df["ISBN"].unique()))
print("Total numer of unique users: ", len(df["User"].unique()))
print("Total numer of reviews: ", len(df))
print("Total numer of positive reviews: ", len(df[df["RatingOf5"] > 3]))
print("Total numer of non-positive reviews: ", len(df[df["RatingOf5"] < 3]))
print("Total numer of moderate reviews: ", len(df[df["RatingOf5"] == 3]))
print("Total numer of  non-reviews: ", sum(df["RatingOf5"].isna()))

Total numer of unique books:  57198
Total numer of unique users:  706880
Total numer of reviews:  1733561
Total numer of positive reviews:  936796
Total numer of non-positive reviews:  201628
Total numer of moderate reviews:  372374
Total numer of  non-reviews:  222763


Although I'll use a TF stringlookup object to encode users, to keep users anonymous for the data exploration stage, I'll label encode them as IDs

In [None]:
le = LabelEncoder()
le.fit(df["User"])
transformed = le.transform(df["User"])
len(transformed), len(np.unique(transformed, return_counts=False))

(1733561, 706880)

In [None]:
joblib.dump(le, "ranker_fitted_label_encoder.pkl")

['ranker_fitted_label_encoder.pkl']

In [None]:
le = joblib.load("ranker_fitted_label_encoder.pkl")
transformed = le.transform(df["User"])

In [None]:
df["User"] = transformed
df["User"].head()

0    423114
1    241063
2    659623
3    557019
4     44930
Name: User, dtype: int32

In [None]:
df2 = df.copy()

**I'm going to reserve a random sample of 10 observations to test recommendations as observations that the model has not seen before when developing the production package.**

In [None]:
df.dropna(inplace=True)

In [None]:
sample = df.groupby("ISBN").count()[df.groupby("ISBN").count()["User"] > 5]["User"]. \
    sort_values(ascending=False).sample(20).index
loop = True
while True:
    new_sample=df[df["ISBN"].isin(sample)].sample(10)['ISBN']
    if len(new_sample.unique()) == 10:
        sample_to_keep = new_sample
        break

In [None]:
sample = df[df.index.isin(sample_to_keep.index)][['User', 'ISBN', 'RatingOf5']]
index_to_drop = sample.index
sample.to_csv('ranker_test.csv', index=False)
df.drop(index=index_to_drop, axis=0, inplace=True)

In [None]:
df[df.index.isin(index_to_drop)]

Unnamed: 0,User,ISBN,RatingOf5,Genres


In [None]:
pd.read_csv('ranker_test.csv')[['User', 'ISBN']]

Unnamed: 0,User,ISBN
0,320727,What Went Wrong at Enron: Everyone's Guide to ...
1,625088,Guerrilla Learning: How to Give Your Kids a Re...
2,618800,Eastern Sun Winter Moon: An Autobiographical O...
3,411288,Raspberry Crush
4,198921,Pooh: Just Be Nice AND NOT TOO ROUGH SPECIAL E...
5,258203,The Life and Adventures of Nicholas Nickleby V...
6,336651,Silver Flame
7,684707,Nothing That Meets the Eye: The Uncollected St...
8,597291,Das Tor ins Nichts Der Magier 2
9,404510,Berserk: Motiveless Random Massacres


In [None]:
df[["User", "ISBN", "RatingOf5", "Genres"]].to_csv('ranker_train_data.csv', index=False)

In [None]:
pd.read_csv('ranker_train_data.csv', nrows=5)[["User", "ISBN"]]

Unnamed: 0,User,ISBN
0,423114,Kiss Hollywood Goodbye
1,241063,Kiss Hollywood Goodbye
2,659623,Kiss Hollywood Goodbye
3,557019,Kiss Hollywood Goodbye
4,44930,Kiss Hollywood Goodbye


In [None]:
df = pd.read_csv('ranker_train_data.csv')

In [None]:
df.isna().sum()

User         0
ISBN         0
RatingOf5    0
Genres       0
dtype: int64

#### Reducing dimentionality

With ~ 700k users and 60k books, the embeddings for the two embedding "towers" that the recommender model is going to be based on will generately an extremely large model. Practically, I would like to demonstrate the end-to-end data science pipeline by being able to package my model and make it available publicly for my API to access as well as for those interested which is why reduce the recommendations to those users who given more than 2 review.

In [None]:
grouped_data = df.groupby("User").count()
grouped_data[grouped_data["ISBN"] > 2]['ISBN']

User
0         41
7          3
8          4
25         3
31         3
          ..
706863    12
706867     6
706870     6
706871     4
706879     4
Name: ISBN, Length: 115058, dtype: int64

In [None]:
idx_to_keep = grouped_data[grouped_data["ISBN"] > 2].index
set(idx_to_keep) == set(df[df["User"].isin(idx_to_keep)]["User"])

True

In [None]:
df = df[df.User.isin(idx_to_keep)]
df.sample(5)[['User', 'ISBN']]

Unnamed: 0,User,ISBN
1265091,461604,The Language of Letting Go: Daily Meditations ...
904811,171370,Confessional Liam Devlin 3
33619,307367,What Happened to Patrick's Dinosaurs
166922,444018,I Sing the Body Electronic: A Year with Micros...
759433,442750,Let's Fly a Kite Charlie Brown A Book About th...


At this point we have an index of user that have more than 2 reviews (of different books). Our new DataFrame object will only comprise those users and their respectively reviewed books

In [None]:
# also ensure that the grouping and querying was sane
print("# of unique users remaining: ", len(df.groupby('User').count()))

# also check out how many unique books were lost in the process of trimming 
# down the dataset (only about 500)

print("# of unique books remaining: ", len(df.groupby('ISBN').count()))

    

# of unique users remaining:  115058
# of unique books remaining:  54466


In [None]:
# we can label encode the book ID's if required, otherwise, a TF stringlookup object can handle 
# a variety of string encodings well - as will follow

le_isbn = LabelEncoder()
df["ISBN"] = le_isbn.fit_transform(df["ISBN"])

In [None]:
df.User = df.User.astype(np.int64)
df.ISBN = df.ISBN.astype(np.int64)
df.RatingOf5 = df.RatingOf5.astype(np.int64)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924370 entries, 0 to 1510787
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   User       924370 non-null  int64 
 1   ISBN       924370 non-null  int64 
 2   RatingOf5  924370 non-null  int64 
 3   Genres     924370 non-null  object
dtypes: int64(3), object(1)
memory usage: 35.3+ MB


Setting up tensors to batch for purposes of training the embeddings 

In [None]:
books = tf.data.Dataset.from_tensor_slices(df['ISBN'].astype('str').values)
ratings = tf.data.Dataset.from_tensor_slices(df[['User', 'ISBN', 'RatingOf5']].astype('str').values)

In [None]:
train_size = int(len(df)*0.8)
train = tf.data.Dataset.from_tensor_slices(
    df[["ISBN", "User", "RatingOf5"]][:train_size].astype('str').values).shuffle(100000)
test = tf.data.Dataset.from_tensor_slices(
    df[["ISBN", "User", "RatingOf5"]][train_size:].astype('str').values).shuffle(10000)

In [None]:
for i in test.take(1):
    print(i)
    break

tf.Tensor([b'18368' b'85574' b'5'], shape=(3,), dtype=string)


String look up creates a vocabulary dictionary of string-value pairs since I'll be embedding my ISBN's and users ID's

In [None]:
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings.map(lambda x: x[0]))

book_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
book_titles_vocabulary.adapt(ratings.map(lambda x: x[1]))

In [None]:
np.save("ranker_user_ids_vocabulary", user_ids_vocabulary.get_weights())
np.save("ranker_book_titles_vocabulary", book_titles_vocabulary.get_weights())

In [4]:
new_user_id_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_user_id_stringlookup.set_weights(np.load("ranker_user_ids_vocabulary.npy", allow_pickle=True))

new_book_title_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
new_book_title_stringlookup.set_weights(np.load("ranker_book_titles_vocabulary.npy", allow_pickle=True))

In [5]:
# user and book embedding model objects created outside the main ranking model

embedding_dimension = 32
user_model = tf.keras.Sequential([
    new_user_id_stringlookup,
    tf.keras.layers.Embedding(new_user_id_stringlookup.vocabulary_size()+1 , embedding_dimension)
])

book_model = tf.keras.Sequential([
    new_book_title_stringlookup,
    tf.keras.layers.Embedding(new_book_title_stringlookup.vocabulary_size()+1, embedding_dimension)
])

In [1]:
# functions to support transforming tabular data into tensor data in listwise format 
# adapted from https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/examples/movielens.py
from listwise_utility import (sample_listwise, 
                              RankingModel)

In [None]:
path1 = "./sample1"
path2 = "./sample2"

In [None]:
train_ds = sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42)

In [None]:
tf.data.experimental.save(
    train_ds, path, compression='GZIP'
)
with open(path1 + '/element_spec1', 'wb') as out_:  # also save the element_spec to disk for future loading
    pickle.dump(sample1.element_spec, out_)

In [None]:
test_ds = sample_listwise(
    test,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42)

In [19]:
tf.data.experimental.save(
    test_ds, path, compression='GZIP'
)
with open(path2 + '/element_spec2', 'wb') as out_:  # also save the element_spec to disk for future loading
    pickle.dump(sample2.element_spec, out_)

I upload these datasets to Google Colab since my OS doesn't support tensorflow_ranking which features loss functions import to trainig listwise rankers.

Some setup for training on Google Colab

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
!ls gdrive/MyDrive/sample1/
!ls gdrive/MyDrive/sample2/

1046517602870779941   dataset_spec.pb  element_spec1
14239500212799276046  element_spec     snapshot.metadata
10931280481293917633  dataset_spec.pb  element_spec2  snapshot.metadata


In [18]:
path = "./gdrive/MyDrive/sample1" # train
path = "./gdrive/MyDrive/sample2" # test

In [19]:
import pickle
with open(path + '/element_spec1', 'rb') as in_:
    es = pickle.load(in_)

train_ds = tf.data.experimental.load(
    path, es, compression='GZIP'
)

In [21]:
epochs = 5
cached_train = train_ds.batch(8192).cache()
cached_test = test_ds.batch(4096).cache()

In [55]:
for i in cached_test.take(1):
  print(i)
#   j = {'user_id': i['user_id'][0],'book_title': i['book_title'][0], 'user_rating': i['user_rating'][0]}
  break

{'user_id': <tf.Tensor: shape=(4096,), dtype=string, numpy=
array([b'31553', b'31553', b'31553', ..., b'7027', b'7027', b'7027'],
      dtype=object)>, 'book_title': <tf.Tensor: shape=(4096, 5), dtype=string, numpy=
array([[b'452505', b'303111', b'419904', b'47586', b'605443'],
       [b'382244', b'357230', b'419904', b'594747', b'452505'],
       [b'176086', b'357230', b'602363', b'382244', b'47586'],
       ...,
       [b'229307', b'262045', b'359386', b'406877', b'240836'],
       [b'359386', b'582693', b'85789', b'229307', b'596214'],
       [b'366356', b'229307', b'359386', b'265191', b'76474']],
      dtype=object)>, 'user_rating': <tf.Tensor: shape=(4096, 5), dtype=string, numpy=
array([[b'5', b'3', b'4', b'5', b'4'],
       [b'5', b'4', b'4', b'3', b'5'],
       [b'4', b'4', b'3', b'5', b'5'],
       ...,
       [b'4', b'4', b'5', b'5', b'5'],
       [b'5', b'5', b'4', b'4', b'4'],
       [b'3', b'4', b'5', b'4', b'4']], dtype=object)>}


##### MSE Model

In [22]:
# Ranking model that optimizes based on either MSE or an NDCGMetric from tensorflow_ranking
mse_model = RankingModel(tf.keras.losses.MeanSquaredError(), user_model, book_model)
mse_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [23]:
mse_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f410e399990>

##### Hinge Model

In [25]:
hinge_model = RankingModel(tfr.keras.losses.PairwiseHingeLoss(), user_model, book_model)
hinge_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [27]:
hinge_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f40f9a40f90>

##### Listwise Model

In [28]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss(), user_model, book_model)
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [29]:
listwise_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f40f9a68690>

##### Evaluating models using various loss functions

In [30]:
mse_model_result = mse_model.evaluate(cached_test, return_dict=True)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


In [31]:
hinge_model_result = hinge_model.evaluate(cached_test, return_dict=True)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


In [32]:
listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


In [33]:
print("NDCG of the MSE Model: {:.4f}".format(mse_model_result["ndcg_metric"]))
print("NDCG of the pairwise hinge loss model: {:.4f}".format(hinge_model_result["ndcg_metric"]))
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["ndcg_metric"]))

NDCG of the MSE Model: 0.8556
NDCG of the pairwise hinge loss model: 0.8563
NDCG of the ListMLE model: 0.8563


In [None]:
mse_model.save('./gdrive/MyDrive/mse_model')
hinge_model.save('./gdrive/MyDrive/hingle_model')
listwise_model.save('./gdrive/MyDrive/listwise_model')

In [2]:
# inference based on the sample from the test set
# taken earlier based no a single observation from the batch (1 user, 5 movies)

for j in range(5):
  print(listwise_model(i)[0,j,:]) 
