In [3]:
from sklearn.preprocessing import LabelEncoder
import sklearn
import tensorflow as tf
from tensorflow import keras as k
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import json
import ast
import joblib
import pickle

import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the ranker data from a CSV file located relative to the working directory.
df = pd.read_csv('Final_Ranker_Data.csv')

In [5]:
# Display three randomly drawn rows, leaving out the first column.
df.sample(3).loc[:, df.columns[1:]].head(3)

Unnamed: 0,User,ISBN,User_First_Cat,User_Second_Cat,User_Third_Cat,User_Fourth_Cat,Book_First_Cat,Book_Second_Cat,Book_Third_Cat,Book_Fourth_Cat
307220,71466,25407,Mystery,Literature,Fiction,Crime,Mystery,Noir,Mystery,Fiction
605899,127097,11064,Childrens,Picture Books,Fiction,Animals,Biography,Nonfiction,European Literature,British Literature
539468,52282,44548,Food and Drink,Childrens,Cookbooks,Cooking,Food and Drink,Cookbooks,Food and Drink,Cooking


In [7]:
# Gather the distinct values of every column from the fourth one onward
# (the user/book category columns) and de-duplicate them across columns.
unique_genres = np.unique([
    genre
    for column in df.columns[3:]
    for genre in df[column].unique()
])
len(unique_genres)

731

In [8]:
# Cast every column (including the first one) to str, then slice the resulting
# 2-D array row by row so that each dataset element is one row of strings.
ratings = tf.data.Dataset.from_tensor_slices(df.astype('str').values)

In [9]:
# Inspect the column names and their order.
df.columns

Index(['RatingOf5', 'User', 'ISBN', 'User_First_Cat', 'User_Second_Cat',
       'User_Third_Cat', 'User_Fourth_Cat', 'Book_First_Cat',
       'Book_Second_Cat', 'Book_Third_Cat', 'Book_Fourth_Cat'],
      dtype='object')

In [10]:
# Re-key every row tensor as a dict: column name -> value at that column's position.
ratings = ratings.map(
    lambda row: {str(name): row[pos] for pos, name in enumerate(df.columns)})

In [13]:
# setup vocab
# For every feature column (all columns except the first), collect the sorted
# array of distinct values that occur in the `ratings` dataset.
feature_names = df.columns[1:]
vocabularies = {}
for feature_name in feature_names:
    # Very large batches keep the number of arrays to concatenate small.
    vocab = ratings.batch(1_000_000).map(lambda x: x[feature_name])
    vocabularies[feature_name] = np.unique(np.concatenate(list(vocab)))

# Persist the vocabularies. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way through.
with open('DCN_Recomender_Vocabulary', 'wb') as vocab_file:
    pickle.dump(vocabularies, vocab_file)

In [19]:
# Every column except the first; passed to DCN as `str_features` in run_models.
str_features = df.columns[1:] 

In [38]:
# Show the feature names as a plain list.
list(str_features)

['User',
 'ISBN',
 'User_First_Cat',
 'User_Second_Cat',
 'User_Third_Cat',
 'User_Fourth_Cat',
 'Book_First_Cat',
 'Book_Second_Cat',
 'Book_Third_Cat',
 'Book_Fourth_Cat']

In [510]:
from model_tools import DCN

In [248]:
# 80/20 split sizes. len() on a tf.data.Dataset only works when its cardinality is known.
train_size = int(0.8*len(ratings))
test_size = len(ratings) - train_size

In [249]:
# Fix the global TF seed and the shuffle seed; reshuffle_each_iteration=False keeps
# the order identical on every pass so that take() and skip() see the same sequence
# and the two splits do not overlap.
# NOTE(review): the shuffle buffer holds 100_000 elements; if the dataset is larger
# than that, this is only a partial shuffle — confirm that is acceptable.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# The first train_size elements form the training split, the remainder the test split.
train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

Prepare the train and test data, as well as a function to run the various models: an embedding + DNN model, a low-rank DCN (with a small projection dimension), and a full-rank DCN.

In [250]:
# Shuffle and batch the training split, then cache the batched result.
# NOTE(review): cache() comes after shuffle(), so presumably the shuffled order is
# frozen after the first epoch — TODO confirm this is intended.
cached_train = train.shuffle(100_000).batch(8192).cache()
# The test split is only batched and cached (no shuffling).
cached_test = test.batch(4096).cache()

In [86]:
# Reload the pickled vocabularies. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('DCN_Recomender_Vocabulary', 'rb') as vocab_file:
    vocabularies = pickle.load(vocab_file)

In [251]:
def run_models(use_cross_layer, deep_layer_sizes, projection_dim=None, num_runs=1):
    """Train and evaluate `num_runs` freshly constructed DCN models.

    Each run builds a DCN from the architecture arguments plus the notebook-level
    `str_features` and `vocabularies`, compiles it with Adam at the notebook-level
    `learning_rate`, fits it on `cached_train` for the notebook-level `epochs`,
    and evaluates it on `cached_test`.

    Args:
        use_cross_layer: Forwarded unchanged to `DCN`.
        deep_layer_sizes: Forwarded unchanged to `DCN`.
        projection_dim: Forwarded unchanged to `DCN`; defaults to None.
        num_runs: Number of models to train and evaluate.

    Returns:
        A dict with "model" (list of the trained models, one per run), and
        "mean" / "stdv" (average and standard deviation of the per-run "RMSE"
        evaluation metric).
    """
    trained_models = []
    test_rmses = []

    for _ in range(num_runs):
        dcn = DCN(use_cross_layer=use_cross_layer,
                  deep_layer_sizes=deep_layer_sizes,
                  projection_dim=projection_dim,
                  str_features=str_features,
                  vocabularies=vocabularies)
        # `learning_rate` and `epochs` are looked up as globals at call time.
        dcn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
        trained_models.append(dcn)

        dcn.fit(cached_train, epochs=epochs, verbose=False)
        eval_metrics = dcn.evaluate(cached_test, return_dict=True)
        test_rmses.append(eval_metrics["RMSE"])

    return {
        "model": trained_models,
        "mean": np.average(test_rmses),
        "stdv": np.std(test_rmses),
    }

In [258]:
# Training hyperparameters. run_models reads both as globals when it is called,
# so this cell must run before any run_models(...) call.
epochs = 20
learning_rate = 0.01

In [511]:
# DCN with a cross layer and no projection_dim (left at its None default),
# followed by two deep layers of 192 units.
dcn_result = run_models(use_cross_layer=True,
                        deep_layer_sizes=[192, 192])



In [330]:
# Same as the DCN above but with projection_dim=20 (the "low-rank" variant in the
# results table).
dcn_lr_result = run_models(use_cross_layer=True,
                           projection_dim=20,
                           deep_layer_sizes=[192, 192])



In [331]:
# Baseline without a cross layer: three deep layers of 192 units.
dnn_result = run_models(use_cross_layer=False,
                        deep_layer_sizes=[192, 192, 192])



In [334]:
# Report the test RMSE mean and standard deviation of each model family,
# one aligned line per family.
for template, result in (
    ("DCN            RMSE mean: {:.4f}, stdv: {:.4f}", dcn_result),
    ("DCN (low-rank) RMSE mean: {:.4f}, stdv: {:.4f}", dcn_lr_result),
    ("DNN            RMSE mean: {:.4f}, stdv: {:.4f}", dnn_result),
):
    print(template.format(result["mean"], result["stdv"]))

DCN            RMSE mean: 0.8628, stdv: 0.0000
DCN (low-rank) RMSE mean: 0.9724, stdv: 0.0000
DNN            RMSE mean: 0.9401, stdv: 0.0000


##### Inference

I'll run an inference sample through each model to estimate how likely the user is to like the chosen book.

In [482]:
# Draw one random row and keep the second of the selected ['User', 'ISBN'] values,
# i.e. the ISBN.
sample = df.sample(1)[['User', 'ISBN']].values[0][1]

# Restore a StringLookup layer from saved weights and map the sampled ISBN
# (as a string) through it.
# NOTE: allow_pickle=True lets np.load unpickle objects, which can execute
# arbitrary code; only use it with trusted files.
# NOTE(review): the layer's .call() is invoked directly instead of calling the
# layer itself — confirm this is intended.
book_title_stringlookup = tf.keras.layers.StringLookup(mask_token=None)
book_title_stringlookup.set_weights(np.load("ranker_book_titles_vocabulary.npy", allow_pickle=True))
book_id = book_title_stringlookup.call(tf.constant([str(sample)]))

# presumably a fitted LabelEncoder whose classes are book titles — verify against
# the code that saved 'ranker_le_isbn'.
le_isbn = joblib.load('ranker_le_isbn')
book_title = le_isbn.inverse_transform(book_id)
book_title

array(['The Best American Nonrequired Reading 2003'], dtype=object)

In [505]:
# I'll see the probability of the following sampled user being interested in the sample book above
# Take the first row of a randomly sampled user and the first row of the sampled book.
# .iloc[0] extracts the scalar explicitly: int() on a one-element Series is
# deprecated in recent pandas versions.
user_arr = df[df.User == int(df.sample(1)['User'].iloc[0])].values[0]
book_arr = df[df.ISBN == book_id].values[0]
# Positions 1 and 3-6 are 'User' and the four 'User_*_Cat' columns: overwrite the
# book row's user-side fields with the sampled user's values.
for i in [1,3,4,5,6]:
    book_arr[i] = user_arr[i]
# Show everything except the first column.
book_arr[1:]

array([97957, 37522, 'Writing', 'Essays', 'Science', 'Environment',
       'Fiction', 'Anthologies', 'Short Stories', 'Literature'],
      dtype=object)

In [506]:
# Wrap the single feature row in a one-element dataset and re-key it by column name,
# mirroring how `ratings` was built. A separate name is used for the dataset so that
# `book_arr` stays a NumPy array and this cell can be re-run safely.
inference_ds = tf.data.Dataset.from_tensor_slices(book_arr.astype('str').reshape(1,-1))
inference_ds = inference_ds.map(lambda x: {
    f"{df.columns[i]}": x[i] for i in range(len(df.columns))})
# Extract the only element as a dict of tensors.
inference_sample = next(iter(inference_ds.take(1)))

In [507]:
# Prediction of the first DCN model (trained without projection_dim) for the sample.
# NOTE(review): invokes .call() directly rather than calling the model — confirm intended.
dcn_result['model'][0].call(inference_sample)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.0650332]], dtype=float32)>

In [508]:
# Prediction of the first low-rank DCN model (projection_dim=20) for the same sample.
dcn_lr_result['model'][0].call(inference_sample)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[4.4947515]], dtype=float32)>

In [509]:
# Prediction of the first DNN baseline model (no cross layer) for the same sample.
dnn_result['model'][0].call(inference_sample)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[4.065869]], dtype=float32)>

- The logits (the models' predicted ratings) may exceed the maximum actual rating of 5, since this was a continuous-value prediction (regression) task and not a classification task.
- The relative values of the logits across the models tell us how accurate each model is. The higher the logit, the more strongly the model assumes that the user will like the book.
- In the above example, *the higher-rank DCN is better able to capture the divergence between the user's preferences ['Writing', 'Essays', 'Science', 'Environment'] and the categories of the book of interest ['Fiction', 'Anthologies', 'Short Stories', 'Literature'], which contrast quite a bit*. The **DCN (high-rank) model captured this with a lower logit prediction than those of the dcn_lr and dnn models, meaning the user is unlikely to be interested in this book**.