In this notebook I will build off of my previous work with the TFRS pipline to simplify the model's towers and to also improve the users' embeddings by incorperating additional session metrics to pass through.

In [1]:
import os

os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [2]:
!pip install -q tensorflow-recommenders
!pip install -q plotnine

In [3]:
import s3fs

import io
import datetime
import json

import random

from typing import List, Union, Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

import plotnine
import gdown


from sklearn.model_selection import train_test_split

2025-04-14 00:16:58.631970: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-14 00:16:58.656358: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Ensure GPUs are visible
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

if gpus:
    # Set memory growth to avoid allocation errors
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

# Set logical device configuration for CPU
cpus = tf.config.list_physical_devices('CPU')
if cpus:
    tf.config.set_logical_device_configuration(
        cpus[0],
        [tf.config.LogicalDeviceConfiguration()]
    )

print("Logical devices configured.")


Available GPUs: []
Logical devices configured.


Initialize access to s3 bucket

To start off we import the datasets for books and their reviews, performing some data cleaning

In [5]:
# Import books and reviews dataset
# books_data_location = 's3://w210recsys/book_raw/books_data.csv'
#review_location = 's3://w210recsys/book_raw/Books_rating.csv'

books_data_location = "s3://w210recsys/book_clean/books_data_clean.pkl"

In [6]:
books_df = pd.read_pickle(books_data_location) 

books_df.shape

(321301, 17)

Data Cleansing

In [7]:
books_df.isnull().sum()

user_id                    0
book_id                    0
title                      0
author                     0
publish_year             842
description                0
preview_link               0
normalized_popularity      0
genre_general              0
genre_specific             0
genre_combined             0
genre_consolidated         0
review_helpfulness         0
review_score               0
review_time                0
review_summary             1
review_text                0
dtype: int64

## Augment Dataset (Do Not Run)

As was demonstrated by Ben's EDA on user rating, we see an overwheling amount of ratings are positively skewwed and similarly that many people only have a few review they ever leave. The twin tower model will perform better if it gets example of both what the user likes and what they don't like so we want to augment user's reviews with books they did not interact with compared to the ones they did.

In [None]:
# def aug_data(catalog, user_data_dict, user_last_dates, k_per_user):
#     """
#     Returns a new book sample set from the catalog for multiple users at once, ensuring dates are not earlier than 
#     the last review date per user.

#     Parameters:
#     - catalog (pd.DataFrame): Full book catalog.
#     - user_data_dict (dict): A dictionary mapping users to a set of interacted books.
#     - user_last_dates (dict): A dictionary mapping users to their last review date.
#     - k_per_user (int): Number of new books per user.

#     Returns:
#     - pd.DataFrame: A DataFrame with new book samples for all users.
#     """

#     # Flatten all interacted books into a set (fast filtering)
#     interacted_books = set.union(*user_data_dict.values())

#     # Efficient filtering: Remove all interacted books at once
#     filtered_catalog = catalog[~catalog[['title', 'author']].apply(tuple, axis=1).isin(interacted_books)]

#     # Prepare a list to store new samples
#     new_samples = []

#     for user, _ in user_data_dict.items():
#         # Randomly sample `k_per_user` books
#         sampled_books = filtered_catalog.sample(n=min(k_per_user, len(filtered_catalog)), random_state=42).copy()
#         sampled_books['user_id'] = user  # Assign user ID
#         sampled_books['review_score'] = 0  # The user didn't interact with it

#         # Get the last review date for this user
#         last_review_date = user_last_dates.get(user, pd.Timestamp.now())  # Default to now if no history
        
#         # Ensure last_review_date is a pandas Timestamp (datetime64)
#         last_review_date = pd.Timestamp(last_review_date)
        
#         # Generate random timedelta and subtract from last_review_date
#         sampled_books['review_time'] = last_review_date - pd.to_timedelta(
#             [random.randint(1, 30) for _ in range(len(sampled_books))], unit="D"
#         )

#         new_samples.append(sampled_books)

#     # Concatenate all samples into a single DataFrame
#     return pd.concat(new_samples, ignore_index=True)

# # Sample Data
# temp = books_df#.iloc[0:100]

# # Step 1: Convert user interactions into a dictionary {user_id: {(title, author), ...}}
# user_data_dict = (
#     temp.groupby('user_id')[['title', 'author']]
#     .apply(lambda df: set(df.itertuples(index=False, name=None)))
#     .to_dict()
# )

# # Step 2: Extract last review date per user
# user_last_dates = (
#     temp.groupby('user_id')['review_time']
#     .max()
#     .to_dict()
# )

# # Step 3: Call optimized function for batch augmentation
# augmented_data = aug_data(books_df, user_data_dict, user_last_dates, k_per_user=3)

# # Step 4: Append new recommendations to merged_df efficiently
# temp = pd.concat([temp, augmented_data], ignore_index=True)

# print(f"Augmented dataset size: {temp.shape}")


Save the data to pickle file in the s3 bucket

In [None]:
# temp.to_pickle("s3://w210recsys/aug_data/clean_augmented_data_v1.pkl")

Load the data to avoid overheads

In [None]:
# merged_df = pd.read_pickle("s3://w210recsys/aug_data/clean_augmented_data_v1.pkl")

# merged_df.shape

In [None]:
# merged_df[merged_df['user_id'] == 'A1SMFD252FTJP9']

## Session-ize Data + Split Dataset for Validation based on Users
In the earlier versions of our twin-tower recommendations model we were splitting the data based on dates to isolate the last interaction as our validation data and passing in other interactions, line by line, in as training data. However, upon review we realized that this approach has some flaws:

1. Data Leakage - By passing in the same users in training and testing we may be getting an inflated sense of how good the model is doing.
2. Line by line data - The goal of our recommendation system is to take in a user's metrics at once and provide a user embedding that will likely set them closer tot he vooks they like in the embedding space. However, by passing in data line by line, we don't aggregate this data in the same way and the model may not be learning that

For these reasons, we opt to split the data by users, holding out their last interaction as the label, rather than by date.

Session-izing Data:

We want to group the historical data for each user in a way that allows us to mimic the session data we will collect from users in deployment. The basic structure of which will be to summarize past interactions and hold out a separate book interaction.

In [None]:
# # Test on a smaller sub-set of the data

# merged_df = merged_df.sort_values(by=['user_id', 'review_date'])


In [None]:
# temp.columns

In [None]:
# temp = merged_df[merged_df['user_id'] == 'A1SMFD252FTJP9']

In [None]:
# def session_summary(user_data):

#     """
#     session_summary takes in each user's session data and returns a summarized verison of it

#     inputs:
#     user_data: user's interaction data
#     interest_cols: columns of interest wanting to be summarized

#     outputs:
#     summarized_data: the user's summarized data

#     """

#     user_data = user_data.sort_values(by=['user_id', 'review_time'])  # Sort by user & time
    
#     session_data = []
    
#     for user, user_df in user_data.groupby('user_id'):
#         if len(user_df) < 2:
#             continue  # Skip users with only one interaction
        
#         # Last interaction is the target (book user last interacted with)
#         target_row = user_df.iloc[-1]
#         target_book = target_row['title']
#         target_book_rating = target_row['review_score']
        
#         # Previous interactions (session history3
#         history_df = user_df.iloc[:-1]  # Exclude last row]

#         summary = {
#             'user_id': user,
#             'liked_books': list(history_df.loc[history_df['review_score'] >= 3, 'title']),
#             'disliked_books': list(history_df.loc[history_df['review_score'] < 3, 'title']),
#             'liked_genres': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] >= 3, 'genre_consolidated'])))),
#             'disliked_genres': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] < 3, 'genre_consolidated'])))),
#             'liked_authors': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] >= 3, 'author'])))),
#             'disliked_authors': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] < 3, 'author'])))),
#             'liked_ratings': list(history_df.loc[history_df['review_score'] >= 3, 'review_score']),
#             'disliked_ratings': list(history_df.loc[history_df['review_score'] < 3, 'review_score']),
#             'target_book': target_book,
#             'target_book_rating': target_book_rating
#         }
        
#         session_data.append(summary)
    
#     return pd.DataFrame(session_data)

# # Generate session summaries with held-out target sample
# # session_summary(temp)

# # Generate a full df of session summaries
# # sessionized_df = session_summary(merged_df)
# sessionized_df = session_summary(sampled_books)

In [None]:
# sessionized_df.to_pickle("s3://w210recsys/aug_data/cleaned_sessionized_data_v1.pkl")

To run this function in a timely manner I had to leverage a much larger compute instance than the one this notebook was created on. I saved the result to the team s3 bucket

## Load Sessionized Data

In [9]:
sessionized_df = pd.read_pickle("s3://w210recsys/aug_data/cleaned_sessionized_data_v1.pkl")

In [10]:
sessionized_df.shape

(15839, 11)

In [11]:
sessionized_df.head()

Unnamed: 0,user_id,liked_books,disliked_books,liked_genres,disliked_genres,liked_authors,disliked_authors,liked_ratings,disliked_ratings,target_book,target_book_rating
0,A100NGGXRQF0AQ,[Lenin's Tomb: The Last Days of the Soviet Emp...,"[Mathematics and Sex, Spook: Science Tackles t...","[History / Social History, Nature / General, S...","[Family & Relationships / General, Psychology ...","[Primo Levi, Andrew Newberg, Alison Gopnik, Va...","[Mary Roach, Renee Fredrickson, Stephen J. Ceci]","[5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, 4.0, 3.0, ...","[2.0, 2.0, 2.0]",The Official Soviet SVD Manual,4.0
1,A100YHBWL4TR4D,"[The Book of Three, Drums of Autumn Hardcover ...",[Highland Desire],"[Juvenile Fiction / General, Fiction / Sea Sto...",[Fiction / Literary],"[Lloyd Alexander, Carlos Ruiz Zafon, F. Scott ...",[Amy Jarecki],"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0]",[1.0],Tea Rose,5.0
2,A101446I5AWY0Z,"[1632 (The Assiti Shards), Anonymous Rex, Hist...","[The Starchild Trilogy, Washington Goes To War...","[History / Historiography, History / Maritime ...","[Fiction / Science Fiction, History / Indigeno...","[John Garth, Eric Flint, Roy Adkins, Herwig Wo...","[Barry Fell, Gunther E. Rothenberg, Ian W. Tol...","[3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0, ...","[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]",The Congress of Vienna: A Study in Allied Unit...,4.0
3,A1016MYYF5QSTY,"[1 Ragged Ridge Road, Pies (Company's Coming),...",[Buster Midnight's Cafe],"[Cooking / General, Travel / General, Fiction ...",[Fiction / Noir],"[Leonard Foglia, Marguerite Henry, Jean Paré, ...",[Sandra Dallas],"[4.0, 5.0, 5.0, 3.0, 5.0, 5.0, 5.0]",[2.0],"A Sudden, Fearful Death",4.0
4,A101BVV4DR3G81,"[Memories, Dreams, Reflections, Complete Tai C...",[The Great Stillness: The Water Method of Taoi...,"[Drama / Shakespeare, History / General, Ficti...","[Body, Mind & Spirit / General]","[Dan Docherty, B. K. S. Iyengar, Tom Stoppard,...",[Bruce Kumar Frantzis],"[5.0, 3.0, 5.0, 3.0, 5.0, 5.0, 5.0]",[1.0],Travesties,5.0


## Augment Data

Down the line I realized that when training we'll want to pass in more than just the book's title to our book tower so I want to augment the dataset with the author and the summaries of all of those books

In [12]:
books_df.columns

Index(['user_id', 'book_id', 'title', 'author', 'publish_year', 'description',
       'preview_link', 'normalized_popularity', 'genre_general',
       'genre_specific', 'genre_combined', 'genre_consolidated',
       'review_helpfulness', 'review_score', 'review_time', 'review_summary',
       'review_text'],
      dtype='object')

In [13]:
from tqdm import tqdm

# Keep just the first instance of each book since we only care about the books' metadata, which should be the same regardless
books_df_unique = books_df.drop_duplicates(subset=['title'], keep='first')

# Convert books_df to a dictionary for fast lookup
books_dict = books_df_unique.set_index('title')[['author', 'description', 'genre_consolidated']].to_dict(orient='index')

target_book_author = []
target_book_summary = []
target_book_categories = []

for title in tqdm(sessionized_df['target_book'], desc="Processing Books"):
    book_info = books_dict.get(title, {'author': '', 'description': '', 'genre_consolidated': ''})

    target_book_author.append(book_info['author'])
    target_book_summary.append(book_info['description'])
    target_book_categories.append(book_info['genre_consolidated'])


Processing Books: 100%|██████████| 15839/15839 [00:00<00:00, 696084.21it/s]


In [14]:
# IMPORTANT NOTE: Though I'm keeping the column name as 'categories' it should reflect 'genre_consolidated' at all times henceforth

sessionized_df['authors'] = target_book_author
sessionized_df['description'] = target_book_summary
sessionized_df['categories'] = target_book_categories

Now that we have the sessionized data, we can move into the splitting of data

In [None]:
# train_df, test_df = train_test_split(sessionized_df, test_size=0.3, random_state=42)

In [None]:
# train_df.columns

## Converting string input to numerical

So the model, when running on CPUs, is able to take care of string entries to numerical via our lookup layers internally. However, when trying to utilize the GPU for faster training, it appears as through string tensor conversions from CPU to GPU aren't supported. As a result, I'm looking into converting our string entries to numerical values prior to passing it into the model.

In [15]:
# Extract unique values for user and book metadata
unique_user_ids = sessionized_df['user_id'].astype(str).unique().tolist()
unique_book_titles = books_df['title'].astype(str).unique().tolist()
unique_genres = books_df['genre_consolidated'].astype(str).unique().tolist()
unique_authors = books_df['author'].astype(str).unique().tolist()
unique_summaries = books_df['description'].astype(str).unique()

In [16]:
len(unique_user_ids), len(unique_book_titles), len(unique_genres), len(unique_authors)

(15839, 71696, 102, 50079)

In [17]:
embedding_dimensions = 64 # 64

# Create a StringLookup layer for user_id
user_id_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids,
    mask_token=None,
    name='user_id_vocab'
)

# Create an Embedding layer for user_id
user_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(unique_user_ids) + 1, 
    output_dim=embedding_dimensions, 
    name='user_id_embedding'
)

# Create a StringLookup layer for book_title
book_title_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_book_titles,
    mask_token=None,
    name='book_title_vocab'
)

# Create an Embedding layer for book_title
book_title_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(unique_book_titles) + 1, 
    output_dim=embedding_dimensions, 
    name='book_title_embedding'
)

# Create a StringLookup layer for genre
book_genre_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_genres,
    mask_token=None,
    name='book_genre_vocab'
)

# Create an Embedding layer for genre
book_genre_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(unique_genres) + 1, 
    output_dim=embedding_dimensions, 
    name='book_genre_embedding'
)

# Create a StringLookup layer for authors
book_authors_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_authors,
    mask_token=None,
    name='book_authors_vocab'
)

# Create an Embedding layer for authors
book_authors_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(unique_authors) + 1, 
    output_dim=embedding_dimensions, 
    name='book_authors_embedding'
)

# Create a StringLookup layer for description
book_description_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=unique_summaries,
    mask_token=None,
    name='book_description_vocab'
)

# Create an Embedding layer for descriptions
book_description_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(unique_summaries) + 1, 
    output_dim=embedding_dimensions, 
    name='book_description_embedding'
)

# Save the StringLookUp layers!

In [None]:
config = {
    "vocabulary": unique_user_ids,
    "oov_token": user_id_vocab_layer.oov_token
}

with open('user_id_vocab_layer.json', 'w') as f:
    json.dump(config, f)

In [None]:
# A014566028TLL40XCY1YR

with open("user_id_vocab_layer.json", 'r') as f:
    config = json.load(f)

user_id_vocab_layer = tf.keras.layers.StringLookup(vocabulary=config['vocabulary'], oov_token=config['oov_token'])

input_data = "A3A48XEYWLWH7T"

output = user_id_vocab_layer(input_data)
print(output)

In [None]:
int(output)

In [None]:
# Initialize S3 client
import boto3
# import subprocess
# import os
# import pickle
# import joblib
# import tarfile
# import shutil
import sagemaker

role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
s3 = boto3.client('s3')

In [None]:
# s3://w210recsys/model/recModel/modelFiles/
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("book_authors_vocab_layer.json", bucket=bucket_name, key_prefix=key_prefix)

# Convert the information from books_df to be numerical using the string lookups

In [18]:
numerical_books_df = books_df[['title', 'author', 'genre_consolidated', 'description']].copy()
numerical_books_df.columns = ['title', 'authors', 'categories', 'description']

numerical_books_df = numerical_books_df.drop_duplicates(subset=['title'], keep='first')


numerical_books_df

Unnamed: 0,title,authors,categories,description
0,Jane Eyre (Everyman's Classics),Charlotte Brontë,Fiction / Mystery & Detective,Jane Eyre (1847) has enjoyed huge popularity s...
216,Pride & Prejudice (Penguin Classics),Ibi Zoboi,Young Adult Fiction / General,In a timely update of Jane Austen's Pride and ...
524,To Kill a Mockingbird,Harper Lee,Drama / General,Harper Lee's classic novel of a lawyer in the ...
954,Harry Potter and The Sorcerer's Stone,J. K. Rowling,Juvenile Fiction / Fantasy & Magic,Celebrate 20 years of Harry Potter magic! Harr...
1548,The Hobbit,J. R. R. Tolkien,Juvenile Fiction / General,Celebrating 75 years of one of the world's mos...
...,...,...,...,...
321296,Seeking His Mind (Voice from the Monastery),Michael Casey,Religion / General,"Michael Casey, a monk and scholar who has been..."
321297,Jane's Marine Propulsion,Keith Henderson,Technology & Engineering / General,Jane's Marine propulsion is an exceptional one...
321298,Across the Bridge,John Lewis,Political Science / General,Winner of the NAACP Image Award for Outstandin...
321299,In the Moons of Borea,Brian Lumley,Fiction / World Literature,Titus Crow and his companions continue their b...


In [19]:
# Apply the lookup transformation to each column

numerical_books_df['title'] = book_title_vocab_layer(numerical_books_df['title']).numpy()
numerical_books_df['authors'] = book_authors_vocab_layer(numerical_books_df['authors']).numpy()
numerical_books_df['categories'] = book_genre_vocab_layer(numerical_books_df['categories']).numpy()
numerical_books_df['description'] = book_description_vocab_layer(numerical_books_df['description']).numpy()

# Also add in the 'book_id'
numerical_books_df['book_id'] = [i for i in range(0, numerical_books_df.shape[0])]

numerical_books_df

Unnamed: 0,title,authors,categories,description,book_id
0,1,1,1,1,0
216,2,2,2,2,1
524,3,3,3,3,2
954,4,4,4,4,3
1548,5,5,5,5,4
...,...,...,...,...,...
321296,71692,50078,16,69555,71691
321297,71693,50079,69,69556,71692
321298,71694,37457,8,69557,71693
321299,71695,14930,6,69558,71694


In [20]:
numerical_sessionized_df = sessionized_df.copy()

numerical_sessionized_df.columns

Index(['user_id', 'liked_books', 'disliked_books', 'liked_genres',
       'disliked_genres', 'liked_authors', 'disliked_authors', 'liked_ratings',
       'disliked_ratings', 'target_book', 'target_book_rating', 'authors',
       'description', 'categories'],
      dtype='object')

In [21]:
len(numerical_sessionized_df)

15839

In [None]:
numerical_sessionized_df.head()

In [22]:
# Function to apply lookup layer in a vectorized manner
def fast_lookup_list(values, lookup_layer):
    values = values.apply(lambda x: x if isinstance(x, list) else ["UNKNOWN"])  # Ensure lists
    tensor_input = tf.ragged.constant(values.tolist(), dtype=tf.string)  # Convert to ragged tensor
    return lookup_layer(tensor_input).numpy().tolist()  # Apply lookup in batch and convert to list

# Apply lookup in bulk for better performance
numerical_sessionized_df['user_id'] = user_id_vocab_layer(numerical_sessionized_df['user_id'].astype(str)).numpy()

numerical_sessionized_df['liked_books'] = fast_lookup_list(numerical_sessionized_df['liked_books'], book_title_vocab_layer)
numerical_sessionized_df['disliked_books'] = fast_lookup_list(numerical_sessionized_df['disliked_books'], book_title_vocab_layer)

numerical_sessionized_df['liked_genres'] = fast_lookup_list(numerical_sessionized_df['liked_genres'], book_genre_vocab_layer)
numerical_sessionized_df['disliked_genres'] = fast_lookup_list(numerical_sessionized_df['disliked_genres'], book_genre_vocab_layer)

numerical_sessionized_df['liked_authors'] = fast_lookup_list(numerical_sessionized_df['liked_authors'], book_authors_vocab_layer)
numerical_sessionized_df['disliked_authors'] = fast_lookup_list(numerical_sessionized_df['disliked_authors'], book_authors_vocab_layer)

numerical_sessionized_df['target_book'] = book_title_vocab_layer(numerical_sessionized_df['target_book'].astype(str)).numpy()

numerical_sessionized_df['authors'] = book_authors_vocab_layer(numerical_sessionized_df['authors'].astype(str)).numpy()

numerical_sessionized_df['description'] = book_description_vocab_layer(numerical_sessionized_df['description'].astype(str)).numpy()

numerical_sessionized_df['categories'] = book_genre_vocab_layer(numerical_sessionized_df['categories'].astype(str)).numpy()

# Increment the ratings up by 1 to leave 0 to be the padding

def rating_shift(ratings):

    if type(ratings) == list:
        if len(ratings) == 0:
            return ratings

        return [entry + 1 for entry in ratings]

    else:

        return ratings + 1


numerical_sessionized_df['liked_ratings'] = numerical_sessionized_df['liked_ratings'].apply(lambda x: rating_shift(x))
numerical_sessionized_df['disliked_ratings'] = numerical_sessionized_df['disliked_ratings'].apply(lambda x: rating_shift(x))
numerical_sessionized_df['target_book_rating'] = numerical_sessionized_df['target_book_rating'].apply(lambda x: rating_shift(x))


In [24]:
numerical_sessionized_df.head()

Unnamed: 0,user_id,liked_books,disliked_books,liked_genres,disliked_genres,liked_authors,disliked_authors,liked_ratings,disliked_ratings,target_book,target_book_rating,authors,description,categories
0,1,"[6305, 21182, 20922, 658, 5764, 13248, 6148, 1...","[23325, 1007, 14131]","[60, 50, 27, 29, 34, 68, 10, 52, 6, 33, 53]","[36, 52, 85]","[9454, 12846, 4340, 4065, 4285, 458, 14942, 15...","[692, 10080, 16638]","[6.0, 6.0, 5.0, 5.0, 6.0, 6.0, 6.0, 5.0, 4.0, ...","[3.0, 3.0, 3.0]",56223,5.0,39401,54887,82
1,2,"[491, 444, 1432, 48930, 11, 10097, 197, 33041,...",[48343],"[5, 41, 19, 6, 100, 13]",[9],"[343, 69, 9, 7160, 34411, 183, 194, 147]",[34000],"[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0]",[2.0],607,6.0,420,583,6
2,3,"[859, 11445, 16211, 11151, 3102, 8855, 55148, ...","[41712, 16775, 21684, 15588, 27706, 33754]","[71, 23, 26, 40, 60, 29, 1, 98, 6, 53, 65]","[14, 73, 1, 98, 57, 90]","[7930, 593, 3621, 11559, 8144, 6283, 29217, 17...","[19662, 23940, 11967, 1245, 1767, 11111]","[4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0]",45074,5.0,31736,44138,29
3,4,"[20216, 61316, 61811, 10629, 1880, 4407, 4964]",[8057],"[79, 80, 17, 5, 1]",[76],"[14407, 2560, 42849, 1317, 2284]",[1681],"[5.0, 6.0, 6.0, 4.0, 6.0, 6.0, 6.0]",[3.0],9679,5.0,2284,9539,37
4,5,"[3505, 26398, 1380, 16599, 27875, 2200, 1101]",[31563],"[70, 29, 41, 38, 82, 33]",[58],"[18764, 967, 761, 1547, 2170, 2447, 19781]",[10506],"[6.0, 4.0, 6.0, 4.0, 6.0, 6.0, 6.0]",[2.0],27369,6.0,761,26892,70


In [25]:
HT = {}

for col in numerical_sessionized_df.columns:
    try:
        for entry in numerical_sessionized_df[col]:
            
            if col not in HT:
                HT[col] = [len(entry)]
            else:
                HT[col].append(len(entry))
    except:
        HT[col] = 1

So later down the line I noticed that we can't call our mode for inferences with ragged tensors since their sizes can vary and the model needs fixed sizes.

In [26]:
for col in HT.keys():
    print(col, int(np.percentile(HT[col], 90)))

user_id 1
liked_books 34
disliked_books 4
liked_genres 18
disliked_genres 3
liked_authors 30
disliked_authors 4
liked_ratings 34
disliked_ratings 4
target_book 1
target_book_rating 1
authors 1
description 1
categories 1


We need to consider what *most* of our samples' lengths are and add in some extra space for future inferences with more information. Since in the future I plan on having feed back on every 10 books or so I'll allocate ~20 slots for each list just to be safe.

In [27]:
HT['liked_books'] = 20
HT['disliked_books'] = 20

HT['liked_genres'] = 20
HT['disliked_genres'] = 20

HT['liked_authors'] = 20
HT['disliked_authors'] = 20

HT['liked_ratings'] = 20
HT['disliked_ratings'] = 20

HT

{'user_id': 1,
 'liked_books': 20,
 'disliked_books': 20,
 'liked_genres': 20,
 'disliked_genres': 20,
 'liked_authors': 20,
 'disliked_authors': 20,
 'liked_ratings': 20,
 'disliked_ratings': 20,
 'target_book': 1,
 'target_book_rating': 1,
 'authors': 1,
 'description': 1,
 'categories': 1}

In [28]:
def pad_numpy_array(column, max_len, padding_value=0):
    return column.apply(lambda x: np.pad(x[:max_len], (0, max_len - min(len(x), max_len)), constant_values=padding_value))

def pad_column(column, max_len, padding_value=0):
    return column.apply(lambda x: (x[:max_len] if len(x) > max_len else x + [padding_value] * (max_len - len(x))))

for column, max_len in HT.items():
    first_entry = numerical_sessionized_df[column].iloc[0]
    
    if isinstance(first_entry, np.ndarray):
        numerical_sessionized_df[column] = pad_numpy_array(numerical_sessionized_df[column], max_len)
        
    elif isinstance(first_entry, list):
        numerical_sessionized_df[column] = pad_column(numerical_sessionized_df[column], max_len)
        

In [29]:
numerical_sessionized_df.head()

Unnamed: 0,user_id,liked_books,disliked_books,liked_genres,disliked_genres,liked_authors,disliked_authors,liked_ratings,disliked_ratings,target_book,target_book_rating,authors,description,categories
0,1,"[6305, 21182, 20922, 658, 5764, 13248, 6148, 1...","[23325, 1007, 14131, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[60, 50, 27, 29, 34, 68, 10, 52, 6, 33, 53, 0,...","[36, 52, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9454, 12846, 4340, 4065, 4285, 458, 14942, 15...","[692, 10080, 16638, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6.0, 6.0, 5.0, 5.0, 6.0, 6.0, 6.0, 5.0, 4.0, ...","[3.0, 3.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",56223,5.0,39401,54887,82
1,2,"[491, 444, 1432, 48930, 11, 10097, 197, 33041,...","[48343, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[5, 41, 19, 6, 100, 13, 0, 0, 0, 0, 0, 0, 0, 0...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[343, 69, 9, 7160, 34411, 183, 194, 147, 0, 0,...","[34000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, ...","[2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",607,6.0,420,583,6
2,3,"[859, 11445, 16211, 11151, 3102, 8855, 55148, ...","[41712, 16775, 21684, 15588, 27706, 33754, 0, ...","[71, 23, 26, 40, 60, 29, 1, 98, 6, 53, 65, 0, ...","[14, 73, 1, 98, 57, 90, 0, 0, 0, 0, 0, 0, 0, 0...","[7930, 593, 3621, 11559, 8144, 6283, 29217, 17...","[19662, 23940, 11967, 1245, 1767, 11111, 0, 0,...","[4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 0, 0, 0, 0, 0, ...",45074,5.0,31736,44138,29
3,4,"[20216, 61316, 61811, 10629, 1880, 4407, 4964,...","[8057, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[79, 80, 17, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[14407, 2560, 42849, 1317, 2284, 0, 0, 0, 0, 0...","[1681, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.0, 6.0, 6.0, 4.0, 6.0, 6.0, 6.0, 0, 0, 0, 0...","[3.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",9679,5.0,2284,9539,37
4,5,"[3505, 26398, 1380, 16599, 27875, 2200, 1101, ...","[31563, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[70, 29, 41, 38, 82, 33, 0, 0, 0, 0, 0, 0, 0, ...","[58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[18764, 967, 761, 1547, 2170, 2447, 19781, 0, ...","[10506, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6.0, 4.0, 6.0, 4.0, 6.0, 6.0, 6.0, 0, 0, 0, 0...","[2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",27369,6.0,761,26892,70


In [31]:
num_train_df, num_test_df = train_test_split(numerical_sessionized_df, test_size=0.3, random_state=42)

In [32]:
num_test_df.head()

Unnamed: 0,user_id,liked_books,disliked_books,liked_genres,disliked_genres,liked_authors,disliked_authors,liked_ratings,disliked_ratings,target_book,target_book_rating,authors,description,categories
12217,12218,"[31971, 25199, 33441, 6756, 5111, 7647, 45010,...","[22758, 41342, 35391, 5333, 14016, 35968, 4504...","[45, 81, 57, 34, 56, 30, 62, 26, 17, 39, 42, 1...","[30, 77, 14, 29, 1, 41, 84, 39, 6, 44, 9, 13, ...","[11605, 31693, 203, 3480, 10202, 14724, 8219, ...","[26439, 464, 8691, 39723, 7036, 13069, 2809, 2...","[4.0, 4.0, 4.0, 6.0, 6.0, 6.0, 4.0, 5.0, 5.0, ...","[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...",12991,4.0,9261,12793,7
169,170,"[22411, 700, 5840, 63, 26701, 16590, 12878, 74...","[7074, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[48, 5, 70, 12, 66, 10, 6, 100, 78, 28, 0, 0, ...","[10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[16000, 2286, 4121, 475, 44, 9185, 517, 5373, ...","[5014, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 6.0, 5.0, ...","[2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",22052,6.0,10690,21701,48
2099,2100,"[443, 442, 354, 1191, 2908, 4488, 3, 0, 0, 0, ...","[285, 2897, 558, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[30, 3, 2, 48, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[2, 9, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1211, 313, 3, 56, 206, 82, 0, 0, 0, 0, 0, 0, ...","[388, 206, 2020, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[6.0, 6.0, 5.0, 6.0, 5.0, 5.0, 6.0, 0, 0, 0, 0...","[3.0, 3.0, 2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",105,5.0,82,101,6
4948,4949,"[682, 1061, 20025, 40, 253, 408, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 20, 9, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[32, 187, 733, 14267, 289, 470, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.0, 4.0, 4.0, 6.0, 6.0, 6.0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",470,6.0,333,449,30
5771,5772,"[380, 37443, 858, 8254, 16387, 927, 8476, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[16, 20, 4, 68, 77, 55, 0, 0, 0, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[11681, 636, 592, 267, 3826, 5841, 6011, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6.0, 6.0, 6.0, 6.0, 4.0, 6.0, 6.0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",43443,6.0,267,42562,55


## Save need data files

In [33]:
print("🚨 Starting dataset creation...")

num_train_ds = tf.data.Dataset.from_tensor_slices({
    'user_id': tf.constant(num_train_df['user_id'].tolist(), dtype=tf.int64),
    'liked_books': tf.constant(num_train_df['liked_books'].tolist(), dtype=tf.int64),
    'disliked_books': tf.constant(num_train_df['disliked_books'].tolist(), dtype=tf.int64),
    'liked_genres': tf.constant(num_train_df['liked_genres'].tolist(), dtype=tf.int64),
    'disliked_genres': tf.constant(num_train_df['disliked_genres'].tolist(), dtype=tf.int64),
    'liked_authors': tf.constant(num_train_df['liked_authors'].tolist(), dtype=tf.int64),
    'disliked_authors': tf.constant(num_train_df['disliked_authors'].tolist(), dtype=tf.int64),
    'liked_ratings': tf.constant(num_train_df['liked_ratings'].tolist(), dtype=tf.float32),
    'disliked_ratings': tf.constant(num_train_df['disliked_ratings'].tolist(), dtype=tf.float32),
    'target_book': tf.constant(num_train_df['target_book'], dtype=tf.int64),
    'authors': tf.constant(num_train_df['authors'], dtype=tf.int64),
    'description': tf.constant(num_train_df['description'], dtype=tf.int64),
    'categories': tf.constant(num_train_df['categories'], dtype=tf.int64),
    'target_book_rating': tf.constant(num_train_df['target_book_rating'], dtype=tf.float32),
})

num_test_ds = tf.data.Dataset.from_tensor_slices({
    'user_id': tf.constant(num_test_df['user_id'].tolist(), dtype=tf.int64),
    'liked_books': tf.constant(num_test_df['liked_books'].tolist(), dtype=tf.int64),
    'disliked_books': tf.constant(num_test_df['disliked_books'].tolist(), dtype=tf.int64),
    'liked_genres': tf.constant(num_test_df['liked_genres'].tolist(), dtype=tf.int64),
    'disliked_genres': tf.constant(num_test_df['disliked_genres'].tolist(), dtype=tf.int64),
    'liked_authors': tf.constant(num_test_df['liked_authors'].tolist(), dtype=tf.int64),
    'disliked_authors': tf.constant(num_test_df['disliked_authors'].tolist(), dtype=tf.int64),
    'liked_ratings': tf.constant(num_test_df['liked_ratings'].tolist(), dtype=tf.float32),
    'disliked_ratings': tf.constant(num_test_df['disliked_ratings'].tolist(), dtype=tf.float32),
    'target_book': tf.constant(num_test_df['target_book'], dtype=tf.int64),
    'authors': tf.constant(num_test_df['authors'], dtype=tf.int64),
    'description': tf.constant(num_test_df['description'], dtype=tf.int64),
    'categories': tf.constant(num_test_df['categories'], dtype=tf.int64),
    'target_book_rating': tf.constant(num_test_df['target_book_rating'], dtype=tf.float32),
})

print("✅ Datasets successfully created!")


🚨 Starting dataset creation...
✅ Datasets successfully created!


In [34]:
for example in num_train_ds.take(1):

    print(example)

    break

{'user_id': <tf.Tensor: shape=(), dtype=int64, numpy=3131>, 'liked_books': <tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  423,  2903,   526, 30042, 21932,  9074,   209,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])>, 'disliked_books': <tf.Tensor: shape=(20,), dtype=int64, numpy=
array([29071, 34391,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])>, 'liked_genres': <tf.Tensor: shape=(20,), dtype=int64, numpy=
array([26, 60, 81, 58, 52,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])>, 'disliked_genres': <tf.Tensor: shape=(20,), dtype=int64, numpy=
array([36, 53,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])>, 'liked_authors': <tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  366, 21297,  6445,   188, 15652,   155,  2022,     0,     0,
           0,     0,     0,     0,    

In [35]:
num_train_ds_limited = num_train_ds#.take(500)  # Limit to 1000 samples
num_test_ds_limited = num_test_ds#.take(500)  # Limit to 500 samples

num_train_ds_cached = num_train_ds_limited.batch(128).cache()#.batch(128).cache()
num_test_ds_cached = num_test_ds_limited.batch(128).cache()

## Build Twin-Tower Model

Compared to the original version of our twin tower model, I want to simplify the input scheme to the user and book towers and incorporate some additional features for user embedding. Furthermore, we want to pass in sessionized user data separately from book data to each tower.

In [36]:
import tensorflow_recommenders as tfrs
import tensorflow.keras.layers as layers
from typing import Dict
import time

import boto3
import io
import sagemaker

class BooksTwoTowersModel(tfrs.Model):
    def __init__(self, user_data: pd.DataFrame, book_metadata: pd.DataFrame, embedding_dimensions=256):
        super().__init__()

        self.dense_projection_user = tf.keras.layers.Dense(embedding_dimensions, name='user_dense_projection')
        
        self.dense_projection_book = tf.keras.layers.Dense(embedding_dimensions, name='book_dense_projection')

        self.user_model = UserModel(user_data, book_metadata, self.dense_projection_user, embedding_dimensions)

        self.book_model = BookModel(book_metadata, embedding_dimensions, 10000, self.dense_projection_book)

        self.candidate_ds = tf.data.Dataset.from_tensor_slices({
            'title': tf.convert_to_tensor(book_metadata['title'].values, dtype=tf.int64),
            'authors': tf.convert_to_tensor(book_metadata['authors'].values, dtype=tf.int64),
            'description': tf.convert_to_tensor(book_metadata['description'].values, dtype=tf.int64),
            'categories': tf.convert_to_tensor(book_metadata['categories'].values, dtype=tf.int64)
        })

        candidates = self.candidate_ds.batch(1).map(
            lambda x: self.book_model(x), num_parallel_calls=tf.data.AUTOTUNE
        ).map(
            lambda x: tf.squeeze(x, axis=0)
        )

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates.batch(1),
                ks=(10, 20, 50)
            )
        )

        self.full_book_embeddings = None
        self.full_book_embeddings_copy = None

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:

        user_embeddings = self.user_model(features)

        target_book_embeddings = self.book_model(features)

        retrieval_loss = self.task(user_embeddings, target_book_embeddings, compute_metrics=not training)

        return retrieval_loss
    

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


### Book and User model

In [37]:
class BookModel(tf.keras.Model):
    '''
    The book(query) tower that processes book data.
    '''

    def __init__(self, book_data: pd.DataFrame, embedding_dimensions: int, text_vectorization_max_tokens: int, dense_projection_book): #, book_title_weight_layer, book_author_weight_layer, book_genre_weight_layer):
        '''
        :param book_data: DataFrame containing book information.
        :param embedding_dimensions: Number of dimensions in embedding layer.
        :param text_vectorization_max_tokens: Maximum number of tokens to vector.
        '''
        super().__init__()

        # Extract unique values for embeddings
        self.feature_book_title_name = "title"
        self.feature_author_name = "authors"
        self.feature_genre_name = "categories"

        unique_titles = book_data[self.feature_book_title_name].astype(str).unique()
        unique_authors = book_data[self.feature_author_name].astype(str).unique()
        unique_genres = book_data[self.feature_genre_name ].astype(str).unique()

        self.dense_projection_book = dense_projection_book
        
        # Book Title embedding
        self.book_title_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_titles) + 1, output_dim=embedding_dimensions, name='book_title_embedding')

        # Book Author embedding
        self.book_author_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_authors) + 1, output_dim=embedding_dimensions, name='book_author_embedding')

        # Book Genere embedding
        self.book_genre_emdedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_genres) + 1, output_dim=embedding_dimensions, name='book_genre_embedding')
    
        # print("Finsihed setting up book tower\n")

    def call(self, book_data: Dict[str, tf.Tensor]) -> tf.Tensor:
        
        # Handle case where 'target_book' might not exist
        try:
            if len(book_data['target_book'].shape) == 0:
                book_data['target_book'] = tf.expand_dims(book_data['target_book'], axis=0)
            
            book_title_embed = self.book_title_embedding_layers(book_data['target_book'])
        except KeyError:
            if len(book_data['title'].shape) == 0:
                book_data['title'] = tf.expand_dims(book_data['title'], axis=0)
                
            book_title_embed = self.book_title_embedding_layers(book_data['title'])
        
        if len(book_data['authors'].shape) == 0:
            book_data['authors'] = tf.expand_dims(book_data['authors'], axis=0)
            book_data['categories'] = tf.expand_dims(book_data['categories'], axis=0)
            
        book_author_embed = self.book_author_embedding_layers(book_data['authors'])
        book_genre_embed = self.book_genre_emdedding_layers(book_data['categories'])
        
        # Concatenation without expand_dims
        concatenated_embeddings = tf.concat([
            book_title_embed,
            book_author_embed,
            book_genre_embed
            ], axis=-1)  # Use last axis for feature concat
    
        # Apply projection to 64D embedding
        projected_embeddings = self.dense_projection_book(concatenated_embeddings)
    
        return projected_embeddings

In [38]:
class UserModel(tf.keras.Model):
    def __init__(self, user_data: pd.DataFrame, book_metadata: pd.DataFrame, dense_projection_user, embedding_dimensions=64):
        super().__init__()

        # Extract unique values from user and book metadata
        unique_user_ids = user_data['user_id'].astype(str).unique().tolist()
        unique_book_titles = book_metadata['title'].astype(str).unique().tolist()
        unique_genres = book_metadata['categories'].astype(str).unique().tolist()
        unique_authors = book_metadata['authors'].astype(str).unique().tolist()

        self.dense_projection_user = dense_projection_user
        
        # User embedding
        self.user_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_user_ids) + 1, output_dim=embedding_dimensions, name='user_id_embedding')

        # Book embedding
        self.book_title_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_book_titles) + 1, output_dim=embedding_dimensions, name='book_embedding')

        # Genre embedding
        self.genre_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_genres) + 1, output_dim=embedding_dimensions, name='genre_embedding')

        # Author embedding
        self.author_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_authors) + 1, output_dim=embedding_dimensions, name='author_embedding')
        
        # print("Finsihed setting up user tower\n")
    
    def call(self, inputs):
        
        try:
            
            if len(inputs['user_id']) > 1:
                # tf.print("Entered regular path")
                pass
            else:
                x = 1/0
            
        except:

            temp = inputs
            
            try:
                inputs = {
                    'user_id': tf.expand_dims(temp[0][0], axis = 0),
                    'liked_books': tf.expand_dims(temp[1], axis = 0),
                    'disliked_books': tf.expand_dims(temp[2], axis = 0),
                    'liked_genres': tf.expand_dims(temp[3], axis = 0),
                    'disliked_genres': tf.expand_dims(temp[4], axis = 0),
                    'liked_authors': tf.expand_dims(temp[5], axis = 0),
                    'disliked_authors': tf.expand_dims(temp[6], axis = 0),
                    'liked_ratings': tf.expand_dims(temp[7], axis = 0),
                    'disliked_ratings': tf.expand_dims(temp[8], axis = 0)
                }
            except:
                temp = [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

                inputs = {
                    'user_id': tf.expand_dims(temp[0][0], axis = 0),
                    'liked_books': tf.expand_dims(temp[1], axis = 0),
                    'disliked_books': tf.expand_dims(temp[2], axis = 0),
                    'liked_genres': tf.expand_dims(temp[3], axis = 0),
                    'disliked_genres': tf.expand_dims(temp[4], axis = 0),
                    'liked_authors': tf.expand_dims(temp[5], axis = 0),
                    'disliked_authors': tf.expand_dims(temp[6], axis = 0),
                    'liked_ratings': tf.expand_dims(temp[7], axis = 0),
                    'disliked_ratings': tf.expand_dims(temp[8], axis = 0)
                }
        
        user_embed = self.user_embedding_layers(inputs['user_id'])

        def pool_embeddings(embedding_layer, input_list, weights, embedding_dim=64, pad_value=0):
            # Get embeddings
            embeddings = embedding_layer(input_list)
        
            # Create mask
            mask = tf.not_equal(input_list, pad_value)
            mask = tf.expand_dims(mask, axis=-1)
        
            # Zero out padded embeddings
            embeddings = tf.where(mask, embeddings, tf.zeros_like(embeddings))
        
            # Normalize weights (zero-safe)
            weight_sum = tf.reduce_sum(weights, axis=-1, keepdims=True)
            weight_sum = tf.where(weight_sum == 0, tf.ones_like(weight_sum), weight_sum)
            weights = weights / weight_sum
            
            # Expand weights dims
            expanded_weights = tf.expand_dims(weights, axis=-1)
            
            # Weighted Embeddings
            weighted_embeddings = embeddings * expanded_weights
        
            # Sum + Pool
            summed_embeddings = tf.reduce_sum(weighted_embeddings, axis=1)
            valid_counts = tf.reduce_sum(tf.cast(mask, tf.float32), axis=1)
            valid_counts = tf.where(valid_counts == 0, tf.ones_like(valid_counts), valid_counts)
            pooled_embeddings = summed_embeddings / valid_counts
        
            # Fix NaNs
            pooled_embeddings = tf.where(tf.math.is_nan(pooled_embeddings), tf.zeros_like(pooled_embeddings), pooled_embeddings)
        
            return pooled_embeddings


        
        # Process liked books
        liked_books_embed = pool_embeddings(self.book_title_embedding_layers, inputs['liked_books'], inputs['liked_ratings'])

        # Process disliked books
        # disliked_books_embed = pool_embeddings(self.book_title_embedding_layers, inputs['disliked_books'], inputs['disliked_ratings'])

        # Process liked genres
        liked_genres_embed = pool_embeddings(self.genre_embedding_layers, inputs['liked_genres'], inputs['liked_ratings'])

        # Process disliked genres
        # disliked_genres_embed = pool_embeddings(self.genre_embedding_layers, inputs['disliked_genres'], inputs['disliked_ratings'])

        # Process liked authors
        liked_authors_embed = pool_embeddings(self.author_embedding_layers, inputs['liked_authors'], inputs['liked_ratings'])

        # Process disliked authors
        # disliked_authors_embed = pool_embeddings(self.author_embedding_layers, inputs['disliked_authors'], inputs['disliked_ratings'])

        # Concatenate everything into a single user representation
        try:
            concatenated_embeddings = tf.concat([
                user_embed,
                liked_books_embed,
                # disliked_books_embed,
                liked_genres_embed,
                # disliked_genres_embed,
                liked_authors_embed,
                # disliked_authors_embed
            ], axis=1)
        except:
            return inputs

        projected_embeddings = self.dense_projection_user(concatenated_embeddings)

        # print(f"projected_embeddings.shape: {projected_embeddings.shape}\n")
        
        return projected_embeddings

In [39]:
# Compile Model

# Enable Multi-GPU Training
strategy = tf.distribute.MirroredStrategy()
print(f"Number of GPUs being used: {strategy.num_replicas_in_sync}")

tf.config.run_functions_eagerly(True)

with strategy.scope():
    model = BooksTwoTowersModel(
        user_data=numerical_sessionized_df,
        book_metadata=numerical_books_df,
        embedding_dimensions=64, # Change this for embedding sizes to change (64 default val)
    )
    model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of GPUs being used: 1




In [40]:
model.fit(num_train_ds_cached, epochs=7)

Epoch 1/7


2025-04-14 00:39:52.879084: W tensorflow/core/framework/dataset.cc:993] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


Epoch 2/7
 1/87 [..............................] - ETA: 5s - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_20_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - loss: 606.6939 - regularization_loss: 0.0000e+00 - total_loss: 606.6939

2025-04-14 00:39:58.705267: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


Epoch 3/7
 1/87 [..............................] - ETA: 5s - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_20_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - loss: 343.1006 - regularization_loss: 0.0000e+00 - total_loss: 343.1006

2025-04-14 00:40:04.309310: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


Epoch 4/7
Epoch 5/7
 1/87 [..............................] - ETA: 5s - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_20_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - loss: 28.5067 - regularization_loss: 0.0000e+00 - total_loss: 28.5067

2025-04-14 00:40:15.661676: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


Epoch 6/7
Epoch 7/7
 2/87 [..............................] - ETA: 5s - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_20_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - loss: 2.0063 - regularization_loss: 0.0000e+00 - total_loss: 2.0063

2025-04-14 00:40:27.152387: W tensorflow/core/framework/dataset.cc:993] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.




<tf_keras.src.callbacks.History at 0x7fc1a2e8ca90>

See if you can get recommendations directly with filters

## Save model.user_model

In [None]:
# import tensorflow as tf
# import sys

# # Create a function to explicitly route tf.print to stdout
# # def tf_print_to_stdout(msg):
# #     tf.print(msg, output_stream=sys.stdout)

# #tf_print_to_stdout("Saving model now...")
# model.user_model.save('model.tar.gz')

In [None]:
# Convert model.user_model to .tar.gz type file. 
# Pack model and artifacts into tar.gz
import tarfile
import os

#tarfile_name = "model.tar.gz"
model.user_model.save("export/Servo/1")
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("export")

In [None]:
# Initialize S3 client
import boto3
import subprocess
import os
import pickle
import joblib
import tarfile
import shutil
import sagemaker

role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
s3 = boto3.client('s3')

In [None]:
# Save model.tar.gz to required S3 bucket
# s3://w210recsys/testModels/model.tar.gz
bucket_name="w210recsys"

## Real Model
# key_prefix="model/recModel"

# Test Model
key_prefix = "testModels"
s3_response = sm_session.upload_data("model.tar.gz", bucket=bucket_name, key_prefix=key_prefix)

### Model Prediction Pipeline

In [88]:
# ['Fiction / Mystery & Detective', 'Young Adult Fiction / General',
#        'Drama / General', 'Juvenile Fiction / Fantasy & Magic',
#        'Juvenile Fiction / General', 'Fiction / World Literature',
#        'Fiction / Romance', 'Political Science / General',
#        'Fiction / Literary', 'Business & Economics / General',
#        'Juvenile Fiction / Legends, Myths, Fables',
#        'Juvenile Fiction / Science Fiction',
#        'Fiction / Fairy Tales, Folk Tales, Legends & Mythology',
#        'Fiction / Science Fiction', 'Fiction / Classics',
#        'Religion / General', 'Fiction / General', 'Fiction / Ghost',
#        'Fiction / Action & Adventure', 'Juvenile Nonfiction / General',
#        'Juvenile Fiction / Fairy Tales & Folklore',
#        'Fiction / War & Military', 'History / Maritime History & Piracy',
#        'Juvenile Fiction / Thrillers & Suspense',
#        'Comics & Graphic Novels / Superheroes',
#        'Literary Criticism / General', 'Science / General',
#        'Reference / General', 'History / General',
#        'Fiction / Occult & Supernatural', 'Philosophy / General',
#        'Computers / General',
#        'Biography & Autobiography / Personal Memoirs', 'Art / General',
#        'Fiction / Visionary & Metaphysical',
#        'Family & Relationships / General', 'Fiction / Thrillers',
#        'Health & Fitness / General', 'Fiction / Anthologies',
#        'Biography & Autobiography / General', 'Fiction / Sea Stories',
#        'Fiction / Erotica', 'Fiction / Sagas',
#        'Fiction / Magical Realism', 'Fiction / Biographical',
#        'History / Expeditions & Discoveries', 'Education / General',
#        'Juvenile Fiction / Nursery Rhymes', 'Humor / Topic',
#        'Nature / General', 'True Crime / Murder', 'Psychology / General',
#        'Social Science / General', 'Photography / General',
#        'Religion / Theology', 'Fiction / Dystopian',
#        'History / Wars & Conflicts', 'Body, Mind & Spirit / General',
#        'Fiction / Short Stories', 'History / Social History',
#        'Games & Activities / General', 'Fiction / Family Life',
#        'Comics & Graphic Novels / General', 'Fiction / City Life',
#        'Biography & Autobiography / Literary Figures',
#        'Juvenile Fiction / Short Stories', 'Fiction / Crime',
#        'Travel / Essays & Travelogues',
#        'Technology & Engineering / General', 'Drama / Shakespeare',
#        'History / Historiography', 'Bibles / General',
#        'History / Indigenous Peoples of the Americas',
#        'Cooking / Individual Chefs & Restaurants',
#        'Performing Arts / General', 'Fiction / Noir', 'Poetry / General',
#        'History / Military', 'Cooking / General', 'Travel / General',
#        'Music / General', 'Sports & Recreation / General',
#        'True Crime / General', 'Religion / Christian Theology',
#        'Language Arts & Disciplines / General',
#        'Crafts & Hobbies / General', 'Pets / General',
#        'Young Adult Nonfiction / General', 'House & Home / General',
#        'Literary Collections / General', 'Humor / General',
#        'Antiques & Collectibles / General', 'Study Aids / General',
#        'Foreign Language Study / General', 'Medical / General',
#        'Law / General', 'Mathematics / General',
#        'History / Historical Geography', 'Architecture / General',
#        'Transportation / General', 'Gardening / General',
#        'Design / General']

# Profile one
liked_genres = ['Mathematics / General', 'Computers / General']
disliked_genres = ['History / Military']

# Profile two
# liked_genres = ['Religion / General' [ 'Medical / General']
# disliked_genres = [] # base has some 'Family & Relationships / General' books
# disliked_genres = ['Family & Relationships / General'] # Commenting this in actually gave more of these as recs????

# Profile three
# liked_genres = ['Law / General', 'True Crime / General']
# disliked_genres = []

# Profile four
# liked_genres = ['Performing Arts / General', 'Humor / Topic']
# disliked_genres = []

# Profile five (Popular)
# liked_genres = ['Religion / General'] # ['Fiction / World Literature', 
# disliked_genres = []

# Profile five (Un-Popular)
# liked_genres = ['Study Aids / General', 'Young Adult Nonfiction / General']
# disliked_genres = []

# Profile six (Un-Popular)
# liked_genres = ['Nature / General', ' Health & Fitness / General']
# disliked_genres = []


In [89]:
sampled_books = books_df[books_df['genre_consolidated'].isin(liked_genres)].sample(5)

sampled_books

Unnamed: 0,user_id,book_id,title,author,publish_year,description,preview_link,normalized_popularity,genre_general,genre_specific,genre_combined,genre_consolidated,review_helpfulness,review_score,review_time,review_summary,review_text
14568,A1RAFRNWOWMC3D,83,The Notebook,Jean Andrews,2013,"This step-by-step, highly visual text provides...",http://books.google.nl/books?id=bUILAAAAQBAJ&p...,0.787122,Computers,Hardware,Computers / Hardware,Computers / General,0,5.0,2002-10-28,The Power of Love,The Notebook is the love story of Noah and All...
274404,A13586UCKKITJZ,35697,Learning SQL: A Step-By-Step Guide Using Oracle,Richard W. Earp,2002,Step-by-step examples and exercises help reade...,http://books.google.nl/books?id=706LQAAACAAJ&d...,0.15715,Computers,Database Administration & Management,Computers / Database Administration & Management,Computers / General,12,5.0,2002-07-02,A Wonderful Book That Clearly Explains Unclear...,Despite the pile Oracle Certified Professional...
186922,A1V0MM7VDPMPU8,9039,Introductory Real Analysis (Dover Books on Mat...,A. N. Kolmogorov,1975,"Comprehensive, elementary introduction to real...",http://books.google.com/books?id=U_FIAwAAQBAJ&...,0.33105,Mathematics,Functional Analysis,Mathematics / Functional Analysis,Mathematics / General,8,1.0,2008-07-09,DON'T buy this book!,This is not the original work. Just like Emily...
141138,A1PRZY9391ES5F,3985,Learning Java (The Java Series),Jamie Chan,2016,"(2018 Edition, Updated for Netbeans 9.0) Learn...",http://books.google.nl/books?id=GohfvwEACAAJ&d...,0.424547,Computers,Programming,Computers / Programming,Computers / General,79,2.0,2002-10-19,Not a tutorial and not for new programmers,"I had purchased ""Learning Java"" out of the con..."
207113,A2K0JZN74WFCYO,12575,SQL Server 2005 T-SQL Recipes: A Problem-Solut...,Joseph Sack,2006,"* Comprehensive T-SQL Coverage, including all ...",http://books.google.com/books?id=5_AEiJXbyiEC&...,0.294963,Computers,Database Administration & Management,Computers / Database Administration & Management,Computers / General,5,5.0,2006-05-05,No BS type of book,This book is a very good book for updating ski...


In [95]:
# Encode each book

sampled_encoded_liked_title = book_title_vocab_layer(sampled_books['title']).numpy()
sampled_encoded_liked_authors = book_authors_vocab_layer(sampled_books['author']).numpy()
sampled_encoded_liked_genres = book_genre_vocab_layer(sampled_books['genre_consolidated']).numpy()

sampled_encoded_disliked_title = np.array([], dtype='int64')
sampled_encoded_disliked_authors = np.array([], dtype='int64')
sampled_encoded_disliked_genres = np.array([], dtype='int64')

sampled_liked_rating = [6 for title in sampled_encoded_liked_title]

encoded_disliked_genres = []

for genre in disliked_genres:
    encoded_disliked_genres.append(book_genre_vocab_layer(genre).numpy())
    
disliked_ratings = [1 for title in encoded_disliked_genres]

# sampled_encoded_liked_title, sampled_encoded_liked_authors, sampled_encoded_liked_genres, sampled_liked_rating, encoded_disliked_genres, disliked_ratings


In [96]:
sampled_encoded_liked_title

array([   84, 35698,  9040,  3986, 12576])

In [97]:
sampled_encoded_disliked_title 

array([], dtype=int64)

In [98]:
sample_user_info = ({
    'user_id': [0], # Doesn't matter
    'liked_books': sampled_encoded_liked_title.tolist(),
    'disliked_books': [],

    'liked_genres': sampled_encoded_liked_genres.tolist(),
    'disliked_genres': encoded_disliked_genres,
    
    'liked_authors': sampled_encoded_liked_authors.tolist(),
    'disliked_authors':[],
    
    'liked_ratings': sampled_liked_rating,
    'disliked_ratings': disliked_ratings,
})

sample_user = []

for col in sample_user_info:

    # print(col, sample_user_info[col])
    
    sample_user_info[col].extend([0]*(20 - len(sample_user_info[col])))

    # print(sample_user_info[col])
    
    sample_user.append(sample_user_info[col])

sample_user = tf.cast(sample_user, tf.float32)

In [99]:
# model.user_model.predict([[i for i in range(20)], [i for i in range(20)], [i for i in range(20)]])

user_embedding = model.user_model.predict(sample_user)

# user_embedding



2025-04-14 01:15:32.503136: W tensorflow/core/framework/dataset.cc:993] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.
2025-04-14 01:15:32.522531: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]


Extract the books' embeddings

In [60]:
book_tower = model.book_model

book_dataset = tf.data.Dataset.from_tensor_slices({
    'target_book': tf.constant(numerical_books_df['title'].tolist(), dtype=tf.int64),
    'authors': tf.constant(numerical_books_df['authors'].tolist(), dtype=tf.int64),
    'categories': tf.constant(numerical_books_df['categories'].tolist(), dtype=tf.int64),
    'description': tf.constant(numerical_books_df['description'].tolist(), dtype=tf.int64),
}).batch(128)  # Optional batching

book_embeddings = []

for batch in book_dataset:
    batch_embeddings = book_tower(batch)
    book_embeddings.append(batch_embeddings)

book_embeddings = tf.concat(book_embeddings, axis=0)
print("Book Embeddings Shape:", book_embeddings.shape)

books_df_unique['Embeddings'] = [embed.numpy() for embed in book_embeddings]

books_df_unique.shape

Book Embeddings Shape: (71696, 64)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(71696, 18)

In [61]:
books_df_unique.columns

Index(['user_id', 'book_id', 'title', 'author', 'publish_year', 'description',
       'preview_link', 'normalized_popularity', 'genre_general',
       'genre_specific', 'genre_combined', 'genre_consolidated',
       'review_helpfulness', 'review_score', 'review_time', 'review_summary',
       'review_text', 'Embeddings'],
      dtype='object')

In [100]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

filtered_books_df_unique = books_df_unique

book_embeddings = np.array(filtered_books_df_unique['Embeddings'].tolist())

# Normalize user embedding to unit vector (L2 normalization)
# user_embedding /= np.linalg.norm(user_embedding)

# Normalize all book embeddings to unit vectors (L2 normalization)
# book_embeddings /= np.linalg.norm(book_embeddings, axis=1, keepdims=True)

# Calculate the cosine similarity between user_embedding and all book embeddings
cos_similarities = cosine_similarity(user_embedding.reshape(1, -1), book_embeddings)

# Get the indices of the top 6 closest books
top_k_indices = np.argsort(cos_similarities[0])[-40:][::-1]  # Top 6 indices with highest similarity

# Print the results (book indices and cosine similarity scores)
for i, idx in enumerate(top_k_indices):

    print(f"{filtered_books_df_unique.iloc[idx]['title']} | {filtered_books_df_unique.iloc[idx]['genre_consolidated']}")
    # print(f"Recommendation {i+1}: Book Index {idx} (Cosine Similarity: {cos_similarities[0][idx]:.4f})")


Murach's ASP.NET 2.0 Web Programming with VB 2005 | Computers / General
CFS traced to childhood trauma, emotional instability, stress.(Across Specialties)(chronic fatigue syndrome): An article from: Clinical Psychiatry News | Medical / General
Java Design Patterns: A Tutorial | Computers / General
Enterprise Messaging Using JMS and IBM WebSphere | Computers / General
Blues Harmonica Collection | Music / General
GMAT for Dummies | Study Aids / General
Huntington Beach (CA) (Images of America) | History / Expeditions & Discoveries
Photoshop for Right-Brainers: The Art of Photo Manipulation | Computers / General
Sams Teach Yourself CSS in 24 Hours (Sams Teach Yourself...in 24 Hours) | Computers / General
The Invisible Computer: Why Good Products Can Fail, the Personal Computer Is So Complex, and Information Appliances Are the Solution | Computers / General
Little Digital Video Book | Computers / General
What Just Happened: A Chronicle from the Information Frontier | Computers / General
St

Mimic subsequent requests

In [101]:
# Map recs to each category

# Murach's ASP.NET 2.0 Web Programming with VB 2005 | Computers / General
# CFS traced to childhood trauma, emotional instability, stress.(Across Specialties)(chronic fatigue syndrome): An article from: Clinical Psychiatry News | Medical / General
# Enterprise Messaging Using JMS and IBM WebSphere | Computers / General
# GMAT for Dummies | Study Aids / General
# Java Design Patterns: A Tutorial | Computers / General
# Huntington Beach (CA) (Images of America) | History / Expeditions & Discoveries

sub_liked_books = [
    "Enterprise Messaging Using JMS and IBM WebSphere",
    "Murach's ASP.NET 2.0 Web Programming with VB 2005"
]
sub_disliked_books = [
    "CFS traced to childhood trauma, emotional instability, stress.(Across Specialties)(chronic fatigue syndrome): An article from: Clinical Psychiatry News"
]

# Next, extract the encoded metadata of each book

sub_book_titles = book_title_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['title']).numpy()
sampled_encoded_liked_title = np.concatenate([sampled_encoded_liked_title, sub_book_titles])

sub_book_authors = book_authors_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['author']).numpy()
sampled_encoded_liked_authors = np.concatenate([sampled_encoded_liked_authors, sub_book_authors])

sub_book_genres = book_genre_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['genre_consolidated']).numpy()
sampled_encoded_liked_genres = np.concatenate([sampled_encoded_liked_genres, sub_book_genres])

#####

sub_book_titles_dis = book_title_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_disliked_books)]['title']).numpy()
sampled_encoded_disliked_title = np.concatenate([sampled_encoded_disliked_title, sub_book_titles_dis])

sub_book_authors_dis = book_authors_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_disliked_books)]['author']).numpy()
sampled_encoded_disliked_authors = np.concatenate([sampled_encoded_disliked_authors, sub_book_authors_dis])

sub_book_genres_dis = book_genre_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_disliked_books)]['genre_consolidated']).numpy()
sampled_encoded_disliked_genres = np.concatenate([sampled_encoded_disliked_genres, sub_book_genres_dis])

####

# sampled_encoded_disliked_title.extend(book_title_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['title']).numpy()
# sampled_encoded_disliked_authors.extend(book_authors_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['author']).numpy()
# sampled_encoded_disliked_genres.extend(book_genre_vocab_layer(books_df_unique[books_df_unique['title'].isin(sub_liked_books)]['genre_consolidated']).numpy()

sampled_liked_rating = [6 for title in sampled_encoded_liked_title]
    
sampled_disliked_ratings = [1 for title in sampled_encoded_disliked_title]

# sampled_encoded_liked_title, sampled_encoded_liked_authors, sampled_encoded_liked_genres, sampled_encoded_disliked_title,sampled_encoded_disliked_authors, sampled_encoded_disliked_genres, sampled_liked_rating, sampled_disliked_ratings

In [102]:
sampled_encoded_liked_title

array([   84, 35698,  9040,  3986, 12576,  5113, 25534])

In [104]:
sampled_encoded_disliked_title

array([62629])

In [105]:
sample_user_info = ({
    'user_id': [0], # Doesn't matter
    'liked_books': sampled_encoded_liked_title.tolist(),
    'disliked_books': sampled_encoded_disliked_title.tolist(),

    'liked_genres': sampled_encoded_liked_genres.tolist(),
    'disliked_genres': sampled_encoded_disliked_genres.tolist(),
    
    'liked_authors': sampled_encoded_liked_authors.tolist(),
    'disliked_authors': sampled_encoded_disliked_authors.tolist(),
    
    'liked_ratings': sampled_liked_rating,
    'disliked_ratings': sampled_disliked_ratings
})

sample_user = []

for col in sample_user_info:

    # print(col, sample_user_info[col])
    
    sample_user_info[col].extend([0]*(20 - len(sample_user_info[col])))

    # print(sample_user_info[col])
    
    sample_user.append(sample_user_info[col])

sample_user = tf.cast(sample_user, tf.float32)

In [106]:
# model.user_model.predict([[i for i in range(20)], [i for i in range(20)], [i for i in range(20)]])

user_embedding = model.user_model.predict(sample_user)

# user_embedding



2025-04-14 01:17:13.748405: W tensorflow/core/framework/dataset.cc:993] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


In [108]:
books_df_unique.shape, filtered_df.shape

((71696, 18), (71693, 18))

In [107]:
excluded_books = sub_liked_books
excluded_books.extend(sub_disliked_books)

# Assuming your DataFrame is called df and the column containing books is 'book_id'
filtered_df = books_df_unique[~books_df_unique['title'].isin(excluded_books)]

book_embeddings = np.array(filtered_books_df_unique['Embeddings'].tolist()) 

# Normalize user embedding to unit vector (L2 normalization)
# user_embedding /= np.linalg.norm(user_embedding)

# Normalize all book embeddings to unit vectors (L2 normalization)
# book_embeddings /= np.linalg.norm(book_embeddings, axis=1, keepdims=True)

# Calculate the cosine similarity between user_embedding and all book embeddings
cos_similarities = cosine_similarity(user_embedding.reshape(1, -1), book_embeddings)

# Get the indices of the top 6 closest books
top_k_indices = np.argsort(cos_similarities[0])[-20:][::-1]  # Top 6 indices with highest similarity

# Print the results (book indices and cosine similarity scores)
for i, idx in enumerate(top_k_indices):

    print(f"{filtered_books_df_unique.iloc[idx]['title']} | {filtered_books_df_unique.iloc[idx]['genre_consolidated']}")
    # print(f"Recommendation {i+1}: Book Index {idx} (Cosine Similarity: {cos_similarities[0][idx]:.4f})")


GMAT for Dummies | Study Aids / General
CFS traced to childhood trauma, emotional instability, stress.(Across Specialties)(chronic fatigue syndrome): An article from: Clinical Psychiatry News | Medical / General
Murach's ASP.NET 2.0 Web Programming with VB 2005 | Computers / General
Huntington Beach (CA) (Images of America) | History / Expeditions & Discoveries
Blues Harmonica Collection | Music / General
Enterprise Messaging Using JMS and IBM WebSphere | Computers / General
Java Design Patterns: A Tutorial | Computers / General
Welding Fabrication and Repair: Questions & Answers | Technology & Engineering / General
802.11 Wireless Networks: The Definitive Guide (O'Reilly Networking) | Technology & Engineering / General
The forest ranger: A study in administrative behavior | Political Science / General
Photoshop for Right-Brainers: The Art of Photo Manipulation | Computers / General
Pasta: Every Way for Every Day | Cooking / Individual Chefs & Restaurants
Battle for the Hague 1940: The

### Other stuff

In [None]:
book_tower = model.book_model

In [None]:
book_dataset = tf.data.Dataset.from_tensor_slices({
    'target_book': tf.constant(numerical_books_df['title'].tolist(), dtype=tf.int64),
    'authors': tf.constant(numerical_books_df['authors'].tolist(), dtype=tf.int64),
    'categories': tf.constant(numerical_books_df['categories'].tolist(), dtype=tf.int64),
    'description': tf.constant(numerical_books_df['description'].tolist(), dtype=tf.int64),
}).batch(128)  # Optional batching


In [None]:
book_embeddings = []

for batch in book_dataset:
    batch_embeddings = book_tower(batch)
    book_embeddings.append(batch_embeddings)

book_embeddings = tf.concat(book_embeddings, axis=0)
print("Book Embeddings Shape:", book_embeddings.shape)


In [None]:
books_df.distinct('title')

In [None]:
np.save("book_embeddings.npy", book_embeddings.numpy())
books_df.to_csv('books_df.csv')

In [None]:
# Save model.tar.gz to required S3 bucket
#s3://w210recsys/model/recModel/modelFiles/
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("book_embeddings.npy", bucket=bucket_name, key_prefix=key_prefix)

In [None]:
# Save model.tar.gz to required S3 bucket
#s3://w210recsys/model/recModel/modelFiles/
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("books_df.csv", bucket=bucket_name, key_prefix=key_prefix)

In [None]:
file = tarfile.open('model.tar.gz')

file.extractall('./extractTar')

file.close()

# def list_tar_gz_contents(file_path):
#     try:
#         with tarfile.open(file_path, "r:gz") as tar:
#             for member in tar.getmembers():
#                 print(member.name)
#     except Exception as e:
#         print(f"An error occurred: {e}")

In [None]:
# def load_book_embeddings(self, path, books_df):

#     # We want to load in books' embeddings to make sure our model has them on hand to give direct recommendations
#     # Load in via boto3 and sagemaker

#     role = sagemaker.get_execution_role()
#     sm_session = sagemaker.Session()
#     bucket_name = sm_session.default_bucket()
#     s3 = boto3.client('s3')

#     # Download the file from S3 into memory
#     response = s3.get_object(Bucket=bucket_name, Key=path)

#     # Read the data into a BytesIO buffer
#     buffer = io.BytesIO(response['Body'].read())

#     # Load numpy array from buffer
#     books_df['embeddings'] = [embed for embed in np.load(buffer)]

#     self.full_book_embeddings = books_df
    
#     print(self.full_book_embeddings)

## Test Metrics

In [None]:
user_tower = model.user_model
book_tower = model.book_model

In [None]:
@tf.function
def evaluate_model(dataset):
    return model.evaluate(dataset, return_dict=True)

metrics = evaluate_model(num_test_ds_cached)

## Save Model + Book Embeddings & Data

In [None]:
import tarfile
import os

# Save both models into separate folders
model.user_model.save("export/user_model/1")   # Save user model
#model.book_model.save("export/book_model/1")   # Save book model

# Create tar.gz archive
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("export", arcname=os.path.basename("export"))

print("✅ Both models saved and compressed successfully!")


In [None]:
import tarfile
import os

# Save both models into separate folders
model.save("export/parent_model/1")   # Save parent model

# Create tar.gz archive
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("export", arcname=os.path.basename("export"))

print("✅ Both models saved and compressed successfully!")

Push the tar.gz files to the s3 bucket

In [None]:
import sagemaker
import pickle
import boto3

# instantiate clients
role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
s3 = boto3.client('s3')

s3_response = sm_session.upload_data("model.tar.gz", bucket=bucket_name, key_prefix="test_models")


In [None]:
import sagemaker
import pickle
import boto3

# instantiate clients
role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
s3 = boto3.client('s3')

file_name = 'embeddings/book_embeddings.npy'
local_file_path = 'book_embeddings.npy'

s3.upload_file(local_file_path, bucket_name, file_name)
print(f"✅ Embeddings uploaded to s3://{bucket_name}/{file_name}")

## Load in a Saved Model + Book Embeddings

In [None]:
import tensorflow as tf
import boto3
import tarfile
import sagemaker

# instantiate clients
role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
s3 = boto3.client('s3')

# Download model.tar.gz
s3.download_file(bucket_name, "test_models/model.tar.gz", "model.tar.gz")

# Unzip the model
with tarfile.open("model.tar.gz", "r:gz") as tar:
    tar.extractall("export")

# Load models
user_model = tf.keras.models.load_model("export/user_model/1")
book_model = tf.keras.models.load_model("export/book_model/1")

print("✅ Both models loaded successfully!")


In [None]:
import boto3
import numpy as np
import io
import sagemaker

# Instantiate clients
role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
s3 = boto3.client('s3')

# File path in S3
file_name = 'embeddings/book_embeddings.npy'

# Download the file from S3 into memory
response = s3.get_object(Bucket=bucket_name, Key=file_name)

# Read the data into a BytesIO buffer
buffer = io.BytesIO(response['Body'].read())

# Load numpy array from buffer
book_embeddings_1 = np.load(buffer)

print("✅ Embeddings loaded successfully")
print("Shape:", book_embeddings_1.shape)


## Get Book Embeddings for Each Genre

In [None]:
books_df_unique.shape

In [None]:
book_embeddings[0:3]

In [None]:
books_df_unique['Embeddings'] = [embed.numpy() for embed in book_embeddings]

In [None]:
books_df_unique.head()

In [None]:
avg_genre_emb_df = pd.DataFrame(columns=["genre", "avg_embedding"])

In [None]:
avg_genre_emb_df = pd.DataFrame(columns=["genre", "avg_embedding"])

for i, genre in enumerate(books_df_unique['genre_consolidated'].unique()):
    print(i, genre)
    # print(books_df_unique[books_df_unique['genre_consolidated'] == genre]['Embeddings'])

    embedding_matrix = np.array(books_df_unique[books_df_unique['genre_consolidated'] == genre]['Embeddings'].tolist())
    
    average_embedding = embedding_matrix.mean(axis=0)
    
    avg_genre_emb_df.loc[i] = [genre, average_embedding]

In [None]:
avg_genre_emb_df

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

df = avg_genre_emb_df

df["avg_embedding"] = df["avg_embedding"].apply(lambda x: np.array(eval(x)) if isinstance(x, str) else np.array(x))

X = np.vstack(df["avg_embedding"].values)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2", "PC3"])
df_pca["genre"] = df["genre"]

fig = px.scatter_3d(df_pca, x="PC1", y="PC2", z="PC3", text="genre",
                     color="genre", opacity=0.8, title="3D PCA Visualization of Book Genres")

fig.update_traces(marker=dict(size=6, opacity=0.7), textposition="top center")

fig.show()

In [None]:
import sagemaker
import pickle
import boto3
import pandas as pd

role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
bucket_name = sm_session.default_bucket()
s3 = boto3.client('s3')

pickle_file_name = "embeddings/avg_genre_embeddings.pkl"
avg_genre_emb_df.to_pickle(pickle_file_name)

s3.upload_file(pickle_file_name, bucket_name, pickle_file_name)
print(f"✅ Pickle file uploaded to s3://{bucket_name}/{pickle_file_name}")