In this notebook I will build on my previous work with the TFRS pipeline to simplify the model's towers and to improve the users' embeddings by incorporating additional session metrics.

In [None]:
import os

os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q plotnine

In [None]:
import s3fs

import io
import datetime
import json

import random

from typing import List, Union, Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

import plotnine
import gdown


from sklearn.model_selection import train_test_split

In [None]:
# Report which GPUs TensorFlow can see
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

# Turn on memory growth for every visible GPU to avoid allocation errors.
# Looping over an empty list is a no-op, so no separate emptiness guard is needed.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Give the first physical CPU (if any) a single default logical device
cpus = tf.config.list_physical_devices('CPU')
if cpus:
    tf.config.set_logical_device_configuration(cpus[0], [tf.config.LogicalDeviceConfiguration()])

print("Logical devices configured.")


Initialize access to s3 bucket

To start off we import the datasets for books and their reviews, performing some data cleaning

In [None]:
# Import books and reviews dataset
# books_data_location = 's3://w210recsys/book_raw/books_data.csv'
#review_location = 's3://w210recsys/book_raw/Books_rating.csv'

books_data_location = "s3://w210recsys/book_clean/books_data_clean.pkl"

In [None]:
# Load the pickled DataFrame stored at `books_data_location` and show its shape.
# NOTE(review): unpickling can execute arbitrary code - only read pickles from a bucket you trust.
books_df = pd.read_pickle(books_data_location) 

books_df.shape

Data Cleansing

In [None]:
books_df.isnull().sum()

## Augment Dataset (Do Not Run)

As was demonstrated by Ben's EDA on user ratings, an overwhelming share of ratings are positively skewed, and many people only ever leave a few reviews. The twin-tower model will perform better if it gets examples of both what the user likes and what they don't like, so we want to augment each user's reviews with books they did not interact with alongside the ones they did.

In [None]:
# def aug_data(catalog, user_data_dict, user_last_dates, k_per_user):
#     """
#     Returns a new book sample set from the catalog for multiple users at once, ensuring dates are not earlier than 
#     the last review date per user.

#     Parameters:
#     - catalog (pd.DataFrame): Full book catalog.
#     - user_data_dict (dict): A dictionary mapping users to a set of interacted books.
#     - user_last_dates (dict): A dictionary mapping users to their last review date.
#     - k_per_user (int): Number of new books per user.

#     Returns:
#     - pd.DataFrame: A DataFrame with new book samples for all users.
#     """

#     # Flatten all interacted books into a set (fast filtering)
#     interacted_books = set.union(*user_data_dict.values())

#     # Efficient filtering: Remove all interacted books at once
#     filtered_catalog = catalog[~catalog[['title', 'author']].apply(tuple, axis=1).isin(interacted_books)]

#     # Prepare a list to store new samples
#     new_samples = []

#     for user, _ in user_data_dict.items():
#         # Randomly sample `k_per_user` books
#         sampled_books = filtered_catalog.sample(n=min(k_per_user, len(filtered_catalog)), random_state=42).copy()
#         sampled_books['user_id'] = user  # Assign user ID
#         sampled_books['review_score'] = 0  # The user didn't interact with it

#         # Get the last review date for this user
#         last_review_date = user_last_dates.get(user, pd.Timestamp.now())  # Default to now if no history
        
#         # Ensure last_review_date is a pandas Timestamp (datetime64)
#         last_review_date = pd.Timestamp(last_review_date)
        
#         # Generate random timedelta and subtract from last_review_date
#         sampled_books['review_time'] = last_review_date - pd.to_timedelta(
#             [random.randint(1, 30) for _ in range(len(sampled_books))], unit="D"
#         )

#         new_samples.append(sampled_books)

#     # Concatenate all samples into a single DataFrame
#     return pd.concat(new_samples, ignore_index=True)

# # Sample Data
# temp = books_df#.iloc[0:100]

# # Step 1: Convert user interactions into a dictionary {user_id: {(title, author), ...}}
# user_data_dict = (
#     temp.groupby('user_id')[['title', 'author']]
#     .apply(lambda df: set(df.itertuples(index=False, name=None)))
#     .to_dict()
# )

# # Step 2: Extract last review date per user
# user_last_dates = (
#     temp.groupby('user_id')['review_time']
#     .max()
#     .to_dict()
# )

# # Step 3: Call optimized function for batch augmentation
# augmented_data = aug_data(books_df, user_data_dict, user_last_dates, k_per_user=3)

# # Step 4: Append new recommendations to merged_df efficiently
# temp = pd.concat([temp, augmented_data], ignore_index=True)

# print(f"Augmented dataset size: {temp.shape}")


Save the data to pickle file in the s3 bucket

In [None]:
# temp.to_pickle("s3://w210recsys/aug_data/clean_augmented_data_v1.pkl")

Load the data to avoid overheads

In [None]:
# merged_df = pd.read_pickle("s3://w210recsys/aug_data/clean_augmented_data_v1.pkl")

# merged_df.shape

In [None]:
# merged_df[merged_df['user_id'] == 'A1SMFD252FTJP9']

## Session-ize Data + Split Dataset for Validation based on Users
In the earlier versions of our twin-tower recommendations model we were splitting the data based on dates to isolate the last interaction as our validation data and passing in other interactions, line by line, in as training data. However, upon review we realized that this approach has some flaws:

1. Data Leakage - By passing in the same users in training and testing we may be getting an inflated sense of how good the model is doing.
2. Line by line data - The goal of our recommendation system is to take in a user's metrics all at once and produce a user embedding that places them closer to the books they like in the embedding space. However, by passing in data line by line, we don't aggregate this data in the same way, and the model may not be learning that.

For these reasons, we opt to split the data by users, holding out their last interaction as the label, rather than by date.

Session-izing Data:

We want to group the historical data for each user in a way that allows us to mimic the session data we will collect from users in deployment. The basic structure of which will be to summarize past interactions and hold out a separate book interaction.

In [None]:
# # Test on a smaller sub-set of the data

# merged_df = merged_df.sort_values(by=['user_id', 'review_date'])


In [None]:
# temp.columns

In [None]:
# temp = merged_df[merged_df['user_id'] == 'A1SMFD252FTJP9']

In [None]:
# def session_summary(user_data):

#     """
#     session_summary takes in each user's session data and returns a summarized verison of it

#     inputs:
#     user_data: user's interaction data
#     interest_cols: columns of interest wanting to be summarized

#     outputs:
#     summarized_data: the user's summarized data

#     """

#     user_data = user_data.sort_values(by=['user_id', 'review_time'])  # Sort by user & time
    
#     session_data = []
    
#     for user, user_df in user_data.groupby('user_id'):
#         if len(user_df) < 2:
#             continue  # Skip users with only one interaction
        
#         # Last interaction is the target (book user last interacted with)
#         target_row = user_df.iloc[-1]
#         target_book = target_row['title']
#         target_book_rating = target_row['review_score']
        
#         # Previous interactions (session history)
#         history_df = user_df.iloc[:-1]  # Exclude last row

#         summary = {
#             'user_id': user,
#             'liked_books': list(history_df.loc[history_df['review_score'] >= 3, 'title']),
#             'disliked_books': list(history_df.loc[history_df['review_score'] < 3, 'title']),
#             'liked_genres': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] >= 3, 'genre_consolidated'])))),
#             'disliked_genres': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] < 3, 'genre_consolidated'])))),
#             'liked_authors': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] >= 3, 'author'])))),
#             'disliked_authors': list(filter(lambda x: x != "", list(set(history_df.loc[history_df['review_score'] < 3, 'author'])))),
#             'liked_ratings': list(history_df.loc[history_df['review_score'] >= 3, 'review_score']),
#             'disliked_ratings': list(history_df.loc[history_df['review_score'] < 3, 'review_score']),
#             'target_book': target_book,
#             'target_book_rating': target_book_rating
#         }
        
#         session_data.append(summary)
    
#     return pd.DataFrame(session_data)

# # Generate session summaries with held-out target sample
# # session_summary(temp)

# # Generate a full df of session summaries
# # sessionized_df = session_summary(merged_df)
# sessionized_df = session_summary(sampled_books)

In [None]:
# sessionized_df.to_pickle("s3://w210recsys/aug_data/cleaned_sessionized_data_v1.pkl")

To run this function in a timely manner I had to leverage a much larger compute instance than the one this notebook was created on. I saved the result to the team s3 bucket

## Load Sessionized Data

In [None]:
# Load the precomputed session summaries instead of rebuilding them in this notebook.
# NOTE(review): unpickling can execute arbitrary code - only read pickles from a bucket you trust.
sessionized_df = pd.read_pickle("s3://w210recsys/aug_data/cleaned_sessionized_data_v1.pkl")

In [None]:
sessionized_df.shape

In [None]:
sessionized_df.head()

## Augment Data

Down the line I realized that when training we'll want to pass in more than just the book's title to our book tower so I want to augment the dataset with the author and the summaries of all of those books

In [None]:
books_df.columns

In [None]:
from tqdm import tqdm

# Only book metadata is needed here (expected to be the same on every row of a title),
# so keep the first row per title.
books_df_unique = books_df.drop_duplicates(subset=['title'], keep='first')

# title -> {'author': ..., 'description': ..., 'genre_consolidated': ...} for fast lookup
metadata_columns = ['author', 'description', 'genre_consolidated']
books_dict = books_df_unique.set_index('title')[metadata_columns].to_dict(orient='index')

target_book_author, target_book_summary, target_book_categories = [], [], []

for title in tqdm(sessionized_df['target_book'], desc="Processing Books"):
    book_info = books_dict.get(title)
    if book_info is None:
        # Title not present in the catalog: fall back to empty-string metadata
        book_info = dict.fromkeys(metadata_columns, '')

    target_book_author.append(book_info['author'])
    target_book_summary.append(book_info['description'])
    target_book_categories.append(book_info['genre_consolidated'])


In [None]:
# IMPORTANT NOTE: Though I'm keeping the column name as 'categories' it should reflect 'genre_consolidated' at all times henceforth

sessionized_df['authors'] = target_book_author
sessionized_df['description'] = target_book_summary
sessionized_df['categories'] = target_book_categories

Now that we have the sessionized data, we can move into the splitting of data

In [None]:
# train_df, test_df = train_test_split(sessionized_df, test_size=0.3, random_state=42)

In [None]:
# train_df.columns

## Converting string input to numerical

When running on CPUs, the model is able to convert string entries to numerical values internally via our lookup layers. However, when trying to utilize the GPU for faster training, it appears as though string tensor transfers from CPU to GPU aren't supported. As a result, I'm converting our string entries to numerical values before passing them into the model.

In [None]:
# Extract unique values for user and book metadata.
# Every vocabulary is materialised as a plain Python list so they all behave the
# same way downstream (e.g. they can be JSON-serialised like the user-id vocabulary).
unique_user_ids = sessionized_df['user_id'].astype(str).unique().tolist()
unique_book_titles = books_df['title'].astype(str).unique().tolist()
unique_genres = books_df['genre_consolidated'].astype(str).unique().tolist()
unique_authors = books_df['author'].astype(str).unique().tolist()
unique_summaries = books_df['description'].astype(str).unique().tolist()

In [None]:
len(unique_user_ids), len(unique_book_titles), len(unique_genres), len(unique_authors)

In [None]:
embedding_dimensions = 64


def _build_lookup_and_embedding(vocabulary, prefix):
    """Create the (StringLookup, Embedding) pair for one categorical feature.

    The layers are named '<prefix>_vocab' and '<prefix>_embedding'. The
    embedding table gets len(vocabulary) + 1 rows and `embedding_dimensions`
    columns.
    """
    lookup = tf.keras.layers.StringLookup(
        vocabulary=vocabulary,
        mask_token=None,
        name=f'{prefix}_vocab'
    )
    embedding = tf.keras.layers.Embedding(
        input_dim=len(vocabulary) + 1,
        output_dim=embedding_dimensions,
        name=f'{prefix}_embedding'
    )
    return lookup, embedding


# user_id
user_id_vocab_layer, user_id_embedding_layer = _build_lookup_and_embedding(unique_user_ids, 'user_id')

# book_title
book_title_vocab_layer, book_title_embedding_layer = _build_lookup_and_embedding(unique_book_titles, 'book_title')

# genre
book_genre_vocab_layer, book_genre_embedding_layer = _build_lookup_and_embedding(unique_genres, 'book_genre')

# authors
book_authors_vocab_layer, book_authors_embedding_layer = _build_lookup_and_embedding(unique_authors, 'book_authors')

# description
book_description_vocab_layer, book_description_embedding_layer = _build_lookup_and_embedding(unique_summaries, 'book_description')

# Save the StringLookUp layers!

In [None]:
# Persist what is needed to rebuild the user-id lookup layer later:
# its vocabulary and its out-of-vocabulary token.
config = dict(
    vocabulary=unique_user_ids,
    oov_token=user_id_vocab_layer.oov_token,
)

with open('user_id_vocab_layer.json', 'w') as f:
    f.write(json.dumps(config))

In [None]:
# Round-trip check: rebuild the user-id lookup layer from the saved JSON and
# look up a sample id. (Another sample id: A014566028TLL40XCY1YR)
with open("user_id_vocab_layer.json", 'r') as f:
    config = json.loads(f.read())

user_id_vocab_layer = tf.keras.layers.StringLookup(
    vocabulary=config['vocabulary'],
    oov_token=config['oov_token'],
)

input_data = "A3A48XEYWLWH7T"
output = user_id_vocab_layer(input_data)

print(output)

In [None]:
int(output)

In [None]:
# Initialize S3 client
import boto3
# import subprocess
# import os
# import pickle
# import joblib
# import tarfile
# import shutil
import sagemaker

role = sagemaker.get_execution_role()
sm_session = sagemaker.Session()
s3 = boto3.client('s3')

In [None]:
# Target location: s3://w210recsys/model/recModel/modelFiles/
# Upload a local vocabulary JSON file to the bucket under `key_prefix`.
# NOTE(review): the visible cells only write 'user_id_vocab_layer.json'. Confirm that
# 'book_authors_vocab_layer.json' actually exists locally (e.g. from re-running the
# save cell for the authors layer) before running this upload.
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("book_authors_vocab_layer.json", bucket=bucket_name, key_prefix=key_prefix)

# Convert the information from books_df to be numerical using the string lookups

In [None]:
# One row per title, with the column names the model towers expect
# ('author' -> 'authors', 'genre_consolidated' -> 'categories').
numerical_books_df = (
    books_df[['title', 'author', 'genre_consolidated', 'description']]
    .copy()
    .rename(columns={'author': 'authors', 'genre_consolidated': 'categories'})
    .drop_duplicates(subset=['title'], keep='first')
)

numerical_books_df

In [None]:
# Replace each string column with the integer ids produced by its lookup layer
for column_name, lookup in (
    ('title', book_title_vocab_layer),
    ('authors', book_authors_vocab_layer),
    ('categories', book_genre_vocab_layer),
    ('description', book_description_vocab_layer),
):
    numerical_books_df[column_name] = lookup(numerical_books_df[column_name]).numpy()

# Also add a 0-based positional 'book_id'
numerical_books_df['book_id'] = list(range(numerical_books_df.shape[0]))

numerical_books_df

In [None]:
numerical_sessionized_df = sessionized_df.copy()

numerical_sessionized_df.columns

In [None]:
len(numerical_sessionized_df)

In [None]:
numerical_sessionized_df.head()

In [None]:
def fast_lookup_list(values, lookup_layer):
    """Run `lookup_layer` over a column of string lists in one batched call.

    Entries of `values` that are not lists are replaced by ["UNKNOWN"] first.
    The rows are packed into a ragged string tensor, passed through the layer
    once, and the result of `.numpy().tolist()` on the layer output is returned.
    """
    rows = [row if isinstance(row, list) else ["UNKNOWN"] for row in values]
    ragged_strings = tf.ragged.constant(rows, dtype=tf.string)
    encoded = lookup_layer(ragged_strings)
    return encoded.numpy().tolist()

# List-valued history columns: one batched ragged lookup per column
for column_name, lookup in (
    ('liked_books', book_title_vocab_layer),
    ('disliked_books', book_title_vocab_layer),
    ('liked_genres', book_genre_vocab_layer),
    ('disliked_genres', book_genre_vocab_layer),
    ('liked_authors', book_authors_vocab_layer),
    ('disliked_authors', book_authors_vocab_layer),
):
    numerical_sessionized_df[column_name] = fast_lookup_list(numerical_sessionized_df[column_name], lookup)

# Single-valued columns: cast to str and look up the whole column at once
for column_name, lookup in (
    ('user_id', user_id_vocab_layer),
    ('target_book', book_title_vocab_layer),
    ('authors', book_authors_vocab_layer),
    ('description', book_description_vocab_layer),
    ('categories', book_genre_vocab_layer),
):
    numerical_sessionized_df[column_name] = lookup(numerical_sessionized_df[column_name].astype(str)).numpy()

# Increment the ratings up by 1 to leave 0 to be the padding

def rating_shift(ratings):
    """Shift ratings up by one so that 0 stays free as the padding value.

    :param ratings: either a list of ratings or a single value supporting ``+ 1``.
    :return: for a list (or list subclass), a new list with every entry
        incremented (an empty list is returned unchanged); otherwise
        ``ratings + 1``.
    """
    # isinstance (rather than an exact type comparison) so list subclasses are
    # shifted element-wise instead of failing on `ratings + 1`.
    if isinstance(ratings, list):
        if not ratings:
            return ratings
        return [entry + 1 for entry in ratings]

    return ratings + 1


# Shift every rating column (list-valued and scalar alike) by one
for rating_column in ('liked_ratings', 'disliked_ratings', 'target_book_rating'):
    numerical_sessionized_df[rating_column] = numerical_sessionized_df[rating_column].apply(rating_shift)


In [None]:
numerical_sessionized_df.head()

In [None]:
# HT maps each column to the list of per-row entry lengths, or to 1 for
# columns whose entries have no length (scalars).
HT = {}

for col in numerical_sessionized_df.columns:
    try:
        for entry in numerical_sessionized_df[col]:
            HT.setdefault(col, []).append(len(entry))
    except TypeError:
        # len() is undefined for scalar entries: record the column as length 1.
        # Only TypeError is caught so unrelated failures (and Ctrl-C) still surface.
        HT[col] = 1

Later down the line I noticed that we can't call our model for inference with ragged tensors, since their sizes can vary and the model needs fixed sizes.

In [None]:
# 90th-percentile entry length per column
for col, lengths in HT.items():
    print(col, int(np.percentile(lengths, 90)))

We need to consider what *most* of our samples' lengths are and add some extra space for future inferences that carry more information. Since I plan on collecting feedback roughly every 10 books in the future, I'll allocate ~20 slots for each list just to be safe.

In [None]:
# Every list-valued history feature is fixed to 20 slots
for feature in (
    'liked_books', 'disliked_books',
    'liked_genres', 'disliked_genres',
    'liked_authors', 'disliked_authors',
    'liked_ratings', 'disliked_ratings',
):
    HT[feature] = 20

HT

In [None]:
def pad_numpy_array(column, max_len, padding_value=0):
    """Truncate or right-pad every array in `column` to exactly `max_len` items.

    Arrays longer than `max_len` are cut; shorter ones are extended at the end
    with `padding_value`.
    """
    def _fit(arr):
        missing = max_len - min(len(arr), max_len)
        return np.pad(arr[:max_len], (0, missing), constant_values=padding_value)

    return column.apply(_fit)

def pad_column(column, max_len, padding_value=0):
    """Truncate or right-pad every list in `column` to exactly `max_len` items.

    Lists longer than `max_len` are sliced; all others get `padding_value`
    appended until they reach `max_len`.
    """
    def _fit(seq):
        if len(seq) > max_len:
            return seq[:max_len]
        return seq + [padding_value] * (max_len - len(seq))

    return column.apply(_fit)

# Pick the padder from the type of the column's first entry; columns whose
# entries are neither arrays nor lists (scalars) are left untouched.
for column, max_len in HT.items():
    first_entry = numerical_sessionized_df[column].iloc[0]

    if isinstance(first_entry, np.ndarray):
        padder = pad_numpy_array
    elif isinstance(first_entry, list):
        padder = pad_column
    else:
        continue

    numerical_sessionized_df[column] = padder(numerical_sessionized_df[column], max_len)
        

In [None]:
numerical_sessionized_df.head()

In [None]:
num_train_df, num_test_df = train_test_split(numerical_sessionized_df, test_size=0.3, random_state=42)

In [None]:
num_test_df.head()

## Save needed data files

In [None]:
print("🚨 Starting dataset creation...")

# (feature name, tensor dtype, convert the column with .tolist() before tf.constant?)
_SESSION_FEATURE_SPECS = (
    ('user_id', tf.int64, True),
    ('liked_books', tf.int64, True),
    ('disliked_books', tf.int64, True),
    ('liked_genres', tf.int64, True),
    ('disliked_genres', tf.int64, True),
    ('liked_authors', tf.int64, True),
    ('disliked_authors', tf.int64, True),
    ('liked_ratings', tf.float32, True),
    ('disliked_ratings', tf.float32, True),
    ('target_book', tf.int64, False),
    ('authors', tf.int64, False),
    ('description', tf.int64, False),
    ('categories', tf.int64, False),
    ('target_book_rating', tf.float32, False),
)


def _session_frame_to_dataset(frame):
    """Slice a session DataFrame into a tf.data.Dataset of per-row feature dicts."""
    features = {}
    for feature_name, dtype, as_list in _SESSION_FEATURE_SPECS:
        column = frame[feature_name]
        features[feature_name] = tf.constant(column.tolist() if as_list else column, dtype=dtype)
    return tf.data.Dataset.from_tensor_slices(features)


num_train_ds = _session_frame_to_dataset(num_train_df)
num_test_ds = _session_frame_to_dataset(num_test_df)

print("✅ Datasets successfully created!")


In [None]:
# take(1) yields at most one element, so the loop body runs at most once
for example in num_train_ds.take(1):
    print(example)

In [None]:
# The full datasets are used as-is; the trailing `.take(500)` is disabled and can be
# re-enabled to cap the number of samples while debugging.
num_train_ds_limited = num_train_ds#.take(500)  # would limit to 500 samples
num_test_ds_limited = num_test_ds#.take(500)  # would limit to 500 samples

# Batch into groups of 128 first, then cache the batched elements.
num_train_ds_cached = num_train_ds_limited.batch(128).cache()
num_test_ds_cached = num_test_ds_limited.batch(128).cache()

## Build Twin-Tower Model

Compared to the original version of our twin tower model, I want to simplify the input scheme to the user and book towers and incorporate some additional features for user embedding. Furthermore, we want to pass in sessionized user data separately from book data to each tower.

In [None]:
import tensorflow_recommenders as tfrs
import tensorflow.keras.layers as layers
from typing import Dict
import time

import boto3
import io
import sagemaker

class BooksTwoTowersModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a book tower.

    User features go through `UserModel`, book features through `BookModel`,
    and the two resulting embeddings are scored by a `tfrs.tasks.Retrieval`
    task with FactorizedTopK metrics at k = 10, 20 and 50.
    """

    def __init__(self, user_data: pd.DataFrame, book_metadata: pd.DataFrame, embedding_dimensions=256):
        """Build both towers, the candidate dataset and the retrieval task.

        :param user_data: DataFrame handed to `UserModel`.
        :param book_metadata: DataFrame with 'title', 'authors', 'description'
            and 'categories' columns; each is converted to an int64 tensor
            below, so they are presumably already integer-encoded - TODO confirm.
        :param embedding_dimensions: Units of the two Dense projection layers;
            also forwarded to both towers as their embedding size.
        """
        super().__init__()

        # Final projection layers, created here and handed to the towers
        self.dense_projection_user = tf.keras.layers.Dense(embedding_dimensions, name='user_dense_projection')
        
        self.dense_projection_book = tf.keras.layers.Dense(embedding_dimensions, name='book_dense_projection')

        self.user_model = UserModel(user_data, book_metadata, self.dense_projection_user, embedding_dimensions)

        # 10000 is passed as BookModel's `text_vectorization_max_tokens` argument
        self.book_model = BookModel(book_metadata, embedding_dimensions, 10000, self.dense_projection_book)

        # One element per row of book_metadata, used as the retrieval candidates
        self.candidate_ds = tf.data.Dataset.from_tensor_slices({
            'title': tf.convert_to_tensor(book_metadata['title'].values, dtype=tf.int64),
            'authors': tf.convert_to_tensor(book_metadata['authors'].values, dtype=tf.int64),
            'description': tf.convert_to_tensor(book_metadata['description'].values, dtype=tf.int64),
            'categories': tf.convert_to_tensor(book_metadata['categories'].values, dtype=tf.int64)
        })

        # Embed the candidates one book at a time, then drop the size-1 batch axis.
        # NOTE(review): batch(1) here and below means one candidate per step; a larger
        # batch size would likely make metric evaluation much faster - confirm before changing.
        candidates = self.candidate_ds.batch(1).map(
            lambda x: self.book_model(x), num_parallel_calls=tf.data.AUTOTUNE
        ).map(
            lambda x: tf.squeeze(x, axis=0)
        )

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates.batch(1),
                ks=(10, 20, 50)
            )
        )

        # Placeholders; nothing in this class assigns them a value
        self.full_book_embeddings = None
        self.full_book_embeddings_copy = None

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of features.

        The same `features` mapping is fed to both towers. The top-k metrics
        are only computed when `training` is False.
        """

        user_embeddings = self.user_model(features)

        target_book_embeddings = self.book_model(features)

        retrieval_loss = self.task(user_embeddings, target_book_embeddings, compute_metrics=not training)

        return retrieval_loss
    

### Book and User model

In [None]:
class BookModel(tf.keras.Model):
    '''
    The book (candidate) tower: embeds a book's title, author and genre ids
    and projects the concatenated embeddings through a dense layer.
    '''

    def __init__(self, book_data: pd.DataFrame, embedding_dimensions: int, text_vectorization_max_tokens: int, dense_projection_book):
        '''
        :param book_data: DataFrame with 'title', 'authors' and 'categories' columns.
        :param embedding_dimensions: Output size of each per-feature embedding layer.
        :param text_vectorization_max_tokens: Not used by this tower; kept so
            existing callers keep working.
        :param dense_projection_book: Layer applied to the concatenated
            embeddings to produce the final book embedding.
        '''
        super().__init__()

        # Column names of the book features in `book_data`
        self.feature_book_title_name = "title"
        self.feature_author_name = "authors"
        self.feature_genre_name = "categories"

        # NOTE(review): each table is sized from the number of distinct values,
        # which assumes the integer ids in book_data never exceed that count -
        # TODO confirm against the lookup vocabularies.
        unique_titles = book_data[self.feature_book_title_name].astype(str).unique()
        unique_authors = book_data[self.feature_author_name].astype(str).unique()
        unique_genres = book_data[self.feature_genre_name].astype(str).unique()

        self.dense_projection_book = dense_projection_book

        # Book title embedding
        self.book_title_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_titles) + 1, output_dim=embedding_dimensions, name='book_title_embedding')

        # Book author embedding
        self.book_author_embedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_authors) + 1, output_dim=embedding_dimensions, name='book_author_embedding')

        # Book genre embedding (attribute name kept as-is for compatibility)
        self.book_genre_emdedding_layers = tf.keras.layers.Embedding(input_dim=len(unique_genres) + 1, output_dim=embedding_dimensions, name='book_genre_embedding')

    @staticmethod
    def _ensure_batched(tensor):
        '''Return `tensor` with a leading axis added when it is a scalar.'''
        if len(tensor.shape) == 0:
            return tf.expand_dims(tensor, axis=0)
        return tensor

    def call(self, book_data: Dict[str, tf.Tensor]) -> tf.Tensor:
        '''
        Embed one book or a batch of books.

        :param book_data: Mapping with 'authors' and 'categories' ids plus the
            title id under 'target_book' (session rows) or, when that key is
            absent, under 'title' (catalog rows). The mapping is not modified.
        :return: Output of `dense_projection_book` applied to the concatenated
            title, author and genre embeddings.
        :raises KeyError: If a required key is missing from `book_data`.
        '''
        # Session features carry the title as 'target_book'; catalog rows use 'title'
        title_key = 'target_book' if 'target_book' in book_data else 'title'

        # Work on local tensors so the caller's feature dict is never mutated,
        # and expand each scalar feature independently.
        title_ids = self._ensure_batched(book_data[title_key])
        author_ids = self._ensure_batched(book_data['authors'])
        genre_ids = self._ensure_batched(book_data['categories'])

        book_title_embed = self.book_title_embedding_layers(title_ids)
        book_author_embed = self.book_author_embedding_layers(author_ids)
        book_genre_embed = self.book_genre_emdedding_layers(genre_ids)

        # Concatenate along the feature (last) axis
        concatenated_embeddings = tf.concat([
            book_title_embed,
            book_author_embed,
            book_genre_embed
            ], axis=-1)

        # Project to the output size defined by the injected dense layer
        projected_embeddings = self.dense_projection_book(concatenated_embeddings)

        return projected_embeddings

In [None]:
class UserModel(tf.keras.Model):
    """User tower of the two-tower recommender.

    Embeds the user id together with rating-weighted pools of the user's liked
    and disliked books, genres and authors, concatenates the seven vectors and
    passes them through ``dense_projection_user``.
    """

    def __init__(self, user_data: pd.DataFrame, book_metadata: pd.DataFrame, dense_projection_user, embedding_dimensions=64):
        """Create the embedding tables used by the tower.

        Args:
            user_data: Frame with a ``user_id`` column.
            book_metadata: Frame with ``title``, ``categories`` and ``authors``
                columns.
            dense_projection_user: Layer/callable applied to the concatenated
                embeddings in ``call``.
            embedding_dimensions: Output width of every embedding table.
        """
        super().__init__()

        def table_rows(column: pd.Series) -> int:
            # Number of distinct values (compared as strings) plus one spare
            # row -- presumably for the padding/OOV id; confirm against the
            # vocabulary layers that produce the ids.
            return len(column.astype(str).unique()) + 1

        self.dense_projection_user = dense_projection_user

        # User embedding
        self.user_embedding_layers = tf.keras.layers.Embedding(input_dim=table_rows(user_data['user_id']), output_dim=embedding_dimensions, name='user_id_embedding')

        # Book embedding
        self.book_title_embedding_layers = tf.keras.layers.Embedding(input_dim=table_rows(book_metadata['title']), output_dim=embedding_dimensions, name='book_embedding')

        # Genre embedding
        self.genre_embedding_layers = tf.keras.layers.Embedding(input_dim=table_rows(book_metadata['categories']), output_dim=embedding_dimensions, name='genre_embedding')

        # Author embedding
        self.author_embedding_layers = tf.keras.layers.Embedding(input_dim=table_rows(book_metadata['authors']), output_dim=embedding_dimensions, name='author_embedding')

    @staticmethod
    def _pool_embeddings(embedding_layer, input_list, weights, pad_value=0):
        """Pool a padded id sequence into one vector per row.

        Positions whose id equals ``pad_value`` contribute nothing. The
        remaining embeddings are scaled by ``weights`` normalised to sum to
        one per row, summed over the sequence axis and divided by the number
        of non-padding positions.

        NOTE(review): the weights are already normalised, so the extra division
        by the valid count shrinks the result by 1/count. It is kept unchanged
        so existing models behave the same -- confirm whether it is intended.
        """
        embeddings = embedding_layer(input_list)

        # True where the position holds a real id, broadcastable over features
        mask = tf.expand_dims(tf.not_equal(input_list, pad_value), axis=-1)
        embeddings = tf.where(mask, embeddings, tf.zeros_like(embeddings))

        # Normalise the weights per row; an all-zero row keeps a divisor of 1
        weights = tf.cast(weights, embeddings.dtype)
        weight_sum = tf.reduce_sum(weights, axis=-1, keepdims=True)
        weight_sum = tf.where(weight_sum == 0, tf.ones_like(weight_sum), weight_sum)
        expanded_weights = tf.expand_dims(weights / weight_sum, axis=-1)

        summed_embeddings = tf.reduce_sum(embeddings * expanded_weights, axis=1)

        # Divide by the number of real positions (at least 1 to avoid 0/0)
        valid_counts = tf.reduce_sum(tf.cast(mask, tf.float32), axis=1)
        valid_counts = tf.where(valid_counts == 0, tf.ones_like(valid_counts), valid_counts)
        pooled_embeddings = summed_embeddings / valid_counts

        # Replace any remaining NaNs with zeros
        return tf.where(tf.math.is_nan(pooled_embeddings), tf.zeros_like(pooled_embeddings), pooled_embeddings)

    def call(self, inputs):
        """Compute the projected user representation.

        Args:
            inputs: Either a dict with the keys ``user_id``, ``liked_books``,
                ``disliked_books``, ``liked_genres``, ``disliked_genres``,
                ``liked_authors``, ``disliked_authors``, ``liked_ratings`` and
                ``disliked_ratings``, or a single tensor whose rows hold those
                nine features in that order (the user id is the first element
                of row 0). A tensor describes one user and is expanded to a
                batch of one.

        Returns:
            ``dense_projection_user`` applied to the concatenated embeddings.
            If the seven vectors cannot be concatenated because their shapes
            disagree, the (dict) inputs are returned unchanged, as before.
        """
        # tf.is_tensor replaces a comparison against the string form of a
        # private TensorFlow class path, which breaks whenever that path moves.
        if tf.is_tensor(inputs):
            packed = inputs
            feature_rows = (
                'liked_books',
                'disliked_books',
                'liked_genres',
                'disliked_genres',
                'liked_authors',
                'disliked_authors',
                'liked_ratings',
                'disliked_ratings',
            )
            inputs = {'user_id': tf.expand_dims(packed[0][0], axis=0)}
            for row, feature in enumerate(feature_rows, start=1):
                inputs[feature] = tf.expand_dims(packed[row], axis=0)

        user_embed = self.user_embedding_layers(inputs['user_id'])

        pool = self._pool_embeddings
        liked_ratings = inputs['liked_ratings']
        disliked_ratings = inputs['disliked_ratings']

        liked_books_embed = pool(self.book_title_embedding_layers, inputs['liked_books'], liked_ratings)
        disliked_books_embed = pool(self.book_title_embedding_layers, inputs['disliked_books'], disliked_ratings)
        liked_genres_embed = pool(self.genre_embedding_layers, inputs['liked_genres'], liked_ratings)
        disliked_genres_embed = pool(self.genre_embedding_layers, inputs['disliked_genres'], disliked_ratings)
        liked_authors_embed = pool(self.author_embedding_layers, inputs['liked_authors'], liked_ratings)
        disliked_authors_embed = pool(self.author_embedding_layers, inputs['disliked_authors'], disliked_ratings)

        # Concatenate everything into a single user representation
        try:
            concatenated_embeddings = tf.concat([
                user_embed,
                liked_books_embed,
                disliked_books_embed,
                liked_genres_embed,
                disliked_genres_embed,
                liked_authors_embed,
                disliked_authors_embed
            ], axis=1)
        except (ValueError, tf.errors.InvalidArgumentError):
            # Shape mismatch between the pooled vectors: keep the historical
            # fallback of handing the inputs back, but let every other kind
            # of error surface instead of being silently swallowed.
            return inputs

        return self.dense_projection_user(concatenated_embeddings)

In [None]:
# Compile Model

# Enable Multi-GPU Training
# MirroredStrategy keeps a synchronized copy of the model on each replica;
# num_replicas_in_sync reports how many replicas take part.
strategy = tf.distribute.MirroredStrategy()
print(f"Number of GPUs being used: {strategy.num_replicas_in_sync}")

# Run tf.function-decorated code eagerly (op by op) instead of as traced graphs.
tf.config.run_functions_eagerly(True)

# The model and its optimizer are created inside the strategy scope so their
# variables are owned by the distribution strategy.
with strategy.scope():
    model = BooksTwoTowersModel(
        user_data=numerical_sessionized_df,
        book_metadata=numerical_books_df,
        embedding_dimensions=64, # Change this for embedding sizes to change (64 default val)
    )
    model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


In [None]:
# Train the two-tower model for 7 passes over the cached training dataset.
model.fit(num_train_ds_cached, epochs=7)

See if you can get recommendations directly with filters

## Save model.user_model

In [None]:
# Initialize S3 client
import boto3
import subprocess
import os
import pickle
import joblib
import tarfile
import shutil
import sagemaker

# Execution role of the current SageMaker environment
role = sagemaker.get_execution_role()
# SageMaker session; used below for upload_data to S3
sm_session = sagemaker.Session()
# Low-level boto3 S3 client
s3 = boto3.client('s3')

In [None]:
# Save model.tar.gz to required S3 bucket
# s3://w210recsys/testModels/model.tar.gz
# NOTE: model.tar.gz must already exist in the working directory; it is
# written by the "Save Model" cells near the end of this notebook.
bucket_name="w210recsys"

## Real Model
# key_prefix="model/recModel"

# Test Model
key_prefix = "testModels"
# upload_data places the local file under <bucket>/<key_prefix>/
s3_response = sm_session.upload_data("model.tar.gz", bucket=bucket_name, key_prefix=key_prefix)

### Model Prediction Pipeline

In [None]:
# ['Fiction / Mystery & Detective', 'Young Adult Fiction / General',
#        'Drama / General', 'Juvenile Fiction / Fantasy & Magic',
#        'Juvenile Fiction / General', 'Fiction / World Literature',
#        'Fiction / Romance', 'Political Science / General',
#        'Fiction / Literary', 'Business & Economics / General',
#        'Juvenile Fiction / Legends, Myths, Fables',
#        'Juvenile Fiction / Science Fiction',
#        'Fiction / Fairy Tales, Folk Tales, Legends & Mythology',
#        'Fiction / Science Fiction', 'Fiction / Classics',
#        'Religion / General', 'Fiction / General', 'Fiction / Ghost',
#        'Fiction / Action & Adventure', 'Juvenile Nonfiction / General',
#        'Juvenile Fiction / Fairy Tales & Folklore',
#        'Fiction / War & Military', 'History / Maritime History & Piracy',
#        'Juvenile Fiction / Thrillers & Suspense',
#        'Comics & Graphic Novels / Superheroes',
#        'Literary Criticism / General', 'Science / General',
#        'Reference / General', 'History / General',
#        'Fiction / Occult & Supernatural', 'Philosophy / General',
#        'Computers / General',
#        'Biography & Autobiography / Personal Memoirs', 'Art / General',
#        'Fiction / Visionary & Metaphysical',
#        'Family & Relationships / General', 'Fiction / Thrillers',
#        'Health & Fitness / General', 'Fiction / Anthologies',
#        'Biography & Autobiography / General', 'Fiction / Sea Stories',
#        'Fiction / Erotica', 'Fiction / Sagas',
#        'Fiction / Magical Realism', 'Fiction / Biographical',
#        'History / Expeditions & Discoveries', 'Education / General',
#        'Juvenile Fiction / Nursery Rhymes', 'Humor / Topic',
#        'Nature / General', 'True Crime / Murder', 'Psychology / General',
#        'Social Science / General', 'Photography / General',
#        'Religion / Theology', 'Fiction / Dystopian',
#        'History / Wars & Conflicts', 'Body, Mind & Spirit / General',
#        'Fiction / Short Stories', 'History / Social History',
#        'Games & Activities / General', 'Fiction / Family Life',
#        'Comics & Graphic Novels / General', 'Fiction / City Life',
#        'Biography & Autobiography / Literary Figures',
#        'Juvenile Fiction / Short Stories', 'Fiction / Crime',
#        'Travel / Essays & Travelogues',
#        'Technology & Engineering / General', 'Drama / Shakespeare',
#        'History / Historiography', 'Bibles / General',
#        'History / Indigenous Peoples of the Americas',
#        'Cooking / Individual Chefs & Restaurants',
#        'Performing Arts / General', 'Fiction / Noir', 'Poetry / General',
#        'History / Military', 'Cooking / General', 'Travel / General',
#        'Music / General', 'Sports & Recreation / General',
#        'True Crime / General', 'Religion / Christian Theology',
#        'Language Arts & Disciplines / General',
#        'Crafts & Hobbies / General', 'Pets / General',
#        'Young Adult Nonfiction / General', 'House & Home / General',
#        'Literary Collections / General', 'Humor / General',
#        'Antiques & Collectibles / General', 'Study Aids / General',
#        'Foreign Language Study / General', 'Medical / General',
#        'Law / General', 'Mathematics / General',
#        'History / Historical Geography', 'Architecture / General',
#        'Transportation / General', 'Gardening / General',
#        'Design / General']

# Pick exactly one profile by leaving its two lines uncommented; liked_genres
# drives the book sampling in the next cell.

# Profile one
liked_genres = ['Mathematics / General']
disliked_genres = []

# Profile two
# liked_genres = ['Religion / General', 'Medical / General']
# disliked_genres = [] # base has some 'Family & Relationships / General' books
# disliked_genres = ['Family & Relationships / General'] # Commenting this in actually gave more of these as recs????

# Profile three
# liked_genres = ['Law / General', 'True Crime / General']
# disliked_genres = []

# Profile four
# liked_genres = ['Performing Arts / General', 'Humor / Topic']
# disliked_genres = []

# Profile five (Popular)
# liked_genres = ['Religion / General'] #, 'Fiction / World Literature']
# disliked_genres = []

# Profile six (Un-Popular)
# liked_genres = ['Study Aids / General', 'Young Adult Nonfiction / General']
# disliked_genres = []

# Profile seven (Un-Popular)
# liked_genres = ['Nature / General', 'Health & Fitness / General']
# disliked_genres = []

# Profile


In [None]:
# Draw up to five random books from the liked genres. DataFrame.sample raises
# a ValueError when asked for more rows than exist, so the request is capped
# at the number of matching books.
liked_genre_books = books_df[books_df['genre_consolidated'].isin(liked_genres)]
sampled_books = liked_genre_books.sample(n=min(5, len(liked_genre_books)))

sampled_books

In [None]:
sampled_books['title']

In [None]:
# Translate the sampled books into vocabulary ids; these seed the "liked"
# side of the synthetic user profile.
sampled_encoded_liked_title = book_title_vocab_layer(sampled_books['title']).numpy()
sampled_encoded_liked_authors = book_authors_vocab_layer(sampled_books['author']).numpy()
sampled_encoded_liked_genres = book_genre_vocab_layer(sampled_books['genre_consolidated']).numpy()

# Nothing has been disliked yet: start each disliked feature as an empty int64 array
sampled_encoded_disliked_title, sampled_encoded_disliked_authors, sampled_encoded_disliked_genres = (
    np.array([], dtype='int64') for _ in range(3)
)

# Every liked book gets a rating of 6
sampled_liked_rating = [6] * len(sampled_encoded_liked_title)

# Explicitly disliked genres are encoded one by one and each gets a rating of 1
encoded_disliked_genres = [
    book_genre_vocab_layer(genre_name).numpy() for genre_name in disliked_genres
]
disliked_ratings = [1] * len(encoded_disliked_genres)


In [None]:
sampled_encoded_liked_title

In [None]:
sampled_encoded_disliked_title 

In [None]:
# Every feature row is padded with zeros (and, if needed, truncated) to this
# many entries so the rows stack into one rectangular tensor.
profile_len = 20

raw_user_info = {
    'user_id': [0], # Doesn't matter
    'liked_books': sampled_encoded_liked_title.tolist(),
    'disliked_books': [],

    'liked_genres': sampled_encoded_liked_genres.tolist(),
    'disliked_genres': encoded_disliked_genres,

    'liked_authors': sampled_encoded_liked_authors.tolist(),
    'disliked_authors': [],

    'liked_ratings': sampled_liked_rating,
    'disliked_ratings': disliked_ratings,
}

# Build new padded lists instead of extending in place: several values above
# are the notebook's own global lists, which must not be altered by this cell.
# Slicing also guards against rows longer than profile_len, which previously
# produced a ragged list that tf.cast cannot convert.
sample_user_info = {
    col: (list(values) + [0] * profile_len)[:profile_len]
    for col, values in raw_user_info.items()
}

# Row order follows the dict's insertion order above
sample_user = tf.cast(list(sample_user_info.values()), tf.float32)

In [None]:
sample_user

In [None]:
# model.user_model.predict([[i for i in range(20)], [i for i in range(20)], [i for i in range(20)]])

# Run the user tower on the packed profile tensor built in the previous cell.
user_embedding = model.user_model.predict(sample_user)

# user_embedding

In [None]:
user_embedding

Extract the books' embeddings

In [None]:
book_tower = model.book_model

# Book-tower feature name -> column of numerical_books_df that supplies it
feature_columns = (
    ('target_book', 'title'),
    ('authors', 'authors'),
    ('categories', 'categories'),
    ('description', 'description'),
)

# One int64 tensor per feature, sliced row by row and served in batches of 128
book_dataset = tf.data.Dataset.from_tensor_slices({
    feature: tf.constant(numerical_books_df[column].tolist(), dtype=tf.int64)
    for feature, column in feature_columns
}).batch(128)

# Run every batch through the book tower and stitch the outputs back together
book_embeddings = tf.concat([book_tower(batch) for batch in book_dataset], axis=0)
print("Book Embeddings Shape:", book_embeddings.shape)

# Store one embedding vector per row of books_df_unique
books_df_unique['Embeddings'] = [row.numpy() for row in book_embeddings]

books_df_unique.shape

In [None]:
books_df_unique.columns

In [None]:
# np.argsort(cos_similarities[0])[-40:][::-1]
# NOTE: cos_similarities is assigned by the similarity cell that follows;
# run that cell first or this raises NameError.
cos_similarities[0]

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# First request: score against the whole catalogue, nothing is filtered out
filtered_books_df_unique = books_df_unique
book_embeddings = np.array(filtered_books_df_unique['Embeddings'].tolist())

# cosine_similarity normalises both sides itself, so no explicit L2 scaling is applied
user_vector = user_embedding.reshape(1, -1)
cos_similarities = cosine_similarity(user_vector, book_embeddings)

# Positions of the 40 highest-scoring books, best match first
top_k_indices = np.argsort(cos_similarities[0])[-40:][::-1]

# Print "title | genre | score" for every recommendation
for idx in top_k_indices:
    book = filtered_books_df_unique.iloc[idx]
    print(f"{book['title']} | {book['genre_consolidated']} | {cos_similarities[0][idx]}")


Mimic subsequent requests

In [None]:
# Feedback on the first round of recommendations: six books the user liked
# (five 'Mathematics / General', one 'Computers / General') and one disliked.
sub_liked_books = [
    "Algebra (Actualites scientifiques et industrielles)",
    "College algebra,",
    "Fundamentals of Data Structures in C++",
    "Finite Element Analysis: From Concepts to Applications",
    "Independent Component Analysis: A Tutorial Introduction (Bradford Books)",
    "Modern Algebra"
]
sub_disliked_books = [
    "CFS traced to childhood trauma, emotional instability, stress.(Across Specialties)(chronic fatigue syndrome): An article from: Clinical Psychiatry News"
]

# Catalogue rows for the books named above
liked_rows = books_df_unique[books_df_unique['title'].isin(sub_liked_books)]
disliked_rows = books_df_unique[books_df_unique['title'].isin(sub_disliked_books)]

# Encode the liked books and append them to the running liked history
sub_book_titles = book_title_vocab_layer(liked_rows['title']).numpy()
sub_book_authors = book_authors_vocab_layer(liked_rows['author']).numpy()
sub_book_genres = book_genre_vocab_layer(liked_rows['genre_consolidated']).numpy()

sampled_encoded_liked_title = np.concatenate([sampled_encoded_liked_title, sub_book_titles])
sampled_encoded_liked_authors = np.concatenate([sampled_encoded_liked_authors, sub_book_authors])
sampled_encoded_liked_genres = np.concatenate([sampled_encoded_liked_genres, sub_book_genres])

# Same again for the disliked books
sub_book_titles_dis = book_title_vocab_layer(disliked_rows['title']).numpy()
sub_book_authors_dis = book_authors_vocab_layer(disliked_rows['author']).numpy()
sub_book_genres_dis = book_genre_vocab_layer(disliked_rows['genre_consolidated']).numpy()

sampled_encoded_disliked_title = np.concatenate([sampled_encoded_disliked_title, sub_book_titles_dis])
sampled_encoded_disliked_authors = np.concatenate([sampled_encoded_disliked_authors, sub_book_authors_dis])
sampled_encoded_disliked_genres = np.concatenate([sampled_encoded_disliked_genres, sub_book_genres_dis])

# One rating per history entry: 6 for liked books, 1 for disliked books
sampled_liked_rating = [6] * len(sampled_encoded_liked_title)
sampled_disliked_ratings = [1] * len(sampled_encoded_disliked_title)

# sampled_encoded_liked_title, sampled_encoded_liked_authors, sampled_encoded_liked_genres, sampled_encoded_disliked_title,sampled_encoded_disliked_authors, sampled_encoded_disliked_genres, sampled_liked_rating, sampled_disliked_ratings

In [None]:
sampled_encoded_liked_title

In [None]:
sampled_encoded_disliked_title

In [None]:
# Every feature row is padded with zeros (and, if needed, truncated) to this
# many entries so the rows stack into one rectangular tensor.
profile_len = 20

raw_user_info = {
    'user_id': [0], # Doesn't matter
    'liked_books': sampled_encoded_liked_title.tolist(),
    'disliked_books': sampled_encoded_disliked_title.tolist(),

    'liked_genres': sampled_encoded_liked_genres.tolist(),
    'disliked_genres': sampled_encoded_disliked_genres.tolist(),

    'liked_authors': sampled_encoded_liked_authors.tolist(),
    'disliked_authors': sampled_encoded_disliked_authors.tolist(),

    'liked_ratings': sampled_liked_rating,
    'disliked_ratings': sampled_disliked_ratings
}

# Build new padded lists instead of extending in place: the rating lists are
# the notebook's own globals and must not be altered by this cell. Slicing
# also guards against histories longer than profile_len, which previously
# produced a ragged list that tf.cast cannot convert.
sample_user_info = {
    col: (list(values) + [0] * profile_len)[:profile_len]
    for col, values in raw_user_info.items()
}

# Row order follows the dict's insertion order above
sample_user = tf.cast(list(sample_user_info.values()), tf.float32)

In [None]:
# model.user_model.predict([[i for i in range(20)], [i for i in range(20)], [i for i in range(20)]])

# Re-run the user tower on the profile that now includes the follow-up feedback.
user_embedding = model.user_model.predict(sample_user)

# user_embedding

In [None]:
# NOTE: filtered_df is only created by the exclusion cell further down;
# run that cell first or this raises NameError.
books_df_unique.shape, filtered_df.shape

In [None]:
sub_liked_books

In [None]:
# Books the user has already given feedback on must not be recommended again.
# A new list is built so sub_liked_books itself is left untouched (extending
# an alias of it would add the disliked titles to the liked list, and again
# on every re-run of this cell).
excluded_books = [*sub_liked_books, *sub_disliked_books]

filtered_df = books_df_unique[~books_df_unique['title'].isin(excluded_books)]

# Score only the remaining books. Embeddings and the title lookup below must
# come from the same filtered frame so the positional indices line up.
book_embeddings = np.array(filtered_df['Embeddings'].tolist())

# cosine_similarity normalises both sides itself, so no explicit L2 scaling is applied
cos_similarities = cosine_similarity(user_embedding.reshape(1, -1), book_embeddings)

# Positions of the 20 highest-scoring books, best match first
top_k_indices = np.argsort(cos_similarities[0])[-20:][::-1]

# Print "title | genre" for every recommendation
for idx in top_k_indices:
    book = filtered_df.iloc[idx]
    print(f"{book['title']} | {book['genre_consolidated']}")


## Saving Auxiliary Files

In [None]:
book_tower = model.book_model

# Book-tower feature name -> column of numerical_books_df that supplies it
feature_columns = (
    ('target_book', 'title'),
    ('authors', 'authors'),
    ('categories', 'categories'),
    ('description', 'description'),
)

# One int64 tensor per feature, sliced row by row and served in batches of 128
book_dataset = tf.data.Dataset.from_tensor_slices({
    feature: tf.constant(numerical_books_df[column].tolist(), dtype=tf.int64)
    for feature, column in feature_columns
}).batch(128)

# Run every batch through the book tower and stitch the outputs back together
book_embeddings = tf.concat([book_tower(batch) for batch in book_dataset], axis=0)
print("Book Embeddings Shape:", book_embeddings.shape)

# Store one embedding vector per row of books_df_unique
books_df_unique['Embeddings'] = [row.numpy() for row in book_embeddings]

books_df_unique.shape

In [None]:
book_embeddings

In [None]:
# Write the book-tower embeddings as a NumPy array file
np.save("book_embeddings.npy", book_embeddings.numpy())
# NOTE(review): the embeddings above were attached row-for-row to
# books_df_unique, but books_df is what gets written here -- confirm both
# frames share the same rows and order before indexing one by the other.
books_df.to_csv('books_df.csv')

In [None]:
# Upload book_embeddings.npy to the model-files prefix of the S3 bucket
#s3://w210recsys/model/recModel/modelFiles/
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("book_embeddings.npy", bucket=bucket_name, key_prefix=key_prefix)

In [None]:
# Upload books_df.csv to the model-files prefix of the S3 bucket
#s3://w210recsys/model/recModel/modelFiles/
bucket_name="w210recsys"
key_prefix="model/recModel/modelFiles"
s3_response = sm_session.upload_data("books_df.csv", bucket=bucket_name, key_prefix=key_prefix)

## Test Metrics

In [None]:
# Short aliases for the two sub-models of the trained two-tower model
user_tower = model.user_model
book_tower = model.book_model

In [None]:
def evaluate_model(dataset):
    """Evaluate the global ``model`` on ``dataset``.

    ``Model.evaluate`` is a high-level Keras endpoint that manages its own
    compiled functions, so it is called directly rather than from inside a
    ``tf.function``. The previous decorator only worked because eager
    execution of functions is forced globally in this notebook.

    Args:
        dataset: Dataset passed straight to ``model.evaluate``.

    Returns:
        Dict mapping metric names to their values (``return_dict=True``).
    """
    return model.evaluate(dataset, return_dict=True)

metrics = evaluate_model(num_test_ds_cached)

## Save Model + Book Embeddings & Data

In [None]:
import tarfile
import os

# Save the user tower under export/user_model/1
# (the book tower export below is commented out, so only one model is written).
model.user_model.save("export/user_model/1")   # Save user model
#model.book_model.save("export/book_model/1")   # Save book model

# Bundle the whole export/ directory into model.tar.gz, keeping "export" as
# the top-level folder inside the archive.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("export", arcname="export")

# The message now names what was actually saved (it used to claim both models).
print("✅ User model saved and compressed successfully!")


In [None]:
import tarfile
import os

# Save the complete two-tower (parent) model under export/parent_model/1
model.save("export/parent_model/1")   # Save parent model

# Re-create model.tar.gz from the whole export/ directory. Anything an earlier
# cell wrote under export/ is bundled as well, and an existing model.tar.gz
# is overwritten.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add("export", arcname="export")

# The message now names what was actually saved (it used to claim both models).
print("✅ Parent model saved and compressed successfully!")

Push the tar.gz files to the S3 bucket