In [None]:
from datetime import datetime

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q plotnine

In [None]:
import os
# Request the legacy Keras implementation through an environment variable.
# NOTE(review): presumably this has to be set before `tensorflow` is imported
# (it is imported in a later cell) — confirm against the TensorFlow docs.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [None]:
import io
import json
import datetime

from typing import List, Union, Dict, Text
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs

import plotnine
import gdown

In [None]:
# S3 locations of the raw inputs: book metadata first, user reviews second
data_location, review_location = (
    's3://w210recsys/book_raw/books_data.csv',
    's3://w210recsys/book_raw/Books_rating.csv',
)

In [None]:
# Read the book metadata table and preview its first rows
books_df = pd.read_csv(filepath_or_buffer=data_location)
books_df.head(n=5)

In [None]:
# Read the user reviews table and preview its first rows
ratings_df = pd.read_csv(filepath_or_buffer=review_location)
ratings_df.head(n=5)

In [None]:
# Display the (rows, columns) size of the raw reviews table
ratings_df.shape

In [None]:
# Report how many distinct titles exist, then show missing-value counts per column
n_unique_titles = books_df['Title'].nunique()
print(n_unique_titles)
books_df.isna().sum()

In [None]:
# Keep exactly one row per known title. Rows without a title are removed
# first, duplicates second, and the index is rebuilt last so that it stays
# contiguous (resetting before the NaN-title row was dropped could leave a
# gap). The chain returns a new frame instead of mutating in place.
books_df = (
    books_df
    .dropna(subset=['Title'])
    .drop_duplicates(subset=['Title'])
    .reset_index(drop=True)
)
books_df.shape

In [None]:
# Discard reviews that lack a title or a user id.
# The user column is referenced as `user_id` once the labels are lower-cased
# in a later cell, so the raw label here must be `User_id`; the previous
# `User-ID` would lower-case to `user-id` and never match.
ratings_df = ratings_df.dropna(subset=['Title', 'User_id']).reset_index(drop=True)
ratings_df.shape

In [None]:
# Turn the epoch-seconds column into a timestamp column named `review_date`
# and discard the raw `review/time` column.
ratings_df = (
    ratings_df
    .assign(review_date=pd.to_datetime(ratings_df['review/time'], unit='s'))
    .drop(columns=['review/time'])
)
ratings_df.head()

In [None]:
# Lower-case every column label in both frames
books_df = books_df.rename(columns=str.lower)
print(f'Books DF columns: {books_df.columns}')

ratings_df = ratings_df.rename(columns=str.lower)
print(f'Ratings DF columns: {ratings_df.columns}')

In [None]:
# Swap the '/' separator in review column labels for '_' (e.g. review/score -> review_score)
ratings_df = ratings_df.rename(columns=lambda label: label.replace('/', '_'))
print(f'Ratings DF columns: {ratings_df.columns}')

In [None]:
# Validation window starts 365 days before the most recent review,
# truncated to midnight of that calendar day.
latest_review = ratings_df['review_date'].max()
one_year = datetime.timedelta(days=365)
validation_start_date = pd.Timestamp((latest_review - one_year).date())
validation_start_date

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
#### Train/test split (leave-last-out per user) ####
# Alternative local cache locations:
# train_file_path = "/home/sagemaker-user/train_df.csv"
# test_file_path = "/home/sagemaker-user/test_df.csv"

train_file_path = "s3://w210recsys/book_raw/train_df.csv"
test_file_path = "s3://w210recsys/book_raw/test_df.csv"

# `os.path.exists` only inspects the local filesystem, so it reports False for
# every `s3://` URI and the split used to be recomputed (and re-uploaded) on
# each run. Instead, try to read the cached files and fall back to building
# them only when one of them is missing.
try:
    train_df = pd.read_csv(train_file_path)
    test_df = pd.read_csv(test_file_path)
except FileNotFoundError:
    print("Train/test split files do not exist. Creating them now...")

    # Order every user's reviews chronologically
    ratings_df = ratings_df.sort_values(by=['user_id', 'review_date'])

    # Flag each user's most recent review. `duplicated(keep='last')` is True
    # for every row except the last occurrence of a user_id, so its negation
    # selects exactly one (the latest) row per user. This yields the same
    # rows as a per-group `iloc[:-1]` / `iloc[-1:]` without a Python call
    # for every user.
    is_last_review = ~ratings_df.duplicated(subset='user_id', keep='last')
    train_df = ratings_df.loc[~is_last_review].reset_index(drop=True)
    test_df = ratings_df.loc[is_last_review].reset_index(drop=True)

    # Persist the splits so later runs can load them directly
    train_df.to_csv(train_file_path, index=False)
    test_df.to_csv(test_file_path, index=False)

    print(f"Training set saved at: {train_file_path}")
    print(f"Test set saved at: {test_file_path}")
else:
    print("Train/test split files already exist. Loading them...")

# Check the sizes of the datasets
print(f"Training set: {train_df.shape}")
print(f"Test set: {test_df.shape}")


In [None]:
# Wrap the interaction columns of each split in a tf.data.Dataset of feature dicts
interaction_columns = ['user_id', 'title', 'review_score']

train_ds = tf.data.Dataset.from_tensor_slices(
    {column: train_df[column] for column in interaction_columns}
)

# Peek at a few training examples
for example in train_ds.take(5).as_numpy_iterator():
    print(example)

print('\n')

test_ds = tf.data.Dataset.from_tensor_slices(
    {column: test_df[column] for column in interaction_columns}
)

# Peek at a few test examples
for example in test_ds.take(5).as_numpy_iterator():
    print(example)

In [None]:
# Vocabularies: the distinct values seen in the training split
unique_user_ids = train_df['user_id'].unique()
unique_titles = train_df['title'].unique()
unique_review_scores = train_df['review_score'].unique()

# Retrieval candidates: one feature dict per distinct training title
candidate_titles = train_df[['title']].drop_duplicates()
candidate_ds = tf.data.Dataset.from_tensor_slices(dict(candidate_titles))

for candidate in candidate_ds.take(5).as_numpy_iterator():
    print(candidate)

In [None]:
# Shuffle with a buffer covering the whole training split, batch, then cache
train_size = len(train_df)
shuffled_train = train_ds.shuffle(train_size)
cached_train = shuffled_train.batch(4096).cache()

In [None]:
# User/Query Model
class UserModel(tf.keras.Model):
    '''
    Query tower: turns raw user ids into dense embedding vectors.
    '''

    def __init__(self,
                 unique_user_ids: np.ndarray,
                 feature_user_id_name: str,
                 embedding_dimensions: int):
        '''
        Params
        :param unique_user_ids: array of unique user ids used as the lookup vocabulary
        :param feature_user_id_name: key under which user ids arrive in the input dict
        :param embedding_dimensions: width of each user embedding vector
        '''
        super().__init__()
        self.feature_user_id_name = feature_user_id_name

        # Maps each raw id to an integer index
        user_id_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids,
            mask_token=None,
            name='user_id_vocab',
        )

        # One row more than the vocabulary size
        # (presumably for the lookup's out-of-vocabulary index — TODO confirm)
        user_id_embedding = tf.keras.layers.Embedding(
            input_dim=len(unique_user_ids) + 1,
            output_dim=embedding_dimensions,
            name='user_id_embedding',
        )

        self.user_embedding_layers = tf.keras.Sequential(
            [user_id_lookup, user_id_embedding],
        )

    def call(self, inputs: Dict[Text, tf.Tensor]) -> tf.Tensor:
        '''Embed the user ids found under `feature_user_id_name` in `inputs`.'''
        user_ids = inputs[self.feature_user_id_name]
        return self.user_embedding_layers(user_ids)

In [None]:
class BookModel(tf.keras.Model):
    '''
    Candidate tower: turns book titles into dense embedding vectors.
    '''

    def __init__(self,
                 unique_titles: np.ndarray,
                 feature_book_title_name: str,
                 embedding_dimensions: int,
                 text_vectorization_max_tokens: int):
        '''
        Params
        :param unique_titles: array of unique titles used as the lookup vocabulary
        :param feature_book_title_name: key under which titles arrive in the input dict
        :param embedding_dimensions: width of each title embedding vector
        :param text_vectorization_max_tokens: accepted for interface
            compatibility; not used by this implementation
        '''
        super().__init__()
        self.feature_book_title_name = feature_book_title_name

        # Maps each title to an integer index
        title_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_titles,
            mask_token=None,
            name='book_id_vocab',
        )

        # One row more than the vocabulary size
        # (presumably for the lookup's out-of-vocabulary index — TODO confirm)
        title_embedding = tf.keras.layers.Embedding(
            input_dim=len(unique_titles) + 1,
            output_dim=embedding_dimensions,
            name='book_id_embedding',
        )

        # Book Title embedding
        self.book_embedding_layers = tf.keras.Sequential(
            [title_lookup, title_embedding],
            name='book_id_embedding',
        )

    def call(self, inputs: Dict[Text, tf.Tensor]) -> tf.Tensor:
        '''Embed the titles found under `feature_book_title_name` in `inputs`.'''
        titles = inputs[self.feature_book_title_name]
        tower_outputs = [
            self.book_embedding_layers(titles),
            # add more embedding layers as needed
        ]
        # Concatenate per-feature embeddings along axis 1 (only one today)
        return tf.concat(tower_outputs, axis=1)

In [None]:
# # run the book model on the dataset and save the embeddings 
# book_model = BookModel(
#     unique_titles=unique_titles,
#     feature_book_title_name='title',
#     embedding_dimensions=32,
#     text_vectorization_max_tokens=10000,
# )

# book_model.compile(optimizer='adam', loss='mean_squared_error')
# book_model.fit(cached_train, epochs=1)
# # save the book embeddings
# book_embeddings = book_model.layers[0].get_weights()[0]
# book_embeddings.shape


In [None]:
class BooksTwoTowersModel(tfrs.Model):
    '''
    Two-Towers books recommender model.

    A user (query) tower and a book (candidate) tower are trained jointly
    with a TFRS retrieval task.
    '''
    def __init__(self,
                 unique_user_ids: np.ndarray,
                 unique_titles: np.ndarray,
                 unique_review_scores: np.ndarray,
                 candidate_ds: tf.data.Dataset,
                 feature_user_id_name: str = 'user_id',
                 feature_book_title_name: str = 'title',
                 feature_review_score_name: str = 'review_score',
                 embedding_dimensions: int = 64):
        '''
        Instantiate query tower, candidate tower, and retrieval task.

        :param unique_user_ids: vocabulary of user ids for the query tower
        :param unique_titles: vocabulary of titles for the candidate tower
        :param unique_review_scores: distinct review scores; only its length
            is used, to size `text_vectorization_max_tokens`
        :param candidate_ds: unbatched dataset of candidate feature dicts used
            by the top-K metric
        :param feature_user_id_name: feature key holding user ids
        :param feature_book_title_name: feature key holding book titles
        :param feature_review_score_name: feature key holding review scores
        :param embedding_dimensions: output width of both towers
        '''
        super().__init__()
        self.feature_user_id_name = feature_user_id_name
        self.feature_book_title_name = feature_book_title_name
        self.feature_review_score_name = feature_review_score_name

        # Query Tower
        self.user_model = UserModel(
            unique_user_ids=unique_user_ids,
            feature_user_id_name=feature_user_id_name,
            embedding_dimensions=embedding_dimensions,
        )

        # Candidate Tower
        text_vectorization_max_tokens = len(unique_titles) + len(unique_review_scores)

        book_model_raw = BookModel(
            unique_titles=unique_titles,
            feature_book_title_name=feature_book_title_name,
            embedding_dimensions=embedding_dimensions,
            text_vectorization_max_tokens=text_vectorization_max_tokens,
        )

        # Dense projection layer to equate final tower output dims
        self.book_model = tf.keras.Sequential(
            [
                book_model_raw,
                tf.keras.layers.Dense(
                    units=embedding_dimensions,
                    name='book_dense_projection',
                ),
            ],
            name='book_sequential',
        )

        # Retrieval Task, with top-K metrics computed over all candidates
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidate_ds.batch(128).map(self.book_model),
                ks=(10, 20, 50)
            )
        )

    def compute_loss(self,
                     features: Dict[Text, tf.Tensor],
                     training=False) -> tf.Tensor:
        '''
        Get embeddings for users and books and return the retrieval loss.

        Interactions scored 4 or higher get weight 1.0 and all others 0.0.
        Metrics are only computed when not training.

        :param features: batch dict containing the user id, title and review
            score features
        :param training: whether the call happens during training
        :return: the loss produced by the retrieval task
        '''
        user_embeddings = self.user_model({
            self.feature_user_id_name: features[self.feature_user_id_name],
        })

        book_embeddings = self.book_model({
            self.feature_book_title_name: features[self.feature_book_title_name],
        })

        # Sample weight logic: only reviews scored >= 4 count as positives
        review_scores = tf.cast(features[self.feature_review_score_name], tf.float32)
        sample_weight = tf.where(review_scores >= 4, 1.0, 0.0)

        # The weights were previously computed but never handed to the task,
        # so they had no effect on the loss.
        return self.task(
            user_embeddings,
            book_embeddings,
            sample_weight=sample_weight,
            compute_metrics=not training,
        )

In [None]:
# Setup log dir for tensorboard
LOG_DIR = "/home/sagemaker-user/logs"

# `exist_ok=True` creates the directory only when it is missing, replacing
# the separate check-then-create step.
os.makedirs(LOG_DIR, exist_ok=True)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR)

In [None]:
# Build the two-tower model from the training vocabularies and candidates
model_kwargs = {
    'unique_user_ids': unique_user_ids,
    'unique_titles': unique_titles,
    'unique_review_scores': unique_review_scores,
    'candidate_ds': candidate_ds,
    'embedding_dimensions': 64,
}
model = BooksTwoTowersModel(**model_kwargs)

# Compile with Adagrad
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

In [None]:
# Fit on the cached training batches for 10 epochs, logging to TensorBoard
training_callbacks = [tensorboard_callback]
model.fit(cached_train, epochs=10, callbacks=training_callbacks)

In [None]:
# create BruteForce layer: queries are embedded with the user tower.
# The layer lives in `tfrs.layers.factorized_top_k`, the same namespace the
# `BooksRecommendation` class in this notebook subclasses from.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Index the retrieval candidates (book titles), not user ids. Each dataset
# element is an (identifiers, embeddings) pair, so every candidate batch is
# mapped to (titles, book-tower embeddings).
index.index_from_dataset(
    candidate_ds.batch(128).map(
        lambda features: (features['title'], model.book_model(features))
    )
)

# Get recommendations. The user tower reads its input from the 'user_id' key,
# so the query is a feature dict rather than a bare array.
_, titles = index({'user_id': tf.constant(['276726'])})
print(titles)

In [None]:
# Save the model.user_model as a .pkl file
import pickle

# Serialize the trained user (query) tower to `user_model.pkl` in the current
# working directory.
# NOTE(review): pickling a Keras model may not round-trip reliably — confirm
# this loads back correctly, or consider the framework's own save format.
with open('user_model.pkl', 'wb') as f:
    pickle.dump(model.user_model, f)

In [None]:
# Pull the first weight array out of the first layer of the book tower
# and show its shape.
first_book_layer = model.book_model.layers[0]
book_embeddings = first_book_layer.get_weights()[0]
book_embeddings.shape



In [None]:
# Draw one complete interaction from the test set. Sampling the three columns
# independently (as before) stitched together a user, a title and a score
# taken from unrelated rows.
test_user = (
    test_df[['user_id', 'title', 'review_score']]
    .sample(1)
    .reset_index(drop=True)
)

In [None]:
# Batch the held-out interactions once and reuse the batches below
test_batches = test_ds.batch(4096)

# Make predictions on the full test set
# NOTE(review): `BooksTwoTowersModel` defines only `__init__` and
# `compute_loss` in this notebook — confirm `predict` works without a `call`.
test_predictions = model.predict(test_batches)

# Compute and report the evaluation metrics as a dict
metrics = model.evaluate(test_batches, return_dict=True)
print(f"Metrics: {metrics}")

In [None]:
# Create a synthetic test user paired with every known title.
# The id is a string so it matches the dtype expected by the user tower's
# string lookup (an integer id cannot be looked up in a string vocabulary).
test_user = pd.DataFrame({
    'user_id': ['1'] * len(unique_titles),
    'title': unique_titles,
    'review_score': [0] * len(unique_titles),
})


# generate user embeddings based on test_user
test_user_ds = tf.data.Dataset.from_tensor_slices(dict(test_user))

# A model call needs tensors, not a tf.data.Dataset, so the ids are passed
# as a tensor under the 'user_id' key the user tower reads.
test_user_embeddings = model.user_model({
    'user_id': tf.constant(test_user['user_id'].to_numpy()),
})

In [None]:
# Deploy the model using Amazon SageMaker
# NOTE(review): `boto` is imported but not referenced anywhere in this cell.
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlowModel
import boto

# Get the SageMaker execution role
role = get_execution_role()

# Get the default bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Save the model to the local path "model"
# NOTE(review): the model class defines no `call` method in this notebook —
# confirm that saving the whole model works.
model.save("model")

# Upload the saved model to S3 under the "model" key prefix
# NOTE(review): confirm the uploaded artifact is in the layout the SageMaker
# TensorFlow container expects for `model_data`.
model_path = sagemaker_session.upload_data("model", bucket, key_prefix="model")

# Create a TensorFlowModel
# NOTE(review): confirm `framework_version` matches the TensorFlow version
# this notebook actually trained with.
tensorflow_model = TensorFlowModel(
    model_data=model_path,
    role=role,
    framework_version="2.4.1",
)

# Deploy the model to a single-instance endpoint
predictor = tensorflow_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)



In [None]:
# Retrieve the embeddings: first weight array of each tower's embedding stack
user_embedding_stack = model.user_model.user_embedding_layers
book_embedding_stack = model.book_model.layers[0].book_embedding_layers
user_embeddings = user_embedding_stack.get_weights()[0]
book_embeddings = book_embedding_stack.get_weights()[0]

# Write each matrix to disk, then read it straight back
np.save('user_embeddings.npy', user_embeddings)
user_embeddings = np.load('user_embeddings.npy')

np.save('book_embeddings.npy', book_embeddings)
book_embeddings = np.load('book_embeddings.npy')



In [None]:
class BooksRecommendation(tfrs.layers.factorized_top_k.TopK):
    '''
    Top-K layer built from precomputed user and book embedding matrices.

    Each of `get_config`, `from_config`, `compute_output_shape` and
    `compute_mask` was previously defined twice with identical bodies; only
    one definition of each is kept.
    '''

    def __init__(self, user_embeddings: np.ndarray, book_embeddings: np.ndarray, k: int):
        '''
        :param user_embeddings: matrix of user embeddings, indexed by user id
        :param book_embeddings: matrix of book embeddings
        :param k: number of results to return
        '''
        super().__init__(k=k)
        self.user_embeddings = user_embeddings
        self.book_embeddings = book_embeddings

    def call(self, inputs):
        '''Look up the embedding for `inputs['user_id']` and score it against the books.'''
        user_id = inputs['user_id']
        # `user_id` is used as a row index into the user embedding matrix
        user_embedding = tf.gather(self.user_embeddings, user_id)
        # NOTE(review): `_call` is not defined in this class — confirm the
        # base class provides it.
        return self._call(user_embedding, self.book_embeddings)

    def get_config(self):
        '''Return the constructor arguments needed to rebuild this layer.'''
        # NOTE(review): `self.k` is never assigned in this class — confirm
        # the base class exposes it under that name.
        return {'user_embeddings': self.user_embeddings, 'book_embeddings': self.book_embeddings, 'k': self.k}

    @classmethod
    def from_config(cls, config):
        '''Rebuild the layer from a dict produced by `get_config`.'''
        return cls(**config)

    def compute_output_shape(self, input_shape):
        '''Output shape is (first input dimension, k).'''
        return (input_shape[0], self.k)

    def compute_mask(self, inputs, mask=None):
        '''This layer produces no mask.'''
        return None

    def compute_output_signature(self, input_signature):
        '''Describe the output as a float32 tensor of shape (batch, k).'''
        return tf.TensorSpec(shape=(input_signature['user_id'].shape[0], self.k), dtype=tf.float32)

: 

In [None]:
# Build the top-10 retrieval layer from the precomputed embedding matrices
revrieval_model = BooksRecommendation(
    user_embeddings=user_embeddings,
    book_embeddings=book_embeddings,
    k=10,
)

# Get recommendations for a user
user_id = 1
user_embedding = user_embeddings[user_id]
query_features = {'user_id': user_id}
recommended_books = revrieval_model(query_features)
recommended_books

In [None]:
# Brute-force retrieval index scored by cosine similarity.
# `BruteForce` (in `tfrs.layers.factorized_top_k`) takes a query model rather
# than raw embedding matrices and has no `metric` argument, so both towers'
# outputs are L2-normalised: the dot product of unit vectors is their cosine
# similarity.
def _unit_norm(embeddings):
    '''Scale each embedding vector to unit L2 norm along the last axis.'''
    return tf.math.l2_normalize(embeddings, axis=-1)

cosine_query_model = tf.keras.Sequential([
    model.user_model,
    tf.keras.layers.Lambda(_unit_norm),
])
index = tfrs.layers.factorized_top_k.BruteForce(cosine_query_model)

# Index (title, normalised book embedding) pairs for every candidate
index.index_from_dataset(
    candidate_ds.batch(128).map(
        lambda features: (features['title'], _unit_norm(model.book_model(features)))
    )
)

# Get recommendations; the user tower reads its ids from the 'user_id' key
_, titles = index({'user_id': tf.constant(['276726'])})
titles

In [None]:
# retrieve recommendations for top 10
# NOTE(review): `BooksTwoTowersModel` defines only `__init__` and
# `compute_loss` in this notebook — confirm a `recommend` method is actually
# available from its base class before relying on this cell.
recommendations = model.recommend(cached_train, k=10)
recommendations