In [1]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [2]:
import os
import pprint
import tempfile
from ast import literal_eval
from typing import Dict, Text
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [3]:
books = pd.read_csv(
    "https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv",
    index_col=[0],
    converters={"genres": literal_eval, "authors": literal_eval},
)

ratings = pd.read_csv(
    "https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/ratings.csv")


In [4]:
books.columns

Index(['index', 'authors', 'average_rating', 'best_book_id', 'book_id',
       'books_count', 'description', 'genres', 'goodreads_book_id',
       'image_url', 'isbn', 'isbn13', 'language_code',
       'original_publication_year', 'original_title', 'pages', 'publishDate',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'ratings_count', 'small_image_url', 'title', 'work_id',
       'work_ratings_count', 'work_text_reviews_count', 'authors_2'],
      dtype='object')

In [None]:
books = books[
    ["book_id", "title", "description" "authors", "genres", "average_rating", "ratings_count"]
    ].copy()

In [5]:
books.head()

Unnamed: 0,index,authors,average_rating,best_book_id,book_id,books_count,description,genres,goodreads_book_id,image_url,...,ratings_3,ratings_4,ratings_5,ratings_count,small_image_url,title,work_id,work_ratings_count,work_text_reviews_count,authors_2
0,0,[Suzanne Collins],4.34,2767052,1,272,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"[young-adult, fiction, fantasy, science-fictio...",2767052,https://images.gr-assets.com/books/1447303603m...,...,560092,1481305,2706317,4780653,https://images.gr-assets.com/books/1447303603s...,"The Hunger Games (The Hunger Games, #1)",2792775,4942365,155254,['Suzanne Collins']
1,1,"[J.K. Rowling, Mary GrandPré]",4.44,3,2,491,Harry Potter's life is miserable. His parents ...,"[fantasy, fiction, young-adult, classics]",3,https://images.gr-assets.com/books/1474154022m...,...,455024,1156318,3011543,4602479,https://images.gr-assets.com/books/1474154022s...,Harry Potter and the Sorcerer's Stone (Harry P...,4640799,4800065,75867,"['J.K. Rowling', 'Mary GrandPré']"
2,2,[Stephenie Meyer],3.57,41865,3,226,About three things I was absolutely positive.\...,"[young-adult, fantasy, romance, fiction, paran...",41865,https://images.gr-assets.com/books/1361039443m...,...,793319,875073,1355439,3866839,https://images.gr-assets.com/books/1361039443s...,"Twilight (Twilight, #1)",3212258,3916824,95009,['Stephenie Meyer']
3,3,[Harper Lee],4.25,2657,4,487,The unforgettable novel of a childhood in a sl...,"[classics, fiction, historical-fiction, young-...",2657,https://images.gr-assets.com/books/1361975680m...,...,446835,1001952,1714267,3198671,https://images.gr-assets.com/books/1361975680s...,To Kill a Mockingbird,3275794,3340896,72586,['Harper Lee']
4,4,[F. Scott Fitzgerald],3.89,4671,5,1356,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[classics, fiction, historical-fiction, romance]",4671,https://images.gr-assets.com/books/1490528560m...,...,606158,936012,947718,2683664,https://images.gr-assets.com/books/1490528560s...,The Great Gatsby,245494,2773745,51992,['F. Scott Fitzgerald']


In [9]:
ratings[ratings["book_id"] == 9999]

Unnamed: 0,user_id,book_id,rating
810666,4668,9999,1
810795,927,9999,5
824153,6397,9999,3
827860,6548,9999,2
830096,2750,9999,4
...,...,...,...
5759551,22360,9999,4
5769814,13990,9999,3
5842459,24615,9999,5
5905667,47930,9999,4


In [None]:
ratings = ratings.merge(
    books[
    ["book_id", "title", "authors", "genres", "average_rating", "ratings_count"]
    ],
    on="book_id",
    how="left"
)

In [None]:
ratings.head()

In [None]:
ratings_tf = tf.data.Dataset.from_tensor_slices(ratings.to_dict("list"))
books_tf = tf.data.Dataset.from_tensor_slices(books.to_dict("list"))

**Query Tower**

User Model representing User and his reading history

In [None]:

class UserTower(tf.keras.Model):
    def __init__(self, unique_book_ids, num_tokens, embedding_matrix, 
                 user_emb_dim=32, dropout_rate=0.2):
        super(UserTower, self).__init__()

        self.book_embedding = tf.keras.Sequential([
            tf.keras.layers.IntegerLookup(
                vocabulary=unique_book_ids, 
                oov_token=0,
                mask_token=None
            ),
            tf.keras.layers.Embedding(
                input_dim=num_tokens + 1,
                output_dim=embedding_matrix.shape[1],  # 1536 for OpenAI embeddings
                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                trainable=False,  # freeze OpenAI vectors
            )
        ])

        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=8,
            key_dim=embedding_matrix.shape[1] + 1,  # 1536 embedding + 1 rating
        )
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()

        self.dense1 = tf.keras.layers.Dense(128, activation="relu")
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense2 = tf.keras.layers.Dense(user_emb_dim, activation=None)

    def call(self, inputs):
        user_history = inputs["user_history"]
        history_ratings = inputs["history_ratings"]

        history_emb = self.book_embedding(user_history)  # (batch, seq_len, 1536)

        mean_rating = tf.reduce_mean(history_ratings, axis=1, keepdims=True)  # (batch, 1)
        norm_ratings = history_ratings - mean_rating  # (batch, seq_len)
        ratings_expanded = tf.expand_dims(norm_ratings, -1)  # (batch, seq_len, 1)
        concat_input = tf.concat([history_emb, ratings_expanded], axis=-1)  # (batch, seq_len, 1537)

        attn_out = self.attention(concat_input, concat_input)

        pooled = self.pooling(attn_out)

        x = self.dense1(pooled)
        x = self.dropout(x)
        user_emb = self.dense2(x)

        return user_emb


**Testing user tower**

In [None]:
if __name__ == "__main__":
    import numpy as np
    import tensorflow as tf

    data = {
        "user_history": [1, 2, 3],
        "history_ratings": [4, 5, 3],
    }
    unique_book_ids = np.unique([1, 2, 3, 4, 5])
    num_books = 5
    embedding_dim = 16
    embedding_matrix = np.random.randn(num_books + 1, embedding_dim).astype(np.float32)

    user_tower = UserTower(
        unique_book_ids=unique_book_ids,
        num_tokens=num_books,
        embedding_matrix=embedding_matrix,
    )

    example = {
        "user_history": tf.constant([data["user_history"]], dtype=tf.int32),
        "history_ratings": tf.constant([data["history_ratings"]], dtype=tf.float32),
    }

    user_embeddings = user_tower(example)

    print("\nUser embeddings shape:", user_embeddings.shape)
    print("User embeddings:", user_embeddings.numpy())



User embeddings shape: (1, 32)
User embeddings: [[ 0.06766859  0.11093511 -0.01294943  0.00491918  0.10109779 -0.03842342
   0.0119473  -0.07863134  0.0555297   0.00208682  0.09757376  0.04419123
   0.09023515  0.02224718  0.06222174 -0.01144662  0.0993228   0.03351767
  -0.07903306 -0.05718812 -0.02116466  0.01044522 -0.01023974  0.06593293
   0.05849518  0.1276843   0.04413452 -0.14104119  0.01700931 -0.04928908
  -0.051283    0.0340188 ]]


**Candidate Tower**

Book Model representing books and its features

In [24]:
class ItemTower(tf.keras.Model):
    def __init__(self, unique_book_ids, num_tokens, embedding_matrix, user_emb_dim=32, dropout_rate=0.2):
        super().__init__()
        self.vocabulary = unique_book_ids
        self.embedding_matrix = embedding_matrix

        self.book_embedding = tf.keras.Sequential([
            tf.keras.layers.IntegerLookup(
                vocabulary=unique_book_ids, 
                oov_token=0,
                mask_token=None
            ),
            tf.keras.layers.Embedding(
                input_dim=num_tokens + 1,
                output_dim=embedding_matrix.shape[1],  # 1536 for OpenAI embeddings
                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                trainable=False,  # freeze OpenAI vectors
            )
        ])
        
        self.rating_norm = tf.keras.layers.Normalization(axis=None)
        
        self.dense1 = tf.keras.layers.Dense(128, activation="relu")
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense2 = tf.keras.layers.Dense(user_emb_dim, activation=None)

    def call(self, inputs):
        book_embbeding = self.book_embedding(inputs["book_id"])
        rating_norm = self.rating_norm(inputs["avg_rating"])
        rating_norm = tf.expand_dims(rating_norm, -1)
        concat_input = tf.concat([book_embbeding, rating_norm], axis=-1)

        x = self.dense1(concat_input)
        x = self.dropout(x)
        book_emb = self.dense2(x)
        return book_emb

**Testing candidate tower**

In [27]:
if __name__ == "__main__":
    data = {
        "book_id": [1],
        "avg_rating": [4],
    }
    unique_book_ids = np.unique([1, 2, 3, 4, 5])
    num_books = 5
    embedding_dim = 16
    embedding_matrix = np.random.randn(num_books + 1, embedding_dim).astype(np.float32)

    book_tower = ItemTower(
        unique_book_ids=unique_book_ids,
        num_tokens=num_books,
        embedding_matrix=embedding_matrix,
    )

    example = {
        "book_id": tf.constant(data["book_id"], dtype=tf.int32),   # shape (1,)
        "avg_rating": tf.constant(data["avg_rating"], dtype=tf.float32),
    }

    book_embeddings = book_tower(example)

    print("\Book embeddings shape:", book_embeddings.shape)
    print("Book embeddings:", book_embeddings.numpy())


\Book embeddings shape: (1, 32)
Book embeddings: [[-0.5214573  -0.27175915  0.7812703   0.4146685   0.21205088  0.6487028
  -1.0896047  -0.72849023  0.00453929 -0.62940025 -0.80202055 -0.35994175
   0.80723923  0.8293911  -0.42283547 -0.3236346  -0.4046202   0.1455199
  -0.76533824 -0.23459682 -0.37911275 -0.3912872   0.46213612  1.085798
   0.49305153 -1.594736   -0.5080809  -0.15446183  0.68937624  0.229936
   0.73347116  0.96787405]]


TWO TOWER MODEL

In [None]:
class BookRetrievalModel(tfrs.models.Model):
    def __init__(self, user_tower, item_tower, book):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=tf.data.Dataset.from_tensor_slices(book).batch(128).map(item_tower)
            )
        )
    def call(self, features):
        user_embeddings = self.user_tower({
            "user_history": features["user_history"],
            "history_ratings": features["history_ratings"]
        })

        book_embeddings = self.item_tower({
            "book_id": features["book_id"],
            "avg_rating": features["avg_rating"]
        })

        return user_embeddings, book_embeddings


    def compute_loss(self, features, training=False):
        user_embeddings, book_embeddings = self(features)
        return self.task(user_embeddings, book_embeddings)


Training

In [None]:
book_ids = ["b1", "b2", "b3"]
embedding_matrix = np.random.randn(len(book_ids), 4).astype(np.float32)

user_tower = UserTower(book_ids, embedding_matrix)
item_tower = ItemTower(book_ids, embedding_matrix)

model = BookRetrievalModel(user_tower, item_tower, book_ids)

train_data = tf.data.Dataset.from_tensor_slices({
    "book_history": [["b1", "b2"]],
    "history_ratings": [[5.0, 4.0]],
    "book_id": ["b3"],   # book user actually watched next
}).batch(1)

model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
model.fit(train_data, epochs=5)


Inference

In [None]:
user_embedding = user_tower((tf.constant([["b1", "b2"]]), tf.constant([[5.0, 4.0]])))

item_embeddings = item_tower(tf.constant(book_ids))

scores = tf.linalg.matmul(user_embedding, item_embeddings, transpose_b=True)
top_k = tf.argsort(scores, direction="DESCENDING")
print("Top recommendations:", [book_ids[i] for i in top_k[0, :3].numpy()])
