<a href="https://colab.research.google.com/github/minhaj-mhd/30-Days-Of-React/blob/master/two_tower_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
print("⏳ Installing and upgrading all required packages...")

%pip install --upgrade -q tensorflow tensorflow-recommenders tf-keras tensorflow-text
%pip install -q faiss-cpu

print("\n✅ All packages have been installed and upgraded.")

⏳ Installing and upgrading all required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.11.0 requires tensorflow=

In [4]:
%pip install --upgrade -q tensorflow-decision-forests


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tf_keras
import faiss
import tensorflow_text as tf_text
import tensorflow_decision_forests as tfdf

print(f"tensorflow: {tf.__version__}")
print(f"tensorflow-recommenders: {tfrs.__version__}")
print(f"tf-keras: {tf_keras.__version__}")
print(f"faiss-cpu: {faiss.__version__}")
print(f"tensorflow-text: {tf_text.__version__}")
print(f"tensorflow-decision-forests: {tfdf.__version__}")

tensorflow: 2.19.0
tensorflow-recommenders: v0.7.3
tf-keras: 2.19.0
faiss-cpu: 1.11.0
tensorflow-text: 2.19.0
tensorflow-decision-forests: 1.12.0


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.layers import TextVectorization
import faiss

# --- Step 1: Fabricate Data ---
print("[1] Fabricating data...")
categories = ["gadget", "apparel", "book", "tool", "toy", "utensil"]
num_items = 5000
item_titles = [f"Product {i}" for i in range(num_items)]
description_templates = [
    lambda i: f"High-quality, durable {categories[i % len(categories)]} for all your needs. Model v{i % 10}. Made from premium materials.",
    lambda i: f"An affordable and reliable {categories[i % len(categories)]}. Perfect for beginners. Item #{i}.",
    lambda i: f"The ultimate professional-grade {categories[i % len(categories)]}. Features advanced technology. SKU {i}.",
]
item_descriptions = [description_templates[i % 3](i) for i in range(num_items)]
items_data = {
    "item_id": [str(i) for i in range(num_items)],
    "item_title": item_titles,
    "item_description": item_descriptions
}
items_df = pd.DataFrame(items_data)

num_users = 500
num_interactions = 20000
users_data = {
    "user_id": [str(np.random.randint(0, num_users)) for _ in range(num_interactions)],
    "item_id": [str(np.random.randint(0, num_items)) for _ in range(num_interactions)],
}
interactions_df = pd.DataFrame(users_data)
print(f"Generated {len(items_df)} items and {len(interactions_df)} interactions.")
items_ds = tf.data.Dataset.from_tensor_slices(dict(items_df))

# --- Step 2: Self-Supervised Item Tower ---
print("\n[2] Building and training the self-supervised Item Tower...")
embedding_dimension = 32
max_tokens = 10_000
sequence_length = 100

text_vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=sequence_length)
text_vectorizer.adapt(items_ds.map(lambda x: x["item_description"]).batch(128))

class ItemModel(tf.keras.Model):
    def __init__(self, vectorizer):
        super().__init__()
        self.vectorizer = vectorizer
        self.embedding = tf.keras.Sequential([
            self.vectorizer,
            tf.keras.layers.Embedding(input_dim=self.vectorizer.vocabulary_size(), output_dim=embedding_dimension, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.dense = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(embedding_dimension)
        ])
    def call(self, inputs):
        return self.dense(self.embedding(inputs["item_description"]))

class SelfSupervisedItemTwoTower(tfrs.Model):
    def __init__(self, item_model):
        super().__init__()
        self.item_model = item_model
        self.task = tfrs.tasks.Retrieval()
    def compute_loss(self, features, training=False):
        item_embeddings = self.item_model(features)
        return self.task(query_embeddings=item_embeddings, candidate_embeddings=item_embeddings)

item_tower = ItemModel(text_vectorizer)
item_model_trainer = SelfSupervisedItemTwoTower(item_tower)
item_model_trainer.compile(optimizer=tf.keras.optimizers.Adagrad(0.05))
train_item_ds = items_ds.map(lambda x: {"item_description": x["item_description"]}).batch(256).cache()
item_model_trainer.fit(train_item_ds, epochs=5)
print("Item Tower training complete.")

# --- Step 3: Generate and Store Item Embeddings in Faiss ---
print("\n[3] Generating item embeddings and storing in Faiss...")
index = faiss.IndexFlatL2(embedding_dimension)
item_embeddings_generator = items_ds.batch(256).map(lambda x: item_tower(x))
all_item_embeddings = np.concatenate(list(item_embeddings_generator.as_numpy_iterator()))
index.add(all_item_embeddings)
print(f"Faiss index now contains {index.ntotal} vectors.")
index_to_item_id = {i: item_id for i, item_id in enumerate(items_df["item_id"])}

# --- Step 4: Train the User Tower ---
print("\n[4] Building and training the User Tower...")
unique_user_ids = np.unique(interactions_df["user_id"])

# Pre-compute all item embeddings to use as candidates for metrics
print("Pre-computing all item embeddings into a single tensor for the candidate set...")
# Convert to NumPy array before passing to FactorizedTopK
all_item_embeddings_np = np.concatenate(list(items_ds.batch(256).map(item_tower).as_numpy_iterator()))
print(f"Candidate embeddings numpy array shape: {all_item_embeddings_np.shape}")
item_ids_tensor = tf.constant(item_titles)
item_embeddings_tensor = tf.convert_to_tensor(all_item_embeddings_np, dtype=tf.float32)
candidate_dataset = items_ds.map(lambda x: (x["item_id"], {"item_description": x["item_description"]}))
class UserModel(tf.keras.Model):
    def __init__(self, user_ids):
        super().__init__()
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(user_ids) + 1, embedding_dimension)
        ])
        self.dense = tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(embedding_dimension)
        ])
    def call(self, inputs):
        return self.dense(self.user_embedding(inputs))

class UserItemRetrievalModel(tfrs.Model):
    def __init__(self, user_model, item_model, candidate_dataset):
        super().__init__()
        self.user_model = user_model
        self.item_model = item_model
        self.item_model.trainable = False

        # Pass the pre-computed item embeddings directly to FactorizedTopK
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=candidate_dataset,
                                                ks=[1, 5, 10])
        )

    def compute_loss(self, data, training=False):
        user_embeddings = self.user_model(data["user_id"])
        item_embeddings = self.item_model({"item_description": data["item_description"]})
        # Pass the true item embeddings and IDs from the batch to the task for loss and metric calculation
        return self.task(
            query_embeddings=user_embeddings,
            candidate_embeddings=item_embeddings,
            candidate_ids=data["item_id"]
        )


interactions_with_desc_df = pd.merge(interactions_df, items_df[['item_id', 'item_description']], on='item_id')
full_interactions_ds = tf.data.Dataset.from_tensor_slices(dict(interactions_with_desc_df))
train_ds_user = full_interactions_ds.shuffle(10_000).batch(256).cache()

user_tower = UserModel(unique_user_ids)
# Pass the pre-computed item embeddings (NumPy array) for metrics
user_model_trainer = UserItemRetrievalModel(user_tower, item_tower, candidate_dataset)
user_model_trainer.compile(optimizer=tf.keras.optimizers.Adagrad(0.05))
user_model_trainer.fit(train_ds_user, epochs=5)
print("User Tower training complete.")

# --- Step 5: Serve Recommendations ---
print("\n[5] Serving recommendations...")
def get_recommendations(user_id, top_k=5):
    print(f"\n--- Getting recommendations for user '{user_id}' ---")
    if user_id not in unique_user_ids:
        print(f"User '{user_id}' is a new user (cold start).")
        random_user = np.random.choice(unique_user_ids)
        print(f"Simulating recommendations based on a similar user: '{random_user}'")
        user_id = random_user

    user_embedding = user_tower(tf.constant([user_id])).numpy()
    distances, indices = index.search(user_embedding, top_k)
    print(f"Top {top_k} recommendations:")
    for i, idx in enumerate(indices[0]):
        item_id = index_to_item_id[idx]
        item_title = items_df[items_df['item_id'] == item_id]['item_title'].values[0]
        print(f"  {i+1}. Item ID: {item_id} | Title: '{item_title}' (Distance: {distances[0][i]:.4f})")

get_recommendations("10")
get_recommendations("123")
get_recommendations("9999")

[1] Fabricating data...
Generated 5000 items and 20000 interactions.

[2] Building and training the self-supervised Item Tower...
Epoch 1/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - loss: 1200.0494 - regularization_loss: 0.0000e+00 - total_loss: 1200.0494
Epoch 2/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 1047.5115 - regularization_loss: 0.0000e+00 - total_loss: 1047.5115
Epoch 3/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 975.6721 - regularization_loss: 0.0000e+00 - total_loss: 975.6721  
Epoch 4/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 963.3029 - regularization_loss: 0.0000e+00 - total_loss: 963.3029  
Epoch 5/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 962.1260 - regularization_loss: 0.0000e+00 - total_loss: 962.1260
Item Tower training complete.

[3] Generating item embeddings and storin

ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 