<a href="https://colab.research.google.com/github/minhaj-mhd/two-tower-recommedation/blob/main/two_tower_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Environment setup: install/upgrade the third-party packages this notebook imports.
# NOTE(review): no versions are pinned, so the environment depends on whatever pip
# resolves at run time; the recorded output of this cell reports a dependency
# conflict involving tensorflow-decision-forests — consider pinning versions.
print("⏳ Installing and upgrading all required packages...")

# TensorFlow stack (upgraded in place, quiet output).
%pip install --upgrade -q tensorflow tensorflow-recommenders tf-keras tensorflow-text
# Faiss (CPU build), imported below as `faiss` for the vector index.
%pip install -q faiss-cpu

print("\n✅ All packages have been installed and upgraded.")

⏳ Installing and upgrading all required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.11.0 requires tensorflow=

In [3]:
# Upgrade tensorflow-decision-forests separately from the main install cell.
# NOTE(review): within the visible notebook this package is only imported for the
# version report; presumably this upgrade addresses the resolver conflict printed
# by the previous cell — confirm it is still needed.
%pip install --upgrade -q tensorflow-decision-forests


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tf_keras
import faiss
import tensorflow_text as tf_text
import tensorflow_decision_forests as tfdf

# Report the installed version of each package imported above, one per line.
for package_label, package_module in (
    ("tensorflow", tf),
    ("tensorflow-recommenders", tfrs),
    ("tf-keras", tf_keras),
    ("faiss-cpu", faiss),
    ("tensorflow-text", tf_text),
    ("tensorflow-decision-forests", tfdf),
):
    print(f"{package_label}: {package_module.__version__}")

ImportError: cannot import name 'float8_e4m3b11fnuz' from 'tensorflow.python.framework.dtypes' (/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/dtypes.py)

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.layers import TextVectorization
import faiss
from collections import defaultdict

# --- Step 1: Fabricate Data with Category-based User Behavior ---
# Builds a synthetic item catalogue (`items_df`) and a user→item interaction log
# (`interactions_df`) in which every user only interacts with two categories.
print("[1] Fabricating data with category-based user preferences...")

# Fixed seed + dedicated generator so the fabricated users/interactions are
# identical on every Restart & Run All (the original draws were unseeded).
SEED = 42
rng = np.random.default_rng(SEED)

categories = ["gadget", "apparel", "book", "tool", "toy", "utensil"]
num_items = 5000
num_users = 500

# Create items with explicit category tracking
item_titles = [f"Product {i}" for i in range(num_items)]
# Categories are assigned round-robin, so every category gets an equal share of items.
item_categories = [categories[i % len(categories)] for i in range(num_items)]
description_templates = [
    lambda i, cat: f"High-quality, durable {cat} for all your needs. Model v{i % 10}. Made from premium materials.",
    lambda i, cat: f"An affordable and reliable {cat}. Perfect for beginners. Item #{i}.",
    lambda i, cat: f"The ultimate professional-grade {cat}. Features advanced technology. SKU {i}.",
]
# Templates are cycled by item index; the category name is embedded in every description.
item_descriptions = [description_templates[i % 3](i, item_categories[i]) for i in range(num_items)]

items_data = {
    "item_id": [str(i) for i in range(num_items)],
    "item_title": item_titles,
    "item_description": item_descriptions,
    "category": item_categories
}
items_df = pd.DataFrame(items_data)

# Create category-to-items mapping for easier lookup.
# Zipping the two columns avoids the per-row Series construction of iterrows().
category_to_items = defaultdict(list)
for item_id, category in zip(items_df["item_id"], items_df["category"]):
    category_to_items[category].append(item_id)

# Generate user interactions: each user interacts with exactly 2 categories
print("Generating user interactions with category preferences...")
user_interactions = []
user_categories = {}  # Track which categories each user prefers

for user_id in range(num_users):
    # Each user randomly selects 2 distinct categories
    preferred_categories = rng.choice(categories, size=2, replace=False)
    user_categories[str(user_id)] = preferred_categories

    # Generate 10 interactions for this user (5 from each category)
    for category in preferred_categories:
        # Select 5 random items from this category (with replacement, so an
        # item may appear more than once for the same user)
        available_items = category_to_items[category]
        selected_items = rng.choice(available_items, size=5, replace=True)

        for item_id in selected_items:
            user_interactions.append({
                "user_id": str(user_id),
                "item_id": item_id
            })

interactions_df = pd.DataFrame(user_interactions)
print(f"Generated {len(items_df)} items and {len(interactions_df)} interactions.")
print(f"Each user interacts with exactly 2 categories out of {len(categories)} total categories.")

# Display some user preferences for verification
print("\nSample user category preferences:")
for i in range(5):
    user_id = str(i)
    print(f"User {user_id}: {user_categories[user_id]}")

# Expose the item catalogue as a tf.data pipeline keyed by column name.
items_ds = tf.data.Dataset.from_tensor_slices(dict(items_df))

# --- Step 2: Self-Supervised Item Tower ---
print("\n[2] Building and training the self-supervised Item Tower...")
embedding_dimension = 32
max_tokens = 10_000
sequence_length = 100

# Text preprocessing layer; its vocabulary is fitted on the item descriptions only.
text_vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=sequence_length,
)
text_vectorizer.adapt(
    items_ds.map(lambda item: item["item_description"]).batch(128)
)

class ItemModel(tf.keras.Model):
    """Item tower: turns an item description into a dense embedding.

    The description is vectorized, embedded token by token, average-pooled and
    then projected by a small MLP to ``embedding_dimension`` outputs.

    Args:
        vectorizer: An already-adapted text vectorization layer; its
            vocabulary size determines the embedding table size.
    """

    def __init__(self, vectorizer):
        super().__init__()
        self.vectorizer = vectorizer

        # Text -> token ids -> token embeddings -> pooled vector.
        text_encoder_layers = [
            self.vectorizer,
            tf.keras.layers.Embedding(
                input_dim=self.vectorizer.vocabulary_size(),
                output_dim=embedding_dimension,
                mask_zero=True,
            ),
            tf.keras.layers.GlobalAveragePooling1D(),
        ]
        self.embedding = tf.keras.Sequential(text_encoder_layers)

        # Projection head producing the final item embedding.
        projection_layers = [
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(embedding_dimension),
        ]
        self.dense = tf.keras.Sequential(projection_layers)

    def call(self, inputs):
        """Embed ``inputs["item_description"]``; other keys are ignored."""
        pooled_text = self.embedding(inputs["item_description"])
        return self.dense(pooled_text)

class SelfSupervisedItemTwoTower(tfrs.Model):
    """Trains the item tower against itself with a TFRS retrieval task.

    The same item embeddings are passed as both queries and candidates.

    Args:
        item_model: Model mapping a feature dict to item embeddings.
    """

    def __init__(self, item_model):
        super().__init__()
        self.item_model = item_model
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of item features."""
        embedded_items = self.item_model(features)
        return self.task(
            query_embeddings=embedded_items,
            candidate_embeddings=embedded_items,
        )

# Instantiate the item tower and its self-supervised training wrapper.
item_tower = ItemModel(text_vectorizer)
item_model_trainer = SelfSupervisedItemTwoTower(item_tower)
item_model_trainer.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.05))

# Only the description column is needed for training; batches are cached after
# the first pass so later epochs skip the map/batch work.
train_item_ds = (
    items_ds
    .map(lambda item: {"item_description": item["item_description"]})
    .batch(256)
    .cache()
)
item_model_trainer.fit(train_item_ds, epochs=5)
print("Item Tower training complete.")

# --- Step 3: Generate and Store Item Embeddings in Faiss ---
print("\n[3] Generating item embeddings and storing in Faiss...")
# Both towers are trained with tfrs.tasks.Retrieval, which scores query/candidate
# pairs by dot product. The serving index therefore has to rank by inner product
# as well: an L2 index would also penalise the candidate's norm and could return
# a different ordering than the one the model was optimised for.
# NOTE: with an inner-product index, `index.search` returns similarity scores
# (higher is better) rather than distances (lower is better).
index = faiss.IndexFlatIP(embedding_dimension)

# Embed the whole catalogue in batches; the item tower only reads "item_description".
item_embeddings_generator = items_ds.batch(256).map(lambda x: item_tower(x))
all_item_embeddings = np.concatenate(list(item_embeddings_generator.as_numpy_iterator()))
index.add(all_item_embeddings)
print(f"Faiss index now contains {index.ntotal} vectors.")

# Faiss returns row positions; map them back to item ids (same order as items_df).
index_to_item_id = dict(enumerate(items_df["item_id"]))

# --- Step 4: Train the User Tower ---
print("\n[4] Building and training the User Tower...")
# Distinct user ids seen in the interaction log; used as the user tower vocabulary.
unique_user_ids = interactions_df.loc[:, "user_id"].unique()

class UserModel(tf.keras.Model):
    """User tower: maps a user id string to a dense embedding.

    Args:
        user_ids: Known user ids, used as the lookup vocabulary.
    """

    def __init__(self, user_ids):
        super().__init__()

        # One extra embedding row on top of the known ids (len(user_ids) + 1).
        id_lookup = tf.keras.layers.StringLookup(vocabulary=user_ids, mask_token=None)
        id_embedding = tf.keras.layers.Embedding(len(user_ids) + 1, embedding_dimension)
        self.user_embedding = tf.keras.Sequential([id_lookup, id_embedding])

        # Projection head mirroring the one used by the item tower.
        hidden = tf.keras.layers.Dense(64, activation="relu")
        output = tf.keras.layers.Dense(embedding_dimension)
        self.dense = tf.keras.Sequential([hidden, output])

    def call(self, inputs):
        """Return embeddings for a tensor of user id strings."""
        looked_up = self.user_embedding(inputs)
        return self.dense(looked_up)

class UserItemRetrievalModel(tfrs.Model):
    """Trains the user tower against a frozen item tower.

    Args:
        user_model: Model mapping user id strings to embeddings.
        item_model: Model mapping ``{"item_description": ...}`` to embeddings;
            it is marked non-trainable here.
    """

    def __init__(self, user_model, item_model):
        super().__init__()
        self.user_model = user_model
        self.item_model = item_model
        # Keep item model frozen so only the user tower is updated.
        self.item_model.trainable = False

        # Simple retrieval task
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, data, training=False):
        """Return the retrieval loss for one batch of interactions.

        ``data`` must provide "user_id" and "item_description"; any other
        keys in the batch are ignored.
        """
        query_vectors = self.user_model(data["user_id"])

        # Embed the descriptions of the items the users interacted with.
        candidate_vectors = self.item_model(
            {"item_description": data["item_description"]}
        )

        return self.task(
            query_embeddings=query_vectors,
            candidate_embeddings=candidate_vectors,
        )

# Prepare training data with item descriptions and categories.
# The inner join attaches each interaction's item text so the frozen item tower
# can embed it during user-tower training.
interactions_with_details_df = pd.merge(
    interactions_df,
    items_df[['item_id', 'item_description', 'category']],
    on='item_id'
)
full_interactions_ds = tf.data.Dataset.from_tensor_slices(dict(interactions_with_details_df))

# Cache BEFORE shuffling: caching after shuffle/batch freezes the first epoch's
# batches, so every later epoch would replay exactly the same order. With the
# cache first, the raw rows are stored once and shuffle() reorders them again on
# each epoch.
train_ds_user = full_interactions_ds.cache().shuffle(10_000).batch(256)

user_tower = UserModel(unique_user_ids)
user_model_trainer = UserItemRetrievalModel(user_tower, item_tower)
user_model_trainer.compile(optimizer=tf.keras.optimizers.Adagrad(0.05))

# Train the user model
user_model_trainer.fit(train_ds_user, epochs=5)
print("User Tower training complete.")

# --- Step 5: Serve and Validate Recommendations ---
print("\n[5] Serving and validating recommendations...")

def get_recommendations_with_validation(user_id, top_k=10):
    """Print the top-k Faiss recommendations for a user and score them.

    A recommendation counts as correct when its category is one of the
    categories stored for the user in ``user_categories``.

    Args:
        user_id: User id string as it appears in ``unique_user_ids``.
        top_k: Number of neighbours requested from the Faiss index; must be
            a positive integer.

    Returns:
        The fraction of returned recommendations whose category matches the
        user's preferences (0.0 if the index returned nothing), or ``None``
        when the user is unknown (cold start).

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # Previously this surfaced later as a division by zero.
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    print(f"\n--- Getting recommendations for user '{user_id}' ---")

    if user_id not in unique_user_ids:
        print(f"User '{user_id}' is a new user (cold start).")
        return None

    # Get user's preferred categories
    preferred_categories = user_categories[user_id]
    print(f"User's preferred categories: {preferred_categories}")

    # Get user embedding and find similar items
    user_embedding = user_tower(tf.constant([user_id])).numpy()
    distances, indices = index.search(user_embedding, top_k)

    # Faiss pads the result with -1 when the index holds fewer than top_k
    # vectors; drop those slots instead of failing on the id lookup.
    hits = [(value, idx) for value, idx in zip(distances[0], indices[0]) if idx >= 0]
    num_returned = len(hits)

    # Inner-product indexes return similarity scores, L2 indexes return distances.
    is_inner_product = getattr(index, "metric_type", None) == faiss.METRIC_INNER_PRODUCT
    metric_label = "Score" if is_inner_product else "Distance"

    # Build the id -> row lookup once instead of scanning items_df per hit.
    items_by_id = items_df.set_index("item_id")

    print(f"Top {top_k} recommendations:")
    category_counts = defaultdict(int)

    for rank, (value, idx) in enumerate(hits, start=1):
        item_id = index_to_item_id[idx]
        item_row = items_by_id.loc[item_id]
        item_title = item_row['item_title']
        item_category = item_row['category']
        category_counts[item_category] += 1

        # Mark if recommendation matches user's preferences
        is_preferred = "✓" if item_category in preferred_categories else "✗"
        print(f"  {rank}. {is_preferred} Item ID: {item_id} | Category: {item_category} | Title: '{item_title}' ({metric_label}: {value:.4f})")

    # Calculate recommendation accuracy over the results actually returned.
    # Indexing the defaultdict also registers preferred categories with zero
    # hits, so they show up in the distribution printed below.
    correct_recommendations = sum(category_counts[cat] for cat in preferred_categories)
    accuracy = correct_recommendations / num_returned if num_returned else 0.0

    print(f"\n--- Recommendation Analysis ---")
    print(f"Category distribution in recommendations:")
    for category, count in category_counts.items():
        percentage = (count / num_returned) * 100 if num_returned else 0.0
        is_preferred = "✓" if category in preferred_categories else "✗"
        print(f"  {is_preferred} {category}: {count}/{num_returned} ({percentage:.1f}%)")

    print(f"Accuracy: {correct_recommendations}/{num_returned} ({accuracy:.1%}) recommendations match user preferences")

    return accuracy

# Run the validation helper for a handful of users and summarise the results.
print("\n" + "=" * 60)
print("TESTING RECOMMENDATION ACCURACY")
print("=" * 60)

accuracies = []
test_users = ["0", "1", "2", "3", "4", "10", "25", "50"]

for user_id in test_users:
    # Skip ids that never appeared in the interaction log.
    if user_id not in unique_user_ids:
        continue
    accuracy = get_recommendations_with_validation(user_id)
    if accuracy is None:
        continue
    accuracies.append(accuracy)

# Only report when at least one user produced a score.
if accuracies:
    avg_accuracy = np.mean(accuracies)
    print("\n" + "=" * 60)
    print("OVERALL RESULTS")
    print("=" * 60)
    print(f"Average recommendation accuracy: {avg_accuracy:.1%}")
    print(f"Tested {len(accuracies)} users")

    # Thresholds: > 0.7 good, > 0.5 moderate, otherwise poor.
    print(
        "✓ Good performance! Most recommendations match user preferences."
        if avg_accuracy > 0.7
        else "~ Moderate performance. Some recommendations match user preferences."
        if avg_accuracy > 0.5
        else "✗ Poor performance. Few recommendations match user preferences."
    )

[1] Fabricating data with category-based user preferences...
Generating user interactions with category preferences...
Generated 5000 items and 5000 interactions.
Each user interacts with exactly 2 categories out of 6 total categories.

Sample user category preferences:
User 0: ['utensil' 'gadget']
User 1: ['book' 'apparel']
User 2: ['tool' 'toy']
User 3: ['toy' 'tool']
User 4: ['toy' 'tool']

[2] Building and training the self-supervised Item Tower...
Epoch 1/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 1313.1624 - regularization_loss: 0.0000e+00 - total_loss: 1313.1624
Epoch 2/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 1005.8630 - regularization_loss: 0.0000e+00 - total_loss: 1005.8630
Epoch 3/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 968.2358 - regularization_loss: 0.0000e+00 - total_loss: 968.2358  
Epoch 4/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 