# Content-Based Recommendation System using BERT

## 1. Objectives & Business Value
This notebook implements **Content-Based Recommendation Systems** leveraging **BERT embeddings** to understand product nuances (Brand, Category, Price).

### Core Components:
1.  **User-Based Content Recommender:** Personalized suggestions based on a user's interaction history profile.
2.  **Brand/Item-Based Recommender:** "More like this" suggestions based on brand and price similarity.

### Business Value:
* **Discoverability:** Enables users to find alternative products within the same brand ecosystem.
* **Cold Start:** Provides recommendations for products based on metadata even if interaction data is sparse.

## 2. Setup

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# GPU Check: print how many GPU devices TensorFlow can see.
# NOTE(review): device_lib is imported here but not used in this cell; the
# later device-listing cell imports it again before calling it.
from tensorflow.python.client import device_lib
print(f"GPUs Available: {len(tf.config.list_physical_devices('GPU'))}")

# Set Seeds: seed both the NumPy and the TensorFlow random generators with
# the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

## 3. Data Loading & Feature Engineering
We load the preprocessed master dataset and generate textual features for BERT.

In [None]:
import warnings
# Install a blanket "ignore" filter: from here on every warning category is
# suppressed for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
from tensorflow.python.client import device_lib
# Print every local device TensorFlow reports, then the number of GPUs.
print(device_lib.list_local_devices())
print ("No of GPUS present:",len(tf.config.list_physical_devices('GPU')))


In [None]:
# Read the preprocessed master dataset from the working directory.
data = pd.read_csv("Master_Ecommerce_Dataset.csv")

# Missing brand / category values become the literal 'unknown' so the string
# concatenation below never meets NaN.
data[['brand', 'category_code']] = data[['brand', 'category_code']].fillna('unknown')

# Min-max scale the price column; the scaled value is written into the text
# that is later fed to BERT.
scaler = MinMaxScaler()
data['price_norm'] = scaler.fit_transform(data[['price']])[:, 0]

# Text feature layout: "<brand> <category_code> price <price_norm>".
data['text'] = (
    data['brand']
    + " "
    + data['category_code']
    + " price "
    + data['price_norm'].astype(str)
)



In [None]:



# Initialize BERT: load the pretrained 'bert-base-uncased' tokenizer and the
# TensorFlow model. Both names are read as globals by get_bert_embeddings,
# which is defined right after this.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=64):
    """Encode texts with the notebook-level BERT model and return first-token vectors.

    Uses the global ``tokenizer`` and ``bert_model``. Every text is converted
    with ``str()``, padded/truncated to 32 tokens, and run through the model
    in slices of ``batch_size``; the hidden state at position 0 (the CLS
    token) of each text is kept.

    Args:
        texts: Sliceable sequence of items to embed (must support ``len``).
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A NumPy array with one row per input text. For empty ``texts`` an
        array of shape ``(0, hidden_size)`` is returned instead of letting
        ``np.concatenate`` raise on an empty list.
    """
    total = len(texts)
    if total == 0:
        # Keep the 2-D contract for callers even when nothing was embedded.
        # NOTE(review): float32 is assumed to match the model output dtype.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    # Process in batches to manage memory
    for i in range(0, total, batch_size):
        batch = [str(x) for x in texts[i:i + batch_size]]

        inputs = tokenizer(
            batch,
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="tf"
        )

        outputs = bert_model(**inputs)
        # Extract CLS token (contextual representation)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        all_embeddings.append(cls_embeddings.numpy())

        # Progress is rewritten in place (carriage return) every 10 batches.
        if i % (batch_size * 10) == 0:
            print(f"Processed {i}/{total}", end='\r')

    # Terminate the in-place progress line so later output is not glued to it.
    print(f"Processed {total}/{total}")

    return np.concatenate(all_embeddings, axis=0)

# Embed every distinct product exactly once (first row per product_id wins).
print("Generating Product Embeddings...")
unique_products = data.drop_duplicates('product_id')[
    ['product_id', 'text', 'brand', 'category_code', 'price']
]
product_embeddings = get_bert_embeddings(list(unique_products['text']))

# Lookup table: product_id -> its embedding row.
product_id_to_embedding = dict(zip(unique_products['product_id'], product_embeddings))


In [7]:
# Load the pretrained 'bert-base-uncased' tokenizer into the global `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load the pretrained 'bert-base-uncased' TensorFlow model into the global `bert_model`.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [9]:


def get_bert_embeddings(texts, tokenizer, model, batch_size=64, save_path="embeddings_chunk.npy"):
    """Encode texts with ``model`` and return first-token vectors as a tensor.

    Every text is converted with ``str()``, padded/truncated to 32 tokens and
    run through ``model`` in slices of ``batch_size``; the hidden state at
    position 0 (the CLS token) of each text is kept. Everything embedded so
    far is written to ``save_path`` every 10 batches, and the complete result
    is written once more at the end so the file always matches the return
    value.

    Args:
        texts: Sliceable sequence of items to embed (must support ``len``).
        tokenizer: Callable tokenizer, invoked with ``return_tensors="tf"``.
        model: Model called as ``model(**inputs)`` whose output exposes
            ``last_hidden_state``.
        batch_size: Number of texts per forward pass.
        save_path: ``.npy`` file that receives the checkpoints.

    Returns:
        A TensorFlow tensor with one row per input text. For empty ``texts``
        a ``(0, hidden_size)`` tensor is returned and nothing is written.
    """
    total = len(texts)
    if total == 0:
        # np.concatenate would raise on an empty list; return a well-formed
        # empty result instead.
        # NOTE(review): float32 is assumed to match the model output dtype.
        return tf.convert_to_tensor(
            np.empty((0, model.config.hidden_size), dtype=np.float32)
        )

    all_embeddings = []

    for i in range(0, total, batch_size):
        batch = [str(x) for x in texts[i:i + batch_size]]

        inputs = tokenizer(batch, padding="max_length", truncation=True, max_length=32, return_tensors="tf")
        outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        all_embeddings.append(cls_embeddings.numpy())

        # Save chunk periodically. Each checkpoint rewrites all rows produced
        # so far, so the cost of one checkpoint grows with progress.
        if i % (batch_size * 10) == 0:
            print(f"Processed {i}/{total}")
            np.save(save_path, np.concatenate(all_embeddings, axis=0))

    result = np.concatenate(all_embeddings, axis=0)
    # Without this final write the file would stop at the last 10-batch
    # boundary and miss the trailing batches.
    np.save(save_path, result)

    return tf.convert_to_tensor(result)


## 4. BERT Embedding Generation
We use a pre-trained `bert-base-uncased` model to convert product text descriptions into dense vector embeddings.

In [None]:
# Initialize BERT: load the pretrained 'bert-base-uncased' tokenizer and the
# TensorFlow model. Both names are read as globals by get_bert_embeddings,
# which is defined right after this.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=64):
    """Encode texts with the notebook-level BERT model and return first-token vectors.

    Uses the global ``tokenizer`` and ``bert_model``. Every text is converted
    with ``str()``, padded/truncated to 32 tokens, and run through the model
    in slices of ``batch_size``; the hidden state at position 0 (the CLS
    token) of each text is kept.

    Args:
        texts: Sliceable sequence of items to embed (must support ``len``).
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A NumPy array with one row per input text. For empty ``texts`` an
        array of shape ``(0, hidden_size)`` is returned instead of letting
        ``np.concatenate`` raise on an empty list.
    """
    total = len(texts)
    if total == 0:
        # Keep the 2-D contract for callers even when nothing was embedded.
        # NOTE(review): float32 is assumed to match the model output dtype.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    # Process in batches to manage memory
    for i in range(0, total, batch_size):
        batch = [str(x) for x in texts[i:i + batch_size]]

        inputs = tokenizer(
            batch,
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="tf"
        )

        outputs = bert_model(**inputs)
        # Extract CLS token (contextual representation)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        all_embeddings.append(cls_embeddings.numpy())

        # Progress is rewritten in place (carriage return) every 10 batches.
        if i % (batch_size * 10) == 0:
            print(f"Processed {i}/{total}", end='\r')

    # Terminate the in-place progress line so later output is not glued to it.
    print(f"Processed {total}/{total}")

    return np.concatenate(all_embeddings, axis=0)


# Embed every distinct product exactly once (first row per product_id wins).
unique_products = data.drop_duplicates('product_id')[
    ['product_id', 'text', 'brand', 'category_code', 'price']
]
product_embeddings = get_bert_embeddings(list(unique_products['text']))

# Lookup table: product_id -> its embedding row.
product_id_to_embedding = dict(zip(unique_products['product_id'], product_embeddings))

## 5. Model 1: User-Based Content Recommender
**Logic:** A user's profile is the average vector of all products they have viewed or purchased. We recommend products closest to this "User Vector."

**Recommendation function (user → top-K products)**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_for_user(user_id, top_k):
    """Return the ``top_k`` rows of ``data`` most similar to a user's profile vector.

    Looks the user up in the global ``user_id_to_profile``, scores every row
    of the global ``embeddings`` tensor against that profile with cosine
    similarity, and returns the best-scoring rows of ``data`` (columns
    product_id, brand, category_code, price, text), highest score first.

    Prints a message and returns an empty list when the user has no profile.

    NOTE(review): rows are picked with ``data.iloc``, so this assumes row i
    of ``embeddings`` describes row i of ``data`` -- neither
    ``embeddings`` nor ``user_id_to_profile`` is built in the cells visible
    here; confirm the alignment.
    """
    if user_id in user_id_to_profile:
        profile_vec = user_id_to_profile[user_id].reshape(1, -1)

        # One similarity score per embedding row.
        similarity = cosine_similarity(profile_vec, embeddings.numpy())[0]

        # Positions of the best-scoring rows, best first.
        ranked_positions = np.argsort(similarity)[::-1]
        best_positions = ranked_positions[:top_k]

        output_columns = ['product_id', 'brand', 'category_code', 'price', 'text']
        return data.iloc[best_positions][output_columns]

    print("No interaction data for this user.")
    return []


In [None]:
# Example: the 5 highest-scoring recommendations for one user id.
recommend_for_user(513103710,top_k=5)

Unnamed: 0,product_id,brand,category_code,price,text
29862,7900194,joie,furniture.kitchen.chair,73.1,
2661,10900336,smeg,appliances.kitchen.mixer,544.53,
1609,1005130,apple,electronics.smartphone,1558.25,
31918,1004777,xiaomi,electronics.smartphone,135.01,
29298,1307478,lenovo,computers.notebook,252.23,


## 6. Model 2: Brand & Price Based Recommender
**Logic:** Retrieve products with similar embeddings to a query product, with optional filters for price ranges and categories.

**Generate brand embeddings (same pattern as above)**

In [15]:
def get_brand_embeddings(texts, tokenizer, model, batch_size=64, save_path="brand_embeddings_chunk.npy"):
    """Encode brand texts with ``model`` and return first-token vectors as a tensor.

    Every text is converted with ``str()``, padded/truncated to 32 tokens and
    run through ``model`` in slices of ``batch_size``; the hidden state at
    position 0 (the CLS token) of each text is kept. Everything embedded so
    far is written to ``save_path`` every 10 batches, and the complete result
    is written once more at the end so the file always matches the return
    value.

    Args:
        texts: Sliceable sequence of items to embed (must support ``len``).
        tokenizer: Callable tokenizer, invoked with ``return_tensors="tf"``.
        model: Model called as ``model(**inputs)`` whose output exposes
            ``last_hidden_state``.
        batch_size: Number of texts per forward pass.
        save_path: ``.npy`` file that receives the checkpoints.

    Returns:
        A TensorFlow tensor with one row per input text. For empty ``texts``
        a ``(0, hidden_size)`` tensor is returned and nothing is written.
    """
    total = len(texts)
    if total == 0:
        # np.concatenate would raise on an empty list; return a well-formed
        # empty result instead.
        # NOTE(review): float32 is assumed to match the model output dtype.
        return tf.convert_to_tensor(
            np.empty((0, model.config.hidden_size), dtype=np.float32)
        )

    all_embeddings = []

    for i in range(0, total, batch_size):
        batch = [str(x) for x in texts[i:i + batch_size]]

        inputs = tokenizer(batch, padding="max_length", truncation=True, max_length=32, return_tensors="tf")
        outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        all_embeddings.append(cls_embeddings.numpy())

        # Save chunk periodically. Each checkpoint rewrites all rows produced
        # so far, so the cost of one checkpoint grows with progress.
        if i % (batch_size * 10) == 0:
            print(f"Processed {i}/{total}")
            np.save(save_path, np.concatenate(all_embeddings, axis=0))

    result = np.concatenate(all_embeddings, axis=0)
    # Without this final write the file would stop at the last 10-batch
    # boundary and miss the trailing batches.
    np.save(save_path, result)

    return tf.convert_to_tensor(result)

In [16]:
# Embed the `brand_text` column of `data1`, one embedding row per `data1` row.
# NOTE(review): `data1` and its `brand_text` column are not created in the
# cells visible here -- confirm where they are defined.
brand_embeddings = get_brand_embeddings(list(data1['brand_text']), tokenizer, bert_model)

Processed 0/46038
Processed 640/46038
Processed 1280/46038
Processed 1920/46038
Processed 2560/46038
Processed 3200/46038
Processed 3840/46038
Processed 4480/46038
Processed 5120/46038
Processed 5760/46038
Processed 6400/46038
Processed 7040/46038
Processed 7680/46038
Processed 8320/46038
Processed 8960/46038
Processed 9600/46038
Processed 10240/46038
Processed 10880/46038
Processed 11520/46038
Processed 12160/46038
Processed 12800/46038
Processed 13440/46038
Processed 14080/46038
Processed 14720/46038
Processed 15360/46038
Processed 16000/46038
Processed 16640/46038
Processed 17280/46038
Processed 17920/46038
Processed 18560/46038
Processed 19200/46038
Processed 19840/46038
Processed 20480/46038
Processed 21120/46038
Processed 21760/46038
Processed 22400/46038
Processed 23040/46038
Processed 23680/46038
Processed 24320/46038
Processed 24960/46038
Processed 25600/46038
Processed 26240/46038
Processed 26880/46038
Processed 27520/46038
Processed 28160/46038
Processed 28800/46038
Processe

**Brand similarity / filtering function**


In [None]:
def recommend_similar_brand_price_bert(
    product_id=None,
    brand=None,
    top_k=None,
    same_category=True,
    min_price=None,
    max_price=None
):
    """Return the products whose brand embedding is closest to a query product.

    The query is either the first ``data1`` row with the given ``product_id``
    or, when only ``brand`` is given, the first row whose brand matches
    case-insensitively. Candidates are the other rows of ``data1``,
    optionally restricted to the query's category and to a price range, and
    are ranked by cosine similarity between their row of the global
    ``brand_embeddings`` and the query's row.

    All row lookups are positional, so the function does not depend on
    ``data1`` having unique index labels. It relies on row i of
    ``brand_embeddings`` describing row i of ``data1``.

    Args:
        product_id: Product to use as the query; takes precedence over ``brand``.
        brand: Brand name used to pick a representative query product.
        top_k: Maximum number of rows to return; ``None`` returns every candidate.
        same_category: Keep only candidates with the query's ``category_code``.
        min_price: Inclusive lower price bound, or ``None`` for no bound.
        max_price: Inclusive upper price bound, or ``None`` for no bound.

    Returns:
        A DataFrame with columns product_id, brand, category_code and price,
        most similar first. When the query cannot be resolved or no candidate
        survives the filters, a message is printed and ``[]`` is returned.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def _as_mask(condition):
        # Plain boolean array; a missing comparison result counts as "no match".
        return condition.to_numpy(dtype=bool, na_value=False)

    if product_id is not None:
        query_positions = np.flatnonzero(_as_mask(data1['product_id'] == product_id))
        if query_positions.size == 0:
            print("Product not found.")
            return []
        idx = int(query_positions[0])

    elif brand is not None:
        query_positions = np.flatnonzero(
            _as_mask(data1['brand'].str.lower() == brand.lower())
        )
        if query_positions.size == 0:
            print("Brand not found.")
            return []
        # The first matching row stands in for the whole brand.
        idx = int(query_positions[0])
        product_id = data1['product_id'].iloc[idx]

    else:
        print("You must provide either a product_id or a brand.")
        return []

    # Get product info
    product = data1.iloc[idx]
    product_vec = brand_embeddings[idx].numpy().reshape(1, -1)

    # Build one combined row mask instead of copying and re-slicing data1.
    # The query product itself is never a candidate.
    keep = _as_mask(data1['product_id'] != product_id)
    if same_category:
        keep &= _as_mask(data1['category_code'] == product['category_code'])
    if min_price is not None:
        keep &= _as_mask(data1['price'] >= min_price)
    if max_price is not None:
        keep &= _as_mask(data1['price'] <= max_price)

    filtered_positions = np.flatnonzero(keep)
    if filtered_positions.size == 0:
        print("No similar products found after filtering.")
        return []

    # Compute similarity against the surviving rows only.
    filtered_embeddings = tf.gather(brand_embeddings, filtered_positions)
    similarities = cosine_similarity(product_vec, filtered_embeddings.numpy()).flatten()

    # Get top K (descending similarity); top_k=None keeps every candidate.
    top_order = np.argsort(similarities)[::-1][:top_k]
    top_positions = filtered_positions[top_order]

    return data1.iloc[top_positions][['product_id', 'brand', 'category_code', 'price']]





In [None]:
# Example: up to 10 products in the same category as the first "wincars" row,
# priced at most 13, ranked by similarity to that row.
recommend_similar_brand_price_bert( brand = "wincars", top_k=10, max_price= 13) 

## 7. Conclusion
* **Performance:** BERT successfully captures semantic relationships between brands and categories (e.g., grouping high-end electronics).
* **Utility:** The User-Based model allows for personalization, while the Brand-Based model supports exploration and "similar item" widgets.