## Content-Based Recommenders

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/Colab_Notebooks/Year_4/Recommendation_Engines/yelp_restaurant_pickles.zip" -d /content/

Archive:  /content/drive/MyDrive/Colab_Notebooks/Year_4/Recommendation_Engines/yelp_restaurant_pickles.zip
 extracting: /content/reviews_sampled.pkl  
 extracting: /content/user_sampled.pkl  
 extracting: /content/business_sampled.pkl  
 extracting: /content/checkin_sampled.pkl  
 extracting: /content/tip_sampled.pkl  


In [3]:
import pandas as pd
reviews_sampled = pd.read_pickle("reviews_sampled.pkl")

In [4]:
user_sampled = pd.read_pickle("user_sampled.pkl")

In [5]:
business_sampled = pd.read_pickle("business_sampled.pkl")

In [6]:
checkin_sampled = pd.read_pickle("checkin_sampled.pkl")

In [7]:
tip_sampled = pd.read_pickle("tip_sampled.pkl")

### BERT

In [8]:
reviews_df = pd.read_pickle("reviews_sampled.pkl")
business_df = pd.read_pickle("business_sampled.pkl")

# Get unique business IDs from reviews
business_ids = reviews_df['business_id'].unique().tolist()
print(f"Loaded {len(business_ids)} businesses")

Loaded 14030 businesses


In [9]:
aggregated_reviews = reviews_df.groupby('business_id')['text'].apply(
    lambda x: " ".join(x)[:5000]  # Truncate to avoid memory issues
)
print("Reviews aggregated for", len(aggregated_reviews), "businesses")

Reviews aggregated for 14030 businesses


In [10]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np

# lighter model used here
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded - ready for encoding!")

def batch_encode(texts, batch_size=64):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        embeddings.append(model.encode(batch, show_progress_bar=True))
        print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)} batches")
    return np.vstack(embeddings)

review_texts = aggregated_reviews.tolist()
embeddings = batch_encode(review_texts)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m479.4/664.8 MB[0m [31m123.4 MB/s[0m eta [36m0:00:02[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded - ready for encoding!


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 64/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 128/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 192/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 256/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 320/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 384/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 448/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 512/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 576/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 640/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 704/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 768/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 832/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 896/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 960/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1024/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1088/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1152/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1216/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1280/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1344/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1408/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1472/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1536/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1600/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1664/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1728/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1792/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1856/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1920/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 1984/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2048/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2112/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2176/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2240/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2304/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2368/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2432/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2496/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2560/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2624/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2688/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2752/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2816/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2880/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 2944/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3008/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3072/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3136/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3200/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3264/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3328/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3392/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3456/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3520/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3584/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3648/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3712/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3776/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3840/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3904/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 3968/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4032/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4096/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4160/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4224/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4288/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4352/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4416/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4480/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4544/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4608/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4672/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4736/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4800/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4864/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4928/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 4992/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5056/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5120/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5184/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5248/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5312/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5376/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5440/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5504/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5568/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5632/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5696/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5760/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5824/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5888/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 5952/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6016/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6080/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6144/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6208/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6272/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6336/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6400/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6464/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6528/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6592/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6656/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6720/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6784/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6848/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6912/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 6976/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7040/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7104/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7168/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7232/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7296/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7360/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7424/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7488/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7552/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7616/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7680/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7744/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7808/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7872/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 7936/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8000/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8064/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8128/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8192/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8256/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8320/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8384/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8448/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8512/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8576/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8640/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8704/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8768/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8832/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8896/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 8960/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9024/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9088/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9152/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9216/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9280/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9344/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9408/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9472/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9536/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9600/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9664/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9728/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9792/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9856/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9920/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 9984/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10048/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10112/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10176/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10240/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10304/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10368/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10432/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10496/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10560/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10624/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10688/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10752/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10816/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10880/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 10944/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11008/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11072/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11136/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11200/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11264/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11328/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11392/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11456/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11520/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11584/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11648/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11712/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11776/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11840/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11904/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 11968/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12032/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12096/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12160/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12224/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12288/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12352/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12416/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12480/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12544/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12608/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12672/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12736/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12800/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12864/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12928/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 12992/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13056/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13120/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13184/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13248/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13312/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13376/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13440/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13504/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13568/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13632/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13696/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13760/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13824/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13888/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 13952/14030 batches


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 14016/14030 batches


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 14030/14030 batches


In [11]:
embeddings_df = pd.DataFrame(embeddings, index=aggregated_reviews.index)
print("Embeddings shape:", embeddings_df.shape)

Embeddings shape: (14030, 384)


### Similarities

#### **Similarity Matrix**

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
bert_sim_matrix = cosine_similarity(embeddings_df)
bert_sim_df = pd.DataFrame(
    bert_sim_matrix,
    index=embeddings_df.index,
    columns=embeddings_df.index
)
print("BERT similarity matrix computed!")

BERT similarity matrix computed!


#### **Coverage**

In [14]:
def calculate_bert_coverage(threshold=0.5):
    coverage = (bert_sim_df.max(axis=1) > threshold).mean() * 100
    print(f"BERT Coverage (@{threshold}): {coverage:.1f}%")
    return coverage

# Calculate at different thresholds
coverage_results = {
    '0.3': calculate_bert_coverage(0.3),
    '0.5': calculate_bert_coverage(0.5),
    '0.7': calculate_bert_coverage(0.7)
}

BERT Coverage (@0.3): 100.0%
BERT Coverage (@0.5): 100.0%
BERT Coverage (@0.7): 100.0%


#### **Diversity**

In [15]:
def calculate_bert_diversity(top_n=5):
    """Measure how dissimilar top recommendations are to each other"""
    top_recs = bert_sim_df.apply(lambda x: x.nlargest(top_n+1).index[1:], axis=1)
    diversities = []
    for biz in top_recs.index:
        rec_ids = top_recs[biz]
        sub_matrix = bert_sim_df.loc[rec_ids, rec_ids]
        diversities.append(1 - sub_matrix.values.mean()) # 1 - average similarity
    return np.mean(diversities)

print(f"\nBERT Diversity Score: {calculate_bert_diversity():.3f} (1 = most diverse)")



BERT Diversity Score: 0.254 (1 = most diverse)


#### **Recommendation Function**

In [16]:
def get_bert_recommendations(business_id, top_n=5, business_df=business_df):
    """Get top_n recommendations for a business using BERT embeddings"""
    if business_id not in bert_sim_df.index:
        return pd.DataFrame()  # Return empty if business not found

    sim_scores = bert_sim_df[business_id].sort_values(ascending=False)[1:top_n+1]
    recommendations = business_df.loc[sim_scores.index, ['name', 'categories', 'city', 'stars']]
    recommendations['similarity_score'] = sim_scores.values
    return recommendations

In [17]:
example_biz = business_df.index[0]  # Replace with any business ID
print(f"\nExample recommendations for business: {business_df.loc[example_biz, 'name']}")
display(get_bert_recommendations(example_biz))


Example recommendations for business: St Honore Pastries
