In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_recommenders as tfrs
import sklearn 
import os
import warnings
import re
from unidecode import unidecode
from typing import Dict, Text, Any

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

warnings.filterwarnings('ignore')
yelp_reviews = pd.read_csv('cleaned_yelp_reviews.csv', encoding='UTF-8')
yelp_restaurants = pd.read_csv('cleaned_yelp_restaurants.csv', encoding='UTF-8')

user_favorites = pd.read_csv('user_favorite_restaurants.csv', encoding='UTF-8')

In [2]:
def normalize_restaurant_names(df, column_name='restaurant_name'):
    def normalize(name):
        name = re.sub(r"\(.*?\)", "", name)  #remove text in parentheses
        name = name.lower().strip()  #convert to lower case and strip whitespaces
        name = unidecode(name)  #remove accents and special characters
        name = name.replace(' ', '_')  #replace spaces with underscores
        return name
    df[column_name] = df[column_name].apply(normalize)

#apply normalization to restaurant names
normalize_restaurant_names(user_favorites, 'restaurant_name')
normalize_restaurant_names(yelp_restaurants, 'restaurant_name')

In [3]:
# Create a mapping dictionary from restaurant names -> IDs from Yelp dataset
name_to_id = yelp_restaurants.set_index('restaurant_name')['restaurant_id'].to_dict()
# Add a new column for restaurant_id in users's favorites using the mapping
user_favorites['restaurant_id'] = user_favorites['restaurant_name'].map(name_to_id)

In [4]:
#redo merge using a left join on restaurant_name
merged_df = pd.merge(yelp_restaurants, user_favorites, on='restaurant_name', how='left', suffixes=('_yelp', '_user'))

# Check for any unmatched restaurants and decide on a strategy for them
# For example, we could fill NaNs with a default rating or exclude these from the dataset
# Assume that a missing user rating means the user has not rated the restaurant
merged_df['user_rating'] = merged_df['user_rating'].fillna(0)
#rename user_email to user_id
merged_df.rename(columns={'user_email': 'user_id'}, inplace=True)

#clean the user_price_rating and price columns
merged_df['user_price_rating'] = merged_df['user_price_rating'].str.split('(').str[0]
merged_df['user_price_rating'] = merged_df['user_price_rating'].str.len()
merged_df['user_price_rating'] = merged_df['user_price_rating'].fillna(0)
merged_df['user_price_rating'] = merged_df['user_price_rating'].astype(float)

# merged_df['user_price_rating'].value_counts()
merged_df['price'] = merged_df['price'].str.len()
merged_df['price'] = merged_df['price'].fillna(0)
merged_df['price'] = merged_df['price'].astype(float)

#clean the neighborhood column
merged_df['neighborhood'] = merged_df['neighborhood'].fillna('Unknown')
merged_df['neighborhood'] = merged_df['neighborhood'].astype(str)


#clean the user_occasion column
# merged_df['user_occasion'] = merged_df['user_occasion'].astype(str)
# merged_df['user_occasion'] = merged_df['user_occasion'].fillna('Unknown')


In [5]:
df = merged_df[['restaurant_id_yelp','user_id', 'cuisine_0','user_rating', 'rating', 'price', 'neighborhood']]
#rename cuisine_0 to cuisine and rating to yelp_rating
df.rename(columns={'cuisine_0': 'cuisine', 'rating': 'yelp_rating', 'restaurant_id_yelp': 'restaurant_id'}, inplace=True)
#fill missing values in user_id with N/A
df['user_id'] = df['user_id'].fillna('N/A')
#make user_id, restaurant_id, and cuisine strings
df.rename(columns={'user_id': 'user_id', 'restaurant_id': 'restaurant_id', 'cuisine': 'cuisine'}, inplace=True)
df

Unnamed: 0,restaurant_id,user_id,cuisine,user_rating,yelp_rating,price,neighborhood
0,lQKnVAB9E4mhVlJc6sxwVQ,,Cuban,0.0,3.5,2.0,Unknown
1,eqTk_mQPCxZl9dnc-kDQgg,,Haitian,0.0,4.0,0.0,Northwest Brooklyn
2,n2Dddm0GzlrIkmg56fV-fg,,Bakeries,0.0,4.5,1.0,Southeast Queens
3,SB4crLLqZy4RdoA3e7yDNA,,Coffee & Tea,0.0,4.0,0.0,Northwest Brooklyn
4,LsPlBRWerGGKC-0cWu46Lw,,Sandwiches,0.0,5.0,0.0,Northwest Brooklyn
...,...,...,...,...,...,...,...
7342,Ew3d7Vt18c1MH4_X6ogxQQ,,Vietnamese,0.0,4.3,2.0,Lower East Side
7343,FFCZixTxhlX1YmXkj48iRw,,Vietnamese,0.0,3.6,0.0,Chelsea and Clinton
7344,HOZsJkXLEWMy03TOMaH0jQ,,Vietnamese,0.0,3.6,2.0,Chelsea and Clinton
7345,feY7PgoRjCp2qReaIitRkQ,,Thai,0.0,4.4,2.0,Northwest Brooklyn


In [6]:
interactions_dict = df.groupby(['user_id',
                                'cuisine',
                                'restaurant_id',
                                'price',
                                'neighborhood'])['user_rating'].sum().reset_index() 
#tranform the table into a dictionary
interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

items_dict = df[['restaurant_id']].drop_duplicates()
items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)

In [7]:
interactions = interactions.map(lambda x: {
    'user_id' : x['user_id'], 
    'restaurant_id' : x['restaurant_id'], 
    'cuisine' : x['cuisine'],
    'neighborhood' : x['neighborhood'],
    'price' : float(x['price']), #price rating from yelp
    'user_rating' : float(x['user_rating'])
})

items = items.map(lambda x: x['restaurant_id'])
cuisine = interactions.map(lambda x: x['cuisine'])
neighborhood = interactions.map(lambda x: x['neighborhood'])

unique_restaurant_ids = np.unique([x['restaurant_id'].numpy() for x in interactions])
unique_user_ids = np.unique([x['user_id'].numpy() for x in interactions])
unique_neighborhoods = np.unique([x['neighborhood'].numpy() for x in interactions])
unique_cuisines = np.unique([x['cuisine'].numpy() for x in interactions])

In [8]:
#train/test split(75/25)
tf.random.set_seed(55)

buffer_size = len(interactions)  
shuffled = interactions.shuffle(buffer_size, seed=55, reshuffle_each_iteration=False)

total_size = len(interactions)  
train_size = int(total_size * 0.75)
test_size = total_size - train_size

train = shuffled.take(train_size)
test = shuffled.skip(train_size) 

# Multi-task Recommender Model

In [9]:
#combine the ranking and retrieval models into a single multi-task model

class YelpRecModel(tfrs.models.Model): 
    def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
        super().__init__()
        
        embedding_dimension = 32
        
        #user and restaurant models(compute user and restaurant embeddings)
        self.restaurant_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_restaurant_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_restaurant_ids) + 1, embedding_dimension)
        ])
        self.user_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])
        #a small sequential model to predict ratings
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(1)
            # tf.keras.layers.Dense(1, activation='sigmoid') #ensure the output is between 0 and 1(so we can scale to 0-5)
        ])
        #define ranking and retrieval tasks
        self.rating_task = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        self.retrieval_task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=items.batch(128).map(self.restaurant_model)
            )
        )
        #loss weights
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight
        
    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        #pick user features and pass them through the user model
        user_embeddings = self.user_model(features['user_id'])
        #pick restaurant features and pass them through the restaurant model
        restaurant_embeddings = self.restaurant_model(features['restaurant_id'])

        return (
            user_embeddings,
            restaurant_embeddings,
            self.rating_model(tf.concat([user_embeddings, restaurant_embeddings], axis=1))  
        )
        
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        
        ratings = features.pop('user_rating')
        print(features)
        user_embeddings, restaurant_embeddings, rating_predictions = self(features)
        
        #confirm we are calculating loss based on user_rating
        # tf.print("Debugging Loss Computation:")
        # tf.print("Labels (User Ratings):", ratings)
        # tf.print("Predictions:", rating_predictions)
        
        #compute the loss for each task
        rating_loss = self.rating_task(
            # labels=features['user_rating'],
            labels = ratings,
            predictions=rating_predictions
        )
        
        retrieval_loss = self.retrieval_task(user_embeddings, restaurant_embeddings)
        
        #combine the loss using the loss weights
        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

In [10]:
model = YelpRecModel(rating_weight=0.3, retrieval_weight=0.7)
model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(0.1))

cached_train = train.shuffle(buffer_size).batch(8196).cache()
cached_test = test.batch(4098).cache()

model.fit(cached_train, epochs=100)
metrics = model.evaluate(cached_test, return_dict=True)
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Epoch 1/100
{'user_id': <tf.Tensor 'IteratorGetNext:4' shape=(None,) dtype=string>, 'restaurant_id': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'cuisine': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'neighborhood': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'price': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=float32>}
{'user_id': <tf.Tensor 'IteratorGetNext:4' shape=(None,) dtype=string>, 'restaurant_id': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'cuisine': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=string>, 'neighborhood': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'price': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=float32>}
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoc

In [11]:
restaurant_info_df = df[['restaurant_id', 'cuisine', 'price', 'neighborhood']].drop_duplicates()
restaurant_info = restaurant_info_df.set_index('restaurant_id').T.to_dict('dict')

In [12]:
#get the value_counts of cuisines so we can use it to filter the recommendations
cuisine_counts = df['cuisine'].value_counts()
cuisine_counts.head(30)

cuisine
Italian               368
Chinese               260
Japanese              247
New American          247
Mexican               243
Sushi Bars            223
Indian                167
Pizza                 165
Mediterranean         160
French                159
Thai                  150
Bakeries              143
Seafood               137
Korean                133
Cocktail Bars         129
Bars                  123
Breakfast & Brunch    117
Latin American        112
Coffee & Tea          107
Ramen                  98
Middle Eastern         98
Vietnamese             93
American               86
Pubs                   80
Greek                  75
Steakhouses            75
Spanish                74
Burgers                74
Cafes                  73
Dim Sum                71
Name: count, dtype: int64

In [14]:
def recommend_restaurants_with_cuisine_tf(model, user_id, cuisine=None, num_recommendations=20):
    # Convert the user ID to its embedding using the user model
    user_id_lookup = tf.constant([user_id])
    user_embedding = model.user_model(user_id_lookup)

    restaurant_details = []    
    # Iterate through interactions to filter by cuisine
    for features in interactions:
        if cuisine is None or features['cuisine'].numpy() == cuisine.encode():
            restaurant_details.append((features['restaurant_id'].numpy(), features['cuisine'].numpy()))

    # If filtering reduced the set significantly, adjust the number of recommendations
    num_recommendations = min(num_recommendations, len(restaurant_details))

    # Convert to tensors
    filtered_restaurant_ids = [details[0] for details in restaurant_details]
    restaurant_ids_tensor = tf.constant(filtered_restaurant_ids)

    # Generate embeddings for filtered restaurants
    restaurant_embeddings = model.restaurant_model(restaurant_ids_tensor)

    # Calculate the similarity
    user_embedding_broadcasted = tf.broadcast_to(user_embedding, restaurant_embeddings.shape)
    similarities = tf.reduce_sum(user_embedding_broadcasted * restaurant_embeddings, axis=1)

    # Rank the restaurants by similarity
    top_indices = tf.argsort(similarities, direction='DESCENDING')[:num_recommendations]
    top_restaurant_ids = tf.gather(filtered_restaurant_ids, top_indices).numpy()

    # Predict ratings for the top recommendations
    user_ids = tf.convert_to_tensor([user_id] * num_recommendations)
    restaurant_ids = tf.convert_to_tensor(top_restaurant_ids)
    
    _, _, ratings = model({

        'user_id': user_ids,
        'restaurant_id': restaurant_ids
    })
    
    return top_restaurant_ids, ratings.numpy().flatten()

user_id = 'test_user@gmail.com'
desired_cuisine = 'Italian'  # set to None if needed

recommendations, predicted_ratings = recommend_restaurants_with_cuisine_tf(model, user_id, desired_cuisine)
# Convert byte IDs to strings
recommendations = [rid.decode('utf-8') if isinstance(rid, bytes) else rid for rid in recommendations]
results_df = pd.DataFrame({
    'restaurant_id': recommendations,
    'predicted_rating': predicted_ratings
})
yelp_restaurants = pd.read_csv('cleaned_yelp_restaurants.csv', encoding='UTF-8')
# Merge with the main yelp_restaurants DataFrame to get all details
full_details_df = pd.merge(results_df, yelp_restaurants, on='restaurant_id', how='left')
full_details_df = full_details_df.sort_values(by='predicted_rating', ascending=False)
full_details_df

Unnamed: 0,restaurant_id,predicted_rating,restaurant_name,image_url,is_closed,url,review_count,rating,categories,transactions,...,queried_term,queried_location,sort_by,attributes,cuisine_0,cuisine_1,cuisine_2,price_num,review_count_bins,neighborhood
0,5k_jwjhdFMPUNWhouGNzuA,4.126443,Misi,https://s3-media2.fl.yelpcdn.com/bphoto/bShBCX...,False,https://www.yelp.com/biz/misi-brooklyn?adjust_...,752,4.1,Italian,delivery,...,Calabrian,"New York, NY",best_match,,Italian,,,3.0,501-1000,
1,h37t9rA06Sr4EetJjKrfzw,4.12255,Don Angie,https://s3-media2.fl.yelpcdn.com/bphoto/onJX6_...,False,https://www.yelp.com/biz/don-angie-new-york?ad...,869,4.5,"Italian, New American",delivery,...,Italian,"New York, NY",best_match,,Italian,New American,,3.0,501-1000,Greenwich Village and Soho
3,jnHbdsqlTKlPcmJ8BCP9-g,4.119742,Via Carota,https://s3-media3.fl.yelpcdn.com/bphoto/Dr3KkW...,False,https://www.yelp.com/biz/via-carota-new-york-c...,1002,3.9,Italian,delivery,...,Meatballs,"New York, NY",best_match,,Italian,,,3.0,1001-5000,Greenwich Village and Soho
2,YT5Ywu9y190B4IGIlBetzA,4.119274,Morandi,https://s3-media3.fl.yelpcdn.com/bphoto/WmRzvC...,False,https://www.yelp.com/biz/morandi-new-york?adju...,834,3.6,"Italian, Desserts, Breakfast & Brunch","delivery, pickup",...,Meatballs,"New York, NY",best_match,,Italian,Desserts,Breakfast & Brunch,2.0,501-1000,Greenwich Village and Soho
4,zwOAiVT4pAmpNGXzj-t5MA,4.104455,Lilia,https://s3-media3.fl.yelpcdn.com/bphoto/iWOHtU...,False,https://www.yelp.com/biz/lilia-brooklyn?adjust...,1282,3.9,"Italian, Cocktail Bars",delivery,...,Blowfish,"New York, NY",best_match,,Italian,Cocktail Bars,,4.0,1001-5000,Greenpoint
9,k5fd-RSdG4IuXnnGSQ5oOw,3.081637,San Marzano Pasta Fresca,https://s3-media2.fl.yelpcdn.com/bphoto/1CtNBV...,False,https://www.yelp.com/biz/san-marzano-pasta-fre...,1626,3.8,"Italian, Bars, Breakfast & Brunch","delivery, pickup",...,Italian,"New York, NY",best_match,,Italian,Bars,Breakfast & Brunch,2.0,1001-5000,Lower East Side
10,E1RvMNZ4re4TnKarjx14Zw,3.066178,Cacio E Pepe,https://s3-media1.fl.yelpcdn.com/bphoto/LFV4f6...,False,https://www.yelp.com/biz/cacio-e-pepe-new-york...,743,3.7,"Italian, Bars","delivery, pickup",...,Flatbread,"New York, NY",best_match,,Italian,Bars,,2.0,501-1000,Lower East Side
7,cbhdOSn-nezgnH3lxPJM_g,3.065326,Sotto Voce,https://s3-media3.fl.yelpcdn.com/bphoto/2LCUhE...,False,https://www.yelp.com/biz/sotto-voce-brooklyn?a...,368,3.4,Italian,"delivery, pickup",...,Meatballs,"New York, NY",best_match,,Italian,,,2.0,201-500,Northwest Brooklyn
11,A4_L8PmSH6ZsSiIFLS-_2w,3.060946,Cozzolino Nyc,https://s3-media2.fl.yelpcdn.com/bphoto/2t1vqw...,False,https://www.yelp.com/biz/cozzolino-nyc-new-yor...,6,4.0,Italian,pickup,...,,,,,Italian,,,,0-10,Northwest Queens
6,T-RXDLq_VEv7s1_SooYIYQ,3.059958,Osteria 57,https://s3-media2.fl.yelpcdn.com/bphoto/KD1qrK...,False,https://www.yelp.com/biz/osteria-57-new-york?a...,343,4.3,"Italian, Seafood, Wine Bars","delivery, pickup",...,Sicilian,"New York, NY",best_match,,Italian,Seafood,Wine Bars,3.0,201-500,Chelsea and Clinton
