In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the cleaned restaurant table from CSV (path is relative to the notebook).
# Further down, this frame's `business_id` column is used to build the row masks
# that select restaurant names for printing.
# NOTE(review): the similarity computed later in this notebook is built from
# restaurant features (location, stars, category columns), i.e. it is
# content-based rather than collaborative filtering.
df_restaurants = pd.read_csv('../data/cleaned_restaurants.csv')

In [None]:
import pandas as pd
import numpy as np
# Read the cleaned restaurant table (path is relative to the notebook).
df = pd.read_csv('../data/cleaned_restaurants.csv')

# Column layout this cell relies on (assumed, not validated here):
#   - `business_id` identifies the restaurant,
#   - `latitude`, `longitude` and `stars` are the numerical features,
#   - every other column is a 0/1 indicator for a restaurant category.

# Identifier column, kept aside for later lookups.
business_ids = df['business_id']

# Numerical feature block.
num_features = df.loc[:, ['latitude', 'longitude', 'stars']]

# Everything that is neither the identifier nor numerical is treated as a
# category feature.
cat_features = df.drop(columns=['business_id', 'latitude', 'longitude', 'stars'])

# Standardize the numerical block so its columns share a comparable scale.
scaler = StandardScaler()
num_features_scaled = scaler.fit_transform(num_features)

# Relative influence of each feature block; raise `weight_num` to make
# location/stars count more, or `weight_cat` to emphasize categories.
weight_num = 1.0  # overall weight for latitude, longitude, stars
weight_cat = 1.0  # overall weight for category info

# Place the weighted blocks side by side: one row per restaurant, numerical
# columns first, then the (unscaled) category indicators.
combined_features = np.concatenate(
    (weight_num * num_features_scaled, weight_cat * cat_features.to_numpy()),
    axis=1,
)

# Pairwise cosine similarity between all restaurant feature rows.
sim_matrix = cosine_similarity(combined_features)

# Define a recommendation function.
def recommend_restaurants(liked_ids, disliked_ids, df, sim_matrix, top_n=5):
    """
    Recommend restaurants based on both liked and disliked restaurants.

    Each candidate is scored as its mean similarity to the liked restaurants
    minus its mean similarity to the disliked restaurants. Restaurants the
    user has already rated (liked or disliked) are never returned.

    Parameters
    ----------
    liked_ids : list of business_id strings that the user likes
        (``None`` is treated as an empty list).
    disliked_ids : list of business_id strings that the user dislikes
        (``None`` is treated as an empty list).
    df : DataFrame with a ``business_id`` column whose row order matches the
        rows/columns of ``sim_matrix``. Its index may be anything; rows are
        matched by position.
    sim_matrix : precomputed square similarity matrix between restaurants.
    top_n : maximum number of recommendations to return.

    Returns
    -------
    list of business_id values, best match first. Shorter than ``top_n`` when
    fewer unrated restaurants are available.

    Raises
    ------
    ValueError
        If ``df`` and ``sim_matrix`` do not have the same number of rows.
    """
    n_restaurants = sim_matrix.shape[0]
    if len(df) != n_restaurants:
        raise ValueError(
            f"df has {len(df)} rows but sim_matrix has {n_restaurants}; "
            "they must describe the same restaurants in the same order."
        )

    business_ids = df['business_id']
    liked_ids = [] if liked_ids is None else liked_ids
    disliked_ids = [] if disliked_ids is None else disliked_ids

    # Row *positions* (not index labels): sim_matrix is positional, so using
    # df.index here would break for any frame without a default 0..n-1 index.
    liked_pos = np.flatnonzero(business_ids.isin(liked_ids).to_numpy())
    disliked_pos = np.flatnonzero(business_ids.isin(disliked_ids).to_numpy())

    # Mean similarity of every restaurant to the liked set (zeros if empty).
    if liked_pos.size:
        liked_sim = sim_matrix[liked_pos].mean(axis=0)
    else:
        liked_sim = np.zeros(n_restaurants)

    # Mean similarity of every restaurant to the disliked set (zeros if empty).
    if disliked_pos.size:
        disliked_sim = sim_matrix[disliked_pos].mean(axis=0)
    else:
        disliked_sim = np.zeros(n_restaurants)

    # Net score: reward closeness to likes, penalize closeness to dislikes.
    net_scores = liked_sim - disliked_sim

    # Push already rated restaurants to the bottom of the ranking...
    rated_pos = np.concatenate([liked_pos, disliked_pos])
    net_scores[rated_pos] = -np.inf
    is_rated = np.zeros(n_restaurants, dtype=bool)
    is_rated[rated_pos] = True

    # ...and drop them outright, so they cannot leak into the result when
    # top_n exceeds the number of unrated restaurants.
    ranked_pos = np.argsort(net_scores)[::-1]
    ranked_pos = ranked_pos[~is_rated[ranked_pos]]

    return business_ids.iloc[ranked_pos[:top_n]].tolist()

Recommended Restaurants: ['s30pUay0opzvriFweACWEA', 'BZUlza2S5sfavO6UsS5mNQ', 'woXVP3vViYzJ-I7jwWS5Og', 'gWE14ed0pcUlQo2IETmFmQ', '9_g3DU-5BnpTH1DC1kIe-w']


In [12]:
# Example usage:
# The user likes the restaurant with business_id 'MTSW4McQd7CbVtyjqoe9mw'
# and dislikes the one with business_id 'bBDDEgkFA1Otx9Lfe7BZUQ'.
liked_ids = ['MTSW4McQd7CbVtyjqoe9mw']
disliked_ids = ['bBDDEgkFA1Otx9Lfe7BZUQ']
recommendations = recommend_restaurants(liked_ids, disliked_ids, df, sim_matrix, top_n=5)

# Full restaurant records (one JSON object per line); only `name` is read below.
restaurant_full_data = pd.read_json('../data/cleaned_restaurants.json', lines=True)
print("Recommended Restaurants:", recommendations)

# Business ids aligned with the rows of `restaurant_full_data`.
# Masks must be built on the same frame they are applied to: a boolean mask
# taken from another DataFrame only works if both frames happen to share the
# same index and row order.
if 'business_id' in restaurant_full_data.columns:
    full_data_ids = restaurant_full_data['business_id']
elif len(restaurant_full_data) == len(df):
    # Fallback: assumes the JSON rows are in the same order as the CSV rows
    # (the assumption the positional mask relied on) - TODO confirm.
    full_data_ids = pd.Series(df['business_id'].to_numpy(), index=restaurant_full_data.index)
else:
    raise ValueError(
        "cleaned_restaurants.json has no 'business_id' column and its row count "
        "differs from the CSV, so names cannot be matched to business ids."
    )
restaurant_names = restaurant_full_data['name']

# Print liked restaurant names:
print("Liked Restaurant Names:")
print(restaurant_names[full_data_ids.isin(liked_ids)])
# Print disliked restaurant names:
print("Disliked Restaurant Names:")
print(restaurant_names[full_data_ids.isin(disliked_ids)])
# Print recommendation names, best match first (same order as `recommendations`
# rather than the order the rows happen to have in the data file).
print("Recommended Restaurant Names:")
rank_by_id = {business_id: rank for rank, business_id in enumerate(recommendations)}
recommended_ids = full_data_ids[full_data_ids.isin(recommendations)]
ranked_rows = recommended_ids.map(rank_by_id).sort_values(kind='stable').index
print(restaurant_names.loc[ranked_rows])

Recommended Restaurants: ['s30pUay0opzvriFweACWEA', 'BZUlza2S5sfavO6UsS5mNQ', 'woXVP3vViYzJ-I7jwWS5Og', 'gWE14ed0pcUlQo2IETmFmQ', '9_g3DU-5BnpTH1DC1kIe-w']
Liked Restaurant Names:
0    St Honore Pastries
Name: name, dtype: object
Disliked Restaurant Names:
3    Sonic Drive-In
Name: name, dtype: object
Recommended Restaurant Names:
921               ICED by Betsy
12419             Simply Sherry
20468    MOTW Coffee & Pastries
25001             Pie Lady Cafe
26073              Essen Bakery
Name: name, dtype: object
