## Final Project: Restaurant Recommender System - Yelp Restaurant Recommendation
##### Natalie Kim

#### Libraries

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

### Load Data

##### Businesses data

In [2]:
data_file = open("/Users/nataliekim/Downloads/yelp_dataset/yelp_academic_dataset_business.json")
data = []
for line in data_file:   
    data.append(json.loads(line))

business_df = pd.DataFrame(data)
data_file.close()

##### Reviews data

In [3]:
data_file = open("/Users/nataliekim/Downloads/yelp_dataset/yelp_academic_dataset_review.json")
data = []
for line in data_file:   
    data.append(json.loads(line))

reviews_df = pd.DataFrame(data)
data_file.close()

### Data Preparation

#### Subset for restaurants in Philadelphia

In [4]:
# Businesses in Philadelphia
philly = business_df[(business_df['city'] == 'Philadelphia')]

In [5]:
# Philly Restaurants
philly_rest = philly[philly['categories'].str.contains('Restaurant', na=False)]

In [6]:
restaurants = philly_rest[['business_id','name','address', 'categories', 'attributes','stars']]

In [None]:
restaurants.info()

In [None]:
restaurants.head(5)

#### Combine all data

In [13]:
reviews_df2 = reviews_df[['user_id','business_id']]

In [8]:
# add name of business to reviews df
all_data = pd.merge(reviews_df2, restaurants, on='business_id')

In [9]:
# Round stars column to integers rather than floats
star_round = {1.0:1,1.5:2, 2.0:2, 2.5:3, 3.0:3, 3.5:4, 4.0:4, 4.5:5, 5.0:5}

all_data['stars'] = all_data['stars'].map(star_round)
restaurants['stars'] = restaurants['stars'].map(star_round)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants['stars'] = restaurants['stars'].map(star_round)


#### Train/Test Split of Data

In [14]:
train_data, test_data = train_test_split(all_data, test_size=0.1, random_state=42)

#### User-Restaurant Matrix (using Reviews table)

In [15]:
# Create user-item matrix for training data
train_user_rest = train_data.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)

# Create user-item matrix for test data
test_user_rest = test_data.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)

#### Dummy Matrices - Categories & Attributes

In [17]:
# Expand categories column
categ_dummies = pd.Series(restaurants['categories']).str.get_dummies(',')
#categ_dummies

In [None]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.pop(key)

# convert string to dictionary
import ast
def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}") 

In [None]:
# get dummies from nested attributes
restaurants['BusinessParking'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'BusinessParking')), axis=1)
restaurants['Ambience'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Ambience')), axis=1)
restaurants['GoodForMeal'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'GoodForMeal')), axis=1)
restaurants['Dietary'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Dietary')), axis=1)
restaurants['Music'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Music')), axis=1)

In [None]:
restaurants

In [None]:
# attribute dummy table
attr_dummies = pd.concat([restaurants['attributes'].apply(pd.Series), 
                          restaurants['BusinessParking'].apply(pd.Series),
                          restaurants['Ambience'].apply(pd.Series), 
                          restaurants['GoodForMeal'].apply(pd.Series),
                          restaurants['Dietary'].apply(pd.Series)], axis=1)
attr_dummies = pd.get_dummies(attr_dummies)
#attr_dummies

In [None]:
# Finalize table
df_final = pd.concat([attr_dummies,
                      categ_dummies, 
                      restaurants[['name','stars']]], axis=1)
df_final.drop('Restaurants',inplace=True,axis=1)
#df_final.head()

#### Similarity Matrix

In [None]:
df_finalv2 = df_final
df_finalv2.head()

In [None]:
df_finalv2.drop('stars',inplace=True,axis=1)
df_finalv2 = pd.DataFrame(df_finalv2)
df_finalv2.set_index('name', inplace=True)

In [None]:
# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(df_finalv2)

# Convert the similarity matrix to a DataFrame for easier manipulation
similarity_df = pd.DataFrame(similarity_matrix, index=df_finalv2.index, columns=df_finalv2.index)

### Collaborative-Filtering Model

In [None]:
# item-item similarity matrix
item_similarity_matrix = cosine_similarity(train_user_rest.T)
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=train_user_rest.columns, columns=train_user_rest.columns)

In [None]:
# Step 3: Predict ratings
def predict_ratings(user_rest_matrix, similarity_df):
    predicted_ratings = np.dot(user_rest_matrix, similarity_df)
    return pd.DataFrame(predicted_ratings, index=user_rest_matrix.index, columns=similarity_df.columns)

train_predictions = predict_ratings(train_user_rest, item_similarity_df)

In [None]:
# Step 4: Evaluate predictions
def evaluate_predictions(test_matrix, predicted_matrix, top_n=5):
    hits = 0
    total = 0
    
    for user in test_matrix.index:
        true_items = test_matrix.loc[user][test_matrix.loc[user] > 0].index
        predicted_items = predicted_matrix.loc[user].sort_values(ascending=False).head(top_n).index
        hits += len(set(predicted_items) & set(true_items))
        total += len(true_items)
    
    precision = hits / (total * top_n)
    recall = hits / total
    
    return precision, recall

In [None]:
# Evaluate on test set
precision, recall = evaluate_predictions(test_user_rest, train_predictions, top_n=5)
print(f"Precision@5: {precision}")
print(f"Recall@5: {recall}")

### Content Based Filtering

In [None]:
# function that returns recommendations based on cosine similarity
def content_based_recommendations(restaurant, similarity_df):
    # Dot product of user-item matrix and similarity matrix
    predicted_ratings = np.dot(user_rest_matrix, similarity_df)
    return pd.DataFrame(predicted_ratings, index=user_rest_matrix.index, columns=similarity_df.columns)

In [None]:
train_predictions = content_based_recommendations(train_user_rest, similarity_df)

#### Evaluating model

In [None]:
def evaluate_predictions(test_matrix, predicted_matrix, top_n=5):
    hits = 0
    total = 0
    
    for user in test_matrix.index:
        # Get the items the user has interacted with in the test set
        true_items = test_matrix.loc[user][test_matrix.loc[user] > 0].index
        
        # Get the top N predicted items for the user
        predicted_items = predicted_matrix.loc[user].sort_values(ascending=False).head(top_n).index
        
        # Count hits
        hits += len(set(predicted_items) & set(true_items))
        total += len(true_items)
    
    precision = hits / (total * top_n)
    recall = hits / total
    
    return precision, recall

In [None]:
# Evaluate on test set
precision, recall = evaluate_predictions(test_user_rest, train_predictions, top_n=5)
print(f"Precision@5: {precision}")
print(f"Recall@5: {recall}")