## Final Project: Restaurant Recommender System - Yelp Restaurant Recommendation
##### Natalie Kim

#### Libraries

In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from math import sqrt

### Load Data

##### Businesses data

In [3]:
data_file = open("/Users/nataliekim/Downloads/yelp_dataset/yelp_academic_dataset_business.json")
data = []
for line in data_file:   
    data.append(json.loads(line))

business_df = pd.DataFrame(data)
data_file.close()

##### Reviews data

In [4]:
data_file = open("/Users/nataliekim/Downloads/yelp_dataset/yelp_academic_dataset_review.json")
data = []
for line in data_file:   
    data.append(json.loads(line))

reviews_df = pd.DataFrame(data)
data_file.close()

### Data Preparation

#### Subset for restaurants in Philadelphia

In [5]:
# Businesses in Philadelphia
philly = business_df[(business_df['city'] == 'Philadelphia')]

In [6]:
# Philly Restaurants
philly_rest = philly[philly['categories'].str.contains('Restaurant', na=False)]

In [7]:
restaurants = philly_rest[['business_id','name','address', 'categories', 'attributes','stars']]

In [8]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5854 entries, 3 to 150336
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   business_id  5854 non-null   object 
 1   name         5854 non-null   object 
 2   address      5854 non-null   object 
 3   categories   5854 non-null   object 
 4   attributes   5801 non-null   object 
 5   stars        5854 non-null   float64
dtypes: float64(1), object(5)
memory usage: 320.1+ KB


In [9]:
restaurants.head(5)

Unnamed: 0,business_id,name,address,categories,attributes,stars
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,"Sushi Bars, Restaurants, Japanese","{'RestaurantsReservations': 'True', 'Restauran...",4.0
19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,"Korean, Restaurants","{'NoiseLevel': 'u'quiet'', 'GoodForMeal': '{'d...",4.5
28,QdN72BWoyFypdGJhhI5r7g,Bar One,767 S 9th St,"Cocktail Bars, Bars, Italian, Nightlife, Resta...","{'Smoking': 'u'no'', 'NoiseLevel': 'u'average'...",4.0
31,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,4105 Main St,"Pizza, Restaurants, Salad, Soup","{'RestaurantsReservations': 'False', 'Caters':...",3.0


#### Combine all data

In [10]:
reviews_df2 = reviews_df[['user_id','business_id']]

In [11]:
# add name of business to reviews df
all_data = pd.merge(reviews_df2, restaurants, on='business_id')

In [12]:
# Round stars column to integers rather than floats
star_round = {1.0:1,1.5:2, 2.0:2, 2.5:3, 3.0:3, 3.5:4, 4.0:4, 4.5:5, 5.0:5}

all_data['stars'] = all_data['stars'].map(star_round)
restaurants['stars'] = restaurants['stars'].map(star_round)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants['stars'] = restaurants['stars'].map(star_round)


In [13]:
all_data.head()

Unnamed: 0,user_id,business_id,name,address,categories,attributes,stars
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,Zaika,2481 Grant Ave,"Halal, Pakistani, Restaurants, Indian","{'Caters': 'True', 'Ambience': '{'romantic': F...",4
1,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,Zaika,2481 Grant Ave,"Halal, Pakistani, Restaurants, Indian","{'Caters': 'True', 'Ambience': '{'romantic': F...",4
2,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,Zaika,2481 Grant Ave,"Halal, Pakistani, Restaurants, Indian","{'Caters': 'True', 'Ambience': '{'romantic': F...",4
3,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,Zaika,2481 Grant Ave,"Halal, Pakistani, Restaurants, Indian","{'Caters': 'True', 'Ambience': '{'romantic': F...",4
4,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,Zaika,2481 Grant Ave,"Halal, Pakistani, Restaurants, Indian","{'Caters': 'True', 'Ambience': '{'romantic': F...",4


#### Train/Test Split of Data

In [14]:
train_data, test_data = train_test_split(all_data, test_size=0.1, random_state=42)

#### User-Restaurant Matrix (using Reviews table)

In [15]:
# Create user-item matrix for training data
train_user_rest = train_data.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)

# Create user-item matrix for test data
test_user_rest = test_data.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)

#### Dummy Matrices - Categories & Attributes

In [16]:
# Expand categories column
categ_dummies = pd.Series(restaurants['categories']).str.get_dummies(',')
#categ_dummies

In [17]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.pop(key)

# convert string to dictionary
import ast
def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}") 

In [18]:
# get dummies from nested attributes
restaurants['BusinessParking'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'BusinessParking')), axis=1)
restaurants['Ambience'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Ambience')), axis=1)
restaurants['GoodForMeal'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'GoodForMeal')), axis=1)
restaurants['Dietary'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Dietary')), axis=1)
restaurants['Music'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Music')), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants['BusinessParking'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'BusinessParking')), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurants['Ambience'] = restaurants.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Ambience')), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [19]:
# attribute dummy table
attr_dummies = pd.concat([restaurants['attributes'].apply(pd.Series), 
                          restaurants['BusinessParking'].apply(pd.Series),
                          restaurants['Ambience'].apply(pd.Series), 
                          restaurants['GoodForMeal'].apply(pd.Series),
                          restaurants['Dietary'].apply(pd.Series)], axis=1)
attr_dummies = pd.get_dummies(attr_dummies)
#attr_dummies

In [20]:
# Finalize table
df_final = pd.concat([attr_dummies,
                      categ_dummies, 
                      restaurants[['name','stars']]], axis=1)
df_final.drop('Restaurants',inplace=True,axis=1)
df_final.head()

Unnamed: 0,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,BikeParking_False,...,Vegan,Vegetarian,Venues & Event Spaces,Vietnamese,Waffles,Wine Bars,Wineries,Wraps,name,stars
3,True,False,False,True,False,False,True,False,False,False,...,0,0,0,0,0,0,0,0,St Honore Pastries,4
15,False,False,True,False,False,True,False,False,True,False,...,0,0,0,0,0,0,0,0,Tuna Bar,4
19,False,True,False,False,True,False,False,False,True,False,...,0,0,0,0,0,0,0,0,BAP,5
28,False,False,True,True,False,False,False,False,True,False,...,0,0,0,0,0,0,0,0,Bar One,4
31,False,False,True,False,False,True,False,False,True,False,...,0,0,0,0,0,0,0,0,DeSandro on Main,3


In [21]:
X = df_final.iloc[:,:-2]
y = df_final['stars']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#### Similarity Matrix

In [23]:
df_finalv2 = df_final
#df_finalv2.head()

In [24]:
df_finalv2.drop('stars',inplace=True,axis=1)
df_finalv2 = pd.DataFrame(df_finalv2)
df_finalv2.set_index('name', inplace=True)

In [25]:
# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(df_finalv2)

# Convert the similarity matrix to a DataFrame for easier manipulation
similarity_df = pd.DataFrame(similarity_matrix, index=df_finalv2.index, columns=df_finalv2.index)

### Content Based Filtering

In [26]:
# Use Cross Validation to find best parameters
param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25, 30]
}

In [27]:
# Instantiate and fit the model
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

In [28]:
# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Get the best estimator
best_knn = grid_search.best_estimator_

# Evaluate the model on the training set
train_accuracy = best_knn.score(X_train, y_train)
print(f"Training accuracy with best parameters: {train_accuracy}")

# Evaluate the model on the test set
test_accuracy = best_knn.score(X_test, y_test)
print(f"Test accuracy with best parameters: {test_accuracy}")

Best parameters: {'n_neighbors': 30}
Training accuracy with best parameters: 0.5396735003796507
Test accuracy with best parameters: 0.515358361774744


### Collaborative-Filtering Model

User-item matrix: train_user_rest

In [29]:
train_user_rest.head()

name,$5 Fresh Burger Stop,&pizza - UPenn,&pizza - Walnut,1 Stop Pizza,10 Arts Bistro,1100 Social,1201 Bar,1225Raw Sushi and Sake Lounge,12th Street Cantina,13 Restaurant,...,iPho,iPho Vietnamese Restaurant,iSushi,la bamba,moonbowls,nunu,penrose cheesesteak,revive 21,sweetgreen,¡Juice!
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---r61b7EpVPkb4UVme5tA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--0kuuLmuYBe3Rmu0Iycww,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--13zE3NaRvLSrmfTVnFJA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--2tyArRmSoyKx5r-FVG0A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--2vR0DIsmQ6WfcSzKWigw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Align columns of train and test user-item matrices
train_columns = train_user_rest.columns
test_columns = test_user_rest.columns

# Identify missing columns
missing_test_columns = list(set(train_columns) - set(test_columns))
missing_train_columns = list(set(test_columns) - set(train_columns))

# Create DataFrames with missing columns filled with zeros
missing_test_df = pd.DataFrame(0, index=test_user_rest.index, columns=missing_test_columns)
missing_train_df = pd.DataFrame(0, index=train_user_rest.index, columns=missing_train_columns)

# Concatenate the original and missing columns DataFrames
test_user_rest = pd.concat([test_user_rest, missing_test_df], axis=1)
train_user_rest = pd.concat([train_user_rest, missing_train_df], axis=1)

In [31]:
# Reorder columns to match the training set
test_user_rest = test_user_rest[train_columns]

In [32]:
# Normalize ratings
user_ratings_mean = train_user_rest.mean(axis=1)
normalized_matrix = train_user_rest.subtract(user_ratings_mean, axis=0)

In [33]:
# Dimensionality reduction using SVD
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=12, random_state=42)
latent_matrix = svd.fit_transform(normalized_matrix)

In [34]:
from sklearn.linear_model import Ridge

# Fit Ridge regression model on the latent factors
ridge = Ridge(alpha=1.0)
ridge.fit(latent_matrix, normalized_matrix)

In [35]:
# Predict ratings
predicted_ratings = ridge.predict(latent_matrix)

#### Evaluation

In [36]:
# Normalize test ratings
test_user_ratings_mean = test_user_rest.mean(axis=1)
normalized_test_matrix = test_user_rest.subtract(test_user_ratings_mean, axis=0)

In [37]:
# Transform the test data using the fitted SVD model
test_latent_matrix = svd.transform(normalized_test_matrix)

In [38]:
# Predict normalized ratings
predicted_normalized_ratings = ridge.predict(test_latent_matrix)

In [39]:
# Reconstruct the original ratings
predicted_ratings = pd.DataFrame(predicted_normalized_ratings, index=test_user_rest.index, columns=test_user_rest.columns)
predicted_ratings = predicted_ratings.add(test_user_ratings_mean, axis=0)

In [40]:
# Flatten the matrices and filter out zero ratings
actual_ratings = test_user_rest.values.flatten()
predicted_ratings_flat = predicted_ratings.values.flatten()

# Filter out zero ratings (assuming zero means no rating)
non_zero_indices = actual_ratings > 0
actual_ratings_non_zero = actual_ratings[non_zero_indices]
predicted_ratings_non_zero = predicted_ratings_flat[non_zero_indices]

# Calculate RMSE
rmse = sqrt(mean_squared_error(actual_ratings_non_zero, predicted_ratings_non_zero))
print(f"Test RMSE: {rmse}")

Test RMSE: 3.9264979318587176
