# 1. Load Data

In [1]:
import numpy as np
import pandas as pd

In [3]:
#Read business object
business_obj_loc = 'yelp_dataset/yelp_academic_dataset_business.json'
business_df = pd.read_json(business_obj_loc, lines=True)
columns = business_df.head()
columns 

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [4]:
#Replace null values in business df
business_df.fillna('NA', inplace=True)

In [5]:
#Select businesses in Nashville only
df_City = business_df[business_df['city']=='Nashville']
#Show number of open and closed businesses
df_City[['is_open','business_id']].groupby(['is_open']).count()

Unnamed: 0_level_0,business_id
is_open,Unnamed: 1_level_1
0,1573
1,5398


In [6]:
#Select open businesses only
df_City = df_City[df_City['is_open']==1]
count_row = df_City.shape[0]
print(str(count_row))

5398


In [7]:
#Find categories that contain the keyword 'Restaurants'
df_City_Restaurants = df_City[df_City['categories'].str.contains('Restaurants')]
count_row = df_City_Restaurants.shape[0]
print(str(count_row))

1681


In [8]:
#Show number restaurants by category in the selected city
df_categories = df_City_Restaurants[['categories','business_id']].groupby(['categories']).count()
#df_categories.sort_values(by=['business_id'],ascending=False, inplace=True)
#df_categories[df_categories['business_id']>10]

In [9]:
#Save selected city restaurants categories to csv file
df_categories.to_csv('categories_selected_city_restaurants.csv', sep=',')

In [9]:
#Read reviews object and display it's columns

b_pandas = []
r_dtypes = {"stars": np.float16, 
            "useful": np.int32, 
            "funny": np.int32,
            "cool": np.int32,
           }

review_obj_loc = 'yelp_dataset/yelp_academic_dataset_review.json'
reader = pd.read_json(review_obj_loc, orient="records", lines=True, dtype=r_dtypes, chunksize=1000)

for chunk in reader:
    reduced_chunk = chunk[chunk['business_id'].isin(df_City_Restaurants['business_id'])]
    reduced_chunk = reduced_chunk.replace(',',';')
    b_pandas.append(reduced_chunk)
    
reviews_City_Restaurants = pd.concat(b_pandas, ignore_index=True)

#Write File to csv
reviews_City_Restaurants.to_csv('reviews_selected_city_restaurants.csv', sep=',')

count_row = reviews_City_Restaurants.shape[0]
print('There are  ' + str(count_row) + ' reviews for restaurants in selected city')

There are  266634 reviews for restaurants in selected city


# 2. Text Pre-Processing

In [78]:
import string
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer


In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marcel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
def clean_text(string_input):
    ## Remove puncuation
    string_input = string_input.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    string_input = string_input.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    string_input = [w for w in string_input if not w in stops and len(w) >= 3]
    
    string_input = " ".join(string_input)
    
    # Clean the text
    string_input = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", string_input)
    string_input = re.sub(r"what's", "what is ", string_input)
    string_input = re.sub(r"\'s", " ", string_input)
    string_input = re.sub(r"\'ve", " have ", string_input)
    string_input = re.sub(r"n't", " not ", string_input)
    string_input = re.sub(r"i'm", "i am ", string_input)
    string_input = re.sub(r"\'re", " are ", string_input)
    string_input = re.sub(r"\'d", " would ", string_input)
    string_input = re.sub(r"\'ll", " will ", string_input)
    string_input = re.sub(r",", " ", string_input)
    string_input = re.sub(r"\.", " ", string_input)
    string_input = re.sub(r"!", " ! ", string_input)
    string_input = re.sub(r"\/", " ", string_input)
    string_input = re.sub(r"\^", " ^ ", string_input)
    string_input = re.sub(r"\+", " + ", string_input)
    string_input = re.sub(r"\-", " - ", string_input)
    string_input = re.sub(r"\=", " = ", string_input)
    string_input = re.sub(r"'", " ", string_input)
    string_input = re.sub(r":", " : ", string_input)
    string_input = re.sub(r"e - mail", "email", string_input) 
    return string_input

In [12]:
#Clean up comments
reviews_City_Restaurants['text'] = reviews_City_Restaurants['text'].apply(clean_text)

In [13]:
#Save Phiiladelphia restaurants reviews to csv file
reviews_City_Restaurants.to_csv('reviews_selected_city_restaurants_Clean.csv', sep=',')

In [37]:
#Read Phiiladelphia restaurants reviews from csv file
reviews_City_Restaurants = pd.read_csv('reviews_selected_city_restaurants_Clean.csv')

In [38]:
#Find number of characters in a review
def count_text(string_input):
    string_input = str(string_input)
    nbr_chars = len(string_input)
    return nbr_chars


reviews_City_Restaurants['nbr_characters'] = reviews_City_Restaurants['text'].apply(count_text)

In [52]:
#Select a subset of reviews for better quality
#Select reviews within the last 5 years
reviews_City_Restaurants['year'] = reviews_City_Restaurants['date'].str[:4]
reviews_City_Restaurants_reduced = reviews_City_Restaurants[reviews_City_Restaurants['year'].isin(['2018','2019','2020','2021','2022'])]

#Select reviews with at least 50 characters
reviews_City_Restaurants_reduced = reviews_City_Restaurants_reduced[reviews_City_Restaurants['nbr_characters']>49]

  


In [53]:
reviews_City_Restaurants_reduced.shape

(138656, 12)

In [74]:
#Group the reviews by business
reviews_grouped_by_business = reviews_City_Restaurants_reduced.drop(['row_nbr','review_id','user_id','stars','useful','funny','cool','date','year','nbr_characters'], axis=1)
reviews_grouped_by_business.reset_index()

reviews_grouped_by_business['reviews'] = reviews_grouped_by_business.groupby(['business_id'])['text'].transform(lambda x : ' | '.join(x))

# drop duplicate data
reviews_grouped_by_business = reviews_grouped_by_business.drop(['text'], axis=1)
reviews_grouped_by_business = reviews_grouped_by_business.drop_duplicates()

reviews_grouped_by_business.shape

(1660, 2)

In [77]:
reviews_grouped_by_business.head()

Unnamed: 0,business_id,reviews
1,Zx7n8mdt8OzLRXVzolXNhQ,amazing biscuits fill blank great cocktails...
9,pSmOH4a3HNNpYM82J5ycLA,pretty good pancake place little disorganized...
13,wzE61ThXOdrSegvwSOzf5w,totally worth wait despite heat ! vegan grits...
29,wnjFEC-w0qWnyDnO8k1RpA,love ! ! new favorite local spot we have g...
33,yE1raqkLX7OZsjmX3qKIKg,arrived lunch 12p friday busy all took foreve...


In [79]:
#Vectorize reviews
count_vectorizer = CountVectorizer(min_df = .01,max_df = .99, tokenizer = WordPunctTokenizer().tokenize)
reviews_grouped_by_business = reviews_grouped_by_business.fillna('')
vectorized_reviews = count_vectorizer.fit_transform(reviews_grouped_by_business['reviews'])

In [80]:
vectorized_reviews.shape

(1660, 9950)

# 3. Randomly select "seed" restaurant

In [81]:
#Randomly select a restaurant
print(df_City_Restaurants.iloc[200,:])
print(df_City_Restaurants.iloc[200,12])

business_id                                HCHHrf21UAgbxAi8T4Q4Iw
name                                         White Duck Taco Shop
address                                             423 6th Ave S
city                                                    Nashville
state                                                          TN
postal_code                                                 28801
latitude                                                  36.1549
longitude                                                -86.7773
stars                                                           4
review_count                                                   42
is_open                                                         1
attributes      {'BusinessParking': '{'garage': False, 'street...
categories            American (New), Tacos, Mexican, Restaurants
hours           {'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...
Name: 16763, dtype: object
American (New), Tacos, Mexican, Restaurants


In [82]:
#Find all the seed restaurant's reviews by copy/paste business ID to this command
selected_reviews = reviews_City_Restaurants_reduced[reviews_City_Restaurants_reduced['business_id']=='HCHHrf21UAgbxAi8T4Q4Iw']
selected_reviews = selected_reviews['text']
selected_reviews.head()

41675    place recommended couple coworkers  disappoint...
42980    delicious tacos  many options  loved options v...
42989    food better service  staff hanging around chat...
43147    drinking yee - haw brewery  boyfriend become f...
43465    recommended friends come day before  said pork...
Name: text, dtype: object

In [83]:
#Find how many reviews the seed restaurant has
print('There are  ' + str(selected_reviews.shape[0]) + ' reviews for the selected restaurant')

There are  45 reviews for the selected restaurant


# 4. Calculate recommendations

In [84]:
from scipy.spatial.distance import cdist
# find most similar reviews
distance = cdist(count_vectorizer.transform(selected_reviews).todense().mean(axis=0), 
              vectorized_reviews.todense(),metric='cosine')

distance

array([[0.3687381 , 0.45115079, 0.37640843, ..., 0.79441875, 0.55544887,
        0.52488369]])

In [89]:
distance.size

1660

In [85]:
most_similar = distance.argsort().ravel()[:10]
most_similar

array([ 306,  610, 1176, 1180,  962,  851, 1162, 1203,    7, 1547],
      dtype=int64)

In [95]:
df_most_similar = df_City_Restaurants.loc[df_City_Restaurants['business_id'].isin(reviews_grouped_by_business['business_id'].iloc[most_similar]), ['business_id', 'categories', 'name', 'stars','review_count']]
df_most_similar

Unnamed: 0,business_id,categories,name,stars,review_count
13898,C1D2o1VV2TDjpkG3BsdseA,"Tex-Mex, Event Planning & Services, Caterers, ...",San Antonio Taco Co,3.5,386
16763,HCHHrf21UAgbxAi8T4Q4Iw,"American (New), Tacos, Mexican, Restaurants",White Duck Taco Shop,4.0,42
54127,iAID8DtHvyE5W4OyOvCczQ,"Mexican, Tacos, Restaurants, Cocktail Bars, Ba...",Pancho & Lefty's Cantina - Sylvan Park,3.5,105
80918,g-YOzV9YIPmo6vZBHQuuUg,"Restaurants, Sandwiches, Bars, Nightlife, Tex-...",Bakersfield,4.5,1215
86175,AKAYI-HTy78i4Tr_n8UEYw,"Tacos, Tex-Mex, Mexican, Restaurants",Redheaded Stranger,4.0,214
91227,2hQuKePRM5zmEFbhEFeCpQ,"Nightlife, Restaurants, Tacos, Mexican, Tex-Me...",Condado Tacos,4.5,69
107622,4szMVHmGXodrVGw4Zhev9g,"Mexican, Nightlife, Bars, Restaurants",Bartaco,4.5,1447
114374,9w-LZEkP1hf363G-OYvKDg,"Tacos, Mexican, Restaurants",Mas Tacos Por Favor,4.5,1503
118198,UV3QgpYgHO0GGTYPPr4v3g,"Restaurants, Mexican, Tex-Mex",Taqueria Del Sol,4.0,353


# 5. Initial observations
Seed restaurant category is: American (New), Tacos, Mexican, Restaurants
Recommended restaurants are all within the seed restaurant's categories, so they are adequately similar