## Web Scraper

In [1]:
# Import packages needed
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import csv
import pandas as pd
import datetime

#Create Restaurant class to store data

class Restaurant:
    def __init__(self):
        self.name = "NA"
        self.avg_rating = "NA"
        self.address = "NA"
        self.description = "NA"
        self.service_option = "NA"
        self.opening_h = "NA"
        self.total_reviews = "NA"
        #self.ratings = []
        

class Webscraper:
    def __init__(self, restaurant_location, csv_outfile): 
        self.csv_outfile = csv_outfile

         #set up options for driver
        options = webdriver.ChromeOptions()
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        options.add_argument("start-maximized")
        options.add_argument('disable-infobars') #attempting to disable infobar

        #open chrome driver
        self.driver = webdriver.Chrome("D:\chromedriver.exe",options=options)
        self.driver.maximize_window()
        self.driver.implicitly_wait(5)

        #search for restaurants in specified location
        self.driver.get(f"https://www.google.com/search?q=restaurant {restaurant_location}")
        self.wait = WebDriverWait(self.driver, 1)

        #instantiate action class to be able to move to elements such as move to new items on the page
        self.actions = ActionChains(self.driver)

        #reject all when asked about consent data cookies
        self.wait.until(EC.element_to_be_clickable((By.ID, "W0wltc")))
        bt = self.driver.find_element(By.ID, "W0wltc")
        bt.click()

        #Move to GoogleMaps
        ActionChains(self.driver).move_to_element(self.wait.until(EC.element_to_be_clickable(	(By.XPATH, "//a[contains(@href, '/search?tbs')]")))).perform()
        self.wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/search?tbs')]"))).click()

 
    def scraper(self, no_pages_to_search = 10, list_already_scrapepd = None):
        '''
        This function scrapes common (name, address, opening hours, total reviews) 
        information about the restaurants shown in a page of GoogleMaps
        '''
        #loop through the pages of restaurant results
        for page in range(1,no_pages_to_search):
        
            businesses_list =  self.driver.find_elements(By.XPATH, "//div[@aria-level='3']")

            for business in businesses_list:  

                #create resturant object to store the reviews
                restaurant = Restaurant()
                
                try:
                    business.click()
                except Exception:
                    continue

                if list_already_scrapepd is not None and business.text in list_already_scrapepd:
                    #Google Webpages are dynamical so scraper can break at times. 
                    #The list_already_scrapped is to be used so that the scraper does not do duplicate work
                    continue
                
                time.sleep(1)
                #use this in csv if only restaurant specific data and not by reviews needs to be extracted
                restaurant_info= self.get_restaurant_main_info(restaurant, business)
               
                # Add new review to restaurants list of reviews
                all_reviews_for_restaurant_df = self.get_reviews(restaurant)
                
                #Create df from dict list of restaurant reviews and append to csv file
                all_reviews_for_restaurant_df.to_csv(self.csv_outfile, mode='a', index=False, header=False)

                #TODO if the below line is commented the function only takes reviews info. if you want restaurant specific data enable this
                #restaurant_info.to_csv(self.csv_outfile,encoding='UTF8', mode='a', index=False, header=False)

            print(f"{time.localtime()}: Reviews DONE")
                  
            page_button = self.driver.find_element(By.CSS_SELECTOR, 'a[aria-label="Page ' + str(page+1) + '"]')
            page_button.click()
            
            print('page click... wait 10 seconds...')
            time.sleep(10)

        print(f"{time.localtime()}: SCRAPING COMPLETED")

    def get_restaurant_main_info(self, restaurant, business):
        #get restaurant name,address, description,avg. rating, service option 
        restaurant.name = business.text
        try: 
            restaurant.address= WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,"LrzXr"))).text
        except TimeoutException:
            pass
        
        description_banner_text =[]
        
        try:
            description_banner = WebDriverWait(self.driver, 3).until(EC.visibility_of_all_elements_located((By.CLASS_NAME,"YhemCb"))) 
            for element in description_banner:
                description_banner_text.append(element.text)
        except Exception:
            pass
        
        detailed_description = 'NA'
        
        try: 
            more_description_liner = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.XPATH,'/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[5]/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div/div[1]/div/div/c-wiz/div/div/div')))
            detailed_description = more_description_liner.text 
        except Exception:
            pass

        try:
            restaurant.description= WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,"YhemCb"))).text
        except TimeoutException:
            pass
        try:
            restaurant.avg_rating=  WebDriverWait(self.driver, 2).until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[1]/div/div[2]/div[1]/div/div/span/span[1]'))).text
        except Exception:
            pass
        
        #get service option
        try:
            final_xpath_chars = "div[1]/c-wiz/div" # needed due to dynamical nature of google maps pages
            restaurant.service_option= WebDriverWait(self.driver, 1).until(EC.visibility_of_element_located((By.XPATH,f"/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[5]/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div/{final_xpath_chars}"))).text
        except TimeoutException:
            try:
                final_xpath_chars = "div[2]/c-wiz/div"
                restaurant.service_option= WebDriverWait(self.driver, 1).until(EC.visibility_of_element_located((By.XPATH,f"/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[5]/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div/{final_xpath_chars}"))).text
            except TimeoutException:
                restaurant.service_option='NA'

        #Expand restaurant opening hours widget
        try:
            # Try Except needed because GoogleMaps has two types of opening the "opening hours" widget
            self.wait.until(EC.element_to_be_clickable(	(By.CLASS_NAME, 'JjSWRd'))).click()
        except TimeoutException: 
            try:
                self.wait.until(EC.element_to_be_clickable(	(By.CLASS_NAME, 'XCdOnb'))).click()
                restaurant.opening_h = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,'e6MFBe'))).text
            except TimeoutException:
                pass
            
        if restaurant.opening_h == "NA":  
        #Get restaurant opening ours data    
            try:
                # Try Except needed because GoogleMaps has three types of storing opening hours data
                restaurant.opening_h = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.XPATH,'/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[5]/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div/div[2]/div/div[3]/div'))).text
            except TimeoutException:
                try: 
                    restaurant.opening_h = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.XPATH,'/html/body/div[6]/div/div[9]/div[2]/div/div[2]/async-local-kp/div/div/div[1]/div/g-sticky-content-container/div/block-component/div/div[1]/div/div/div/div[1]/div/div/div[5]/g-flippy-carousel/div/div/ol/li[1]/span/div/div/div/div[6]/div'))).text
                except TimeoutException:
                    try:
                        restaurant.opening_h = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,'e6MFBe'))).text
                    except TimeoutException:  
                        try:
                            restaurant.opening_h = WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,'a-h'))).text
                        except Exception:  
                            pass


        # Second type of opening hours widget needs to be closed before other info can be scrapped
        try:
            self.wait.until(EC.element_to_be_clickable(	(By.XPATH, '/html/body/div[12]/g-lightbox/div/div[2]/div[2]'))).click() 
        except TimeoutException:
            try:
                webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
            except TimeoutException or ElementClickInterceptedException or ElementNotVisibleException:
                pass


        # Get total no. of reviews
        try:
            restaurant.total_reviews= (WebDriverWait(self.driver, 3).until(EC.visibility_of_element_located((By.CLASS_NAME,"z5jxId"))).text[:-8].replace('.', ''))
        except Exception:
            pass

        info = [{
                'name': restaurant.name,
                'address': restaurant.address,
                'opening_h':restaurant.opening_h,
                'total_reviews': restaurant.total_reviews,
                'avg_rating': restaurant.avg_rating,
                'description_banner_text': description_banner_text,
                'detailed_description':detailed_description,
                'service_option':restaurant.service_option

            }]
        print(info)
       #time.sleep(2)
        return pd.DataFrame.from_dict(info)

    def get_reviews(self, restaurant) -> dict:
        restaurant_reviews = []
        #Click on widget to open reviews section for restaurant
        try: 
            self.wait.until(EC.element_to_be_clickable(	(By.CLASS_NAME, 'rVwa1d'))).click()
        except ElementClickInterceptedException:
            self.wait.until(EC.element_to_be_clickable(	(By.CLASS_NAME, 'KYeOtb'))).click()
        
        time.sleep(5)

        #load the amount of reviews needed
        max_no_reviews = 200

        #If a restaurant has less reviews than the max_no_reviews, take all the reviews the restaurant has
        if int(restaurant.total_reviews) < max_no_reviews:
            max_no_reviews = int(restaurant.total_reviews)

        reviews_class = []
        service_options_class = []
        star_buttons_class = []

        print(f"{time.localtime()}: scroll start")
        scroll = self.driver.find_element(By.CLASS_NAME, "srp")
        for i in range(max_no_reviews-1):
            # Go to Reviews pane and scroll to the max_no_reviews            
            self.actions.move_to_element(scroll).perform()
            scroll.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
            
        print(f"{time.localtime()}: DONE SCROLL" )
        
        # While the reviews_class does not have the expected no of reviews(max_ro_reviews), 
        # scroll for 20 times in the reviews pane to make sure it is loaded, and get the needed no of reviews
        while len(reviews_class) < max_no_reviews-1:   
            time.sleep(5) 
                      
            #Scroll 20 times
            for i in range(2):
                self.actions.move_to_element(scroll).perform()
                scroll.send_keys(Keys.PAGE_DOWN)

            reviews_class = self.driver.find_elements(By.CLASS_NAME,'jxjCjc')
            service_options_class = self.driver.find_elements(By.CLASS_NAME, "PV7e7")
            star_buttons_class= self.driver.find_elements(By.CLASS_NAME,'PuaHbe')     
                
        idx_dine_in = 0

        print(f"{time.localtime()}: Reviews Phase")
        for i in range(len(reviews_class)):
            #Get star rating of review
            star_rating = float(star_buttons_class[i].accessible_name.split()[1].replace(",","."))
            service_option = "NA"

            #Corelate reviews and service option  information
            if idx_dine_in < len(service_options_class):
                string_split = reviews_class[i].text.splitlines()
        
                if len(string_split) > 4 and service_options_class[idx_dine_in] == string_split[4]:
                    service_option = service_options_class[idx_dine_in]
                    idx_dine_in =+ 1                   

            # create rating tuple of review's star rating and service option
            tuple_rating = (star_rating, service_option)
            
            # return dict of review information
            restaurant_reviews.append( {
                'user': reviews_class[i].text.splitlines()[0],
                'name': restaurant.name ,
                'address': restaurant.address,
                'description': restaurant.description,
                'service_option': restaurant.service_option,
                'opening_h':restaurant.opening_h,
                'total_reviews': restaurant.total_reviews,
                'rating': tuple_rating
            })

            #Create df from dict list of restaurant reviews and append to csv file
        return pd.DataFrame.from_dict(restaurant_reviews)
            
            

list_rest_already_done = ['KOMA Singapore','Quay House','Les Amis','Whitegrass Restaurant',
'Bar-Roque Grill','Restaurant Ibid']

 #TODO Remove comment from below to be able to run
#if __name__ == "__main__":
   
    #s = Webscraper("Singapore", csv_outfile = "D:\HomeDSforBuss\EXAM\scraped_rv2.csv")
    #s.scraper(10,list_rest_already_done)

# Data Analysis and Modeling

In [1]:
import numpy as np
import pandas as pd
import time
import random
# Import pandas_profiling to see preliminary info about dataframes
from pandas_profiling import ProfileReport

#Set pandas options for ease of cleaning
pd.set_option('display.max_rows', 500)
pd.options.mode.chained_assignment = None

#import surprise package for recommender system 
import surprise
from surprise.reader import Reader
from surprise import Dataset
from surprise.model_selection import GridSearchCV

#cross_validation module
from surprise.model_selection import cross_validate


#Matrix Factorization Algorithms
from surprise import SVD
from surprise import NMF

#Packages for finding coordinates of restaurant locations
from geopy.geocoders import Nominatim, GoogleV3
import io
geolocator = Nominatim(user_agent="Geolocation")

# Visualisation
import altair as alt
# Visuals based on geocoordinates
import folium 

#Import corpora to use in LDA
from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA


#import nlp to clean for LDA
import preprocessor as prepro # twitter prepro
import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module
import en_core_web_sm
import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

import nltk.corpus #to use in building corpus for topic modelling

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

#comparing funct baseed on key
from functools import cmp_to_key #comparing function 

  from imp import reload


## Load Data

In [13]:
#Load Data 

#load reviews dataframe
reviews_df = pd.read_csv(r"D:\HomeDSforBuss\EXAM\scrapped_reviews_all.csv")

#load restaurant info dataframe
rest_info = pd.read_csv(r"D:\HomeDSforBuss\EXAM\data_restaurant_specific.csv", names = ['name', 'address', 'opening_h', 'total_reviews', 'overall_rating', 'type','description','service_options'])

In [14]:
#see how dataframe looks like
reviews_df.head(3)

Unnamed: 0,restaurant,address,description,service,hours,total_revs,usr_rating,usr_id
0,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",29242
1,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",20980
2,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",3298


For data preparation, we will assign a unique id to each restaurant. Additionally, since, we know the scraper has taken each restaurant multiple times, we remove duplicate rows to properly format the dataframe as a facts table

In [15]:
#assign id number to restaurants so later they can be merged with the reviews dataframe
rest_info = rest_info.assign(restaurant_id=(rest_info['name']).astype('category').cat.codes)

#make sure to keep only last entry for each restaurant
restaurants_df = rest_info.drop_duplicates(subset=['name'], keep='last',ignore_index=True)

In [16]:
#see how dataframe looks like
restaurants_df.head(3)

Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id
0,Quay House,"51 Circular Rd, Singapore 049406","Address: 51 Circular Rd, Singapore 049406",51.0,50,['Restaurant'],,Service options: Dine-in,123
1,Jaan By Kirk Westaway,"2 Stamford Rd, Level 70, Singapore 178882",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414.0,46,"['$$$$', 'Restaurant']","Modern, 40-seat European-British restaurant wi...",Service options: Dine-in · Takeaway · No-conta...,68
2,Cloudstreet,"84 Amoy St, Singapore 069903",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414.0,46,"['$$$$', 'Restaurant']",Sophisticated restaurant in a high-end hotel o...,Service options: Dine-in · Takeaway · No-conta...,31


## Restaurants Data

The restaurants_df will be used as a fact table where additional information about the restaurants which were scrapped for reviews, will be found. Thus, the restaurants df is used to provide of overview of the Singaprorean restaurants considered in this project. 

In [17]:
#Check shape and look of dataframe
print(restaurants_df.shape)
restaurants_df.head(3)

(184, 9)


Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id
0,Quay House,"51 Circular Rd, Singapore 049406","Address: 51 Circular Rd, Singapore 049406",51.0,50,['Restaurant'],,Service options: Dine-in,123
1,Jaan By Kirk Westaway,"2 Stamford Rd, Level 70, Singapore 178882",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414.0,46,"['$$$$', 'Restaurant']","Modern, 40-seat European-British restaurant wi...",Service options: Dine-in · Takeaway · No-conta...,68
2,Cloudstreet,"84 Amoy St, Singapore 069903",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414.0,46,"['$$$$', 'Restaurant']",Sophisticated restaurant in a high-end hotel o...,Service options: Dine-in · Takeaway · No-conta...,31


Before moving on, some simple cleaning of certain fields that look messy based on the report will be conducted

In [18]:
profile = ProfileReport(restaurants_df, title="Restaurant Dataframe Profiling")
profile.to_notebook_iframe()

  from IPython.core.display import display


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
restaurants_df.opening_h = restaurants_df.opening_h.fillna('')
#if opening hours are not available, remove noise
restaurants_df['opening_h']=np.where([i.startswith(('Located','Phone', 'Address')) for i in restaurants_df['opening_h']],'',restaurants_df['opening_h'])
#Get only service options
restaurants_df.service_options = restaurants_df.service_options.fillna('')
restaurants_df['service_options'] = restaurants_df['service_options'].apply(lambda x: x[17:])
#Get theme of the restaurant from its type
restaurants_df['theme'] = restaurants_df['type'].apply(lambda x: x.split("', '")[-1][:-2] if"', '" in x else x[2:-2])
#Get price range symbols
restaurants_df['price_range'] = restaurants_df['type'].apply(lambda x: x.split("', '")[0][2:] if"', '" in x else x[2:-2])
restaurants_df['price_range'] = np.where((restaurants_df['price_range'] == restaurants_df['theme']),"NA",restaurants_df['price_range'])


In [25]:
restaurants_df['price_range'].unique()

array(['NA', '$$$$', '$$$', '$$', '$'], dtype=object)

In [26]:
#Since in profiler the char $ is consituting a problem and needs to be escaped a new df will be created for the raport
#In this df the $ char will be replaced by £
profiling_df = restaurants_df.copy()
profiling_df['price_range'] = profiling_df['price_range'].apply(lambda x: x.replace("$","£"))

In [27]:
profile = ProfileReport(profiling_df, title="Restaurant Dataframe Profiling")
profile.to_notebook_iframe()


  from IPython.core.display import display


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

From pofiling of the dataframe we can see there are some missing values in different fields: address, price range, description, type, overall rating. Since these missing values seem to be few we will decide later how they should be handled. Either manually added or in a different manner

*Note: there seems to be a correlation between overall rating and the description of the  restaurants*


Since location is a very important aspects for the restaurant industry, the future section will focus on it. Additionally, the popularity and quality of the restaurants will be analyzed with the help of the total number of reviews and the restaurants' avg. rating

In [28]:
#Check the three empty rows for address
restaurants_df[restaurants_df['address'].isna()]

Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id,theme,price_range
82,LAVO Italian Restaurant And Rooftop Bar,,,,,[],,,79,,
125,Claudine Restaurant,,,,,[],,,30,,
179,Brasserie Gavroche,,,,,[],,,19,,


In [29]:
#Since all values seem to be missing, but only 3 rows are affected, the data is added manually
#Add manually the three rows that were empty
restaurants_df.at[82,'address']='10 Bayfront Ave, Tower 1, Level 57, Singapore 018956'
restaurants_df.at[82,'price_range']='$$$$'
restaurants_df.at[82,'theme']='Italian restaurant'
restaurants_df.at[82,'overall_rating']='4,3'
restaurants_df.at[82,'total_reviews']='2880'

restaurants_df.at[125,'address']='39C Harding Rd, Singapore 249541'
restaurants_df.at[125,'price_range']='NA'
restaurants_df.at[125,'theme']='Italian restaurant'
restaurants_df.at[125,'overall_rating']='4,4'
restaurants_df.at[125,'total_reviews']='Restaurant'
restaurants_df.at[125,'service_options']='Dine-in · No takeaway · No delivery'

restaurants_df.at[179,'address']='66 Tras St, Singapore 079005'
restaurants_df.at[179,'price_range']='$$$'
restaurants_df.at[179,'theme']='Italian restaurant'
restaurants_df.at[179,'overall_rating']='4,4'
restaurants_df.at[179,'total_reviews']='French restaurant'
restaurants_df.at[179,'service_options']='Dine-in · Kerbside pickup · Delivery'

In [30]:
#create coordinates column, initialized as 'None'
restaurants_df['coordinates']= 'None'

In [31]:
#Loop through addresses so that coordinated can be found

# Observed edge cases that need to be handles
#   tg pagar == Tanjong Pagar
#   removed #XX-XX number in address1

#observed problematic rows 96,59, 67, 78, 130
for row in range(restaurants_df.shape[0]):
    #TODO fix these
    if row in [59,67,78,96,130]:
        continue
    #if row == 163:
    if restaurants_df.at[row,'coordinates'] == None or restaurants_df.at[row,'coordinates'] == 'None':
        address = restaurants_df.at[row,'address']
        #address = address if ", Level" not in address else address.replace(", Level","")
        if "Tg" in address:
            address = address.replace("Tg","Tanjong")
        if "Rd" in address:
            address = address.replace("Rd ","Road")
        if "Blvd" in address:
            address =address.replace("Blvd", "Boulevard")
        #if "#" in address:
        #    address = address.replace("#","")
        if "St" in address:
            address = address.replace("St ", "Street")
        if "Eurasian Community House" in address:
            address = address.replace("Eurasian Community House", "")

        if "," in address:
            address1 = address.split(",")[0].strip().replace('#','')
            address2 = address.split(",")[-1].strip()
            
            if address2[-1].isdigit() and " " in address2:
                address2 = address2.split()[0]
        
            address = f"{address1} {address2}"
        
        location= geolocator.geocode(address)
        restaurants_df.at[row,'coordinates'] = location
             
            
            
            #print(geolocator.geocode(address))
        time.sleep(1.1)
    
        #print(f"{row}: {restaurants_df.at[row,'coordinates']}")
        

In [32]:
#Checking if any coordinates were not found. 
restaurants_df[restaurants_df['coordinates'].isna()]

Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id,theme,price_range,coordinates
16,Yan Restaurant at National Gallery Singapore 宴中餐馆,"1 Saint Andrew's Road #05-02 National Gallery,...","Sunday\n(Lunar New Year)\n11:30 am–2:30 pm, 6–...",499.0,41,"['$$$', 'Chinese restaurant']",,Dine-in · Takeaway · Delivery,177,Chinese restaurant,$$$,
29,Swee Choon Tim Sum Restaurant,"183/185/187/189, Jln Besar, 191/193, Singapore...","Sunday\n(Lunar New Year)\n9 am–4 pm, 6 pm–4 am...",7302.0,42,"['$$', 'Dim sum restaurant']",Long-standing eatery offering classic Cantones...,Dine-in · Takeaway · No-contact delivery,151,Dim sum restaurant,$$,
80,Social Place Singapore,Social Place Singapore FORUM The Shopping Mall...,Monday\n(Lunar New Year)\nClosed\nTuesday\n(Lu...,482.0,43,['Chinese restaurant'],,Dine-in · Kerbside pickup · No-contact delivery,142,Chinese restaurant,,
93,"CÉ LA VI Singapore: Restaurant, SkyBar & Club ...","1 Bayfront Avenue Marina Bay Sands, Hotel, Tow...",Monday\n(Lunar New Year)\n11 am–11 pm\nHours m...,188.0,42,"['$$$$', 'Club']",,Dine-in · Takeaway · Delivery,39,Club,$$$$,
118,Original Sin Restaurant,"01-62 Jln Merah Saga, Chip Bee Gardens, Blk 43...",Monday\n(Lunar New Year)\nClosed\nHoliday open...,967.0,43,"['$$$', 'Mediterranean restaurant']",,Dine-in · Takeaway · No-contact delivery,112,Mediterranean restaurant,$$$,
162,Spago Dining Room,"10 Bayfront Avenue L57, Sands SkyPark, Hotel, ...","Monday\n(Lunar New Year)\n12–2:30 pm, 6–10 pm\...",1379.0,45,"['$$$$', 'American restaurant']",,Dine-in · Takeaway · Delivery,146,American restaurant,$$$$,
163,Blossom Restaurant,"Marina Bay Sands Hotel, Lobby Tower 2, #01-05/...","Monday\n(Lunar New Year)\n11:30 am–3:45 pm, 6–...",1162.0,47,['Fine dining restaurant'],,Dine-in · Takeaway · Delivery,15,Fine dining restaurant,,


In [33]:
#print all addresses that posed a problem
#restaurants_df[restaurants_df['coordinates'] == 'None']['address']
print([restaurants_df.at[59,'address'],
restaurants_df.at[67,'address'],
restaurants_df.at[78,'address'],
restaurants_df.at[96,'address'],
restaurants_df.at[130,'address']])

["1 St Andrew's Rd, #01-04 National Gallery, Singapore 178957", "1 St Andrew's Rd, #02–01 National Gallery, Singapore 178957", '80 Mount Pleasant Rd, Singapore 298334', '3 Park Ln, Singapore 798387', '25 Church St, #01-03, 25 Capital Square 3, Singapore 049482']


In [34]:
#manually fix addresses after trials 
#Specific edgecases for 5 values, thus the below is hardcoding the solution
restaurants_df.at[163,"coordinates"] = geolocator.geocode('2 Bayfront Avenue, Singapore')
restaurants_df.at[162,"coordinates"] = geolocator.geocode('10 Bayfront Avenue  Singapore')
restaurants_df.at[118,"coordinates"] = geolocator.geocode('Jln Merah Saga, Singapore')
restaurants_df.at[93,"coordinates"] = geolocator.geocode('1 Bayfront Avenue  Singapore')
restaurants_df.at[80,"coordinates"] = geolocator.geocode('Orchard Rd, #01-22 583, Singapore 238884')
restaurants_df.at[29,"coordinates"] = geolocator.geocode('Jalan Besar,Singapore')
restaurants_df.at[16,"coordinates"] = geolocator.geocode("1 Saint Andrew's Road Singapore")

restaurants_df.at[59,'coordinates'] = geolocator.geocode("1 Saint Andrew's Road Singapore")
restaurants_df.at[67,'coordinates'] = geolocator.geocode("1 Saint Andrew's Road Singapore")
restaurants_df.at[78,'coordinates'] = geolocator.geocode("Mount Pleasant Road, Singapore")
restaurants_df.at[96,'coordinates'] = geolocator.geocode("Park Lane, Singapore")
restaurants_df.at[130,'coordinates'] = geolocator.geocode("1 Saint Andrew's Road Singapore") 


In [35]:
#Extract longitude and latitude coord
restaurants_df['longitude'] = restaurants_df['coordinates'].apply(lambda x: x.longitude)
restaurants_df['latitude'] = restaurants_df['coordinates'].apply(lambda x: x.latitude)

In [36]:
#to be able to transform into float, replace , in overall_rating field
restaurants_df['overall_rating'] = restaurants_df['overall_rating'].apply(lambda x: str(x).replace(",","."))
#restaurants_df['total_reviews'] = restaurants_df['total_reviews'].apply(lambda x: str(x).replace(".",""))

In [37]:
#Fix issue detected when debugging
#Something went off here so the total reviews value should be at theme
restaurants_df.at[179,'theme'] =  restaurants_df.at[179,'total_reviews']
restaurants_df.at[125,'total_reviews'] = 123
restaurants_df.at[179,'total_reviews'] = 356

In [38]:
#check dtypes of needed columns
restaurants_df.dtypes

name                object
address             object
opening_h           object
total_reviews       object
overall_rating      object
type                object
description         object
service_options     object
restaurant_id        int16
theme               object
price_range         object
coordinates         object
longitude          float64
latitude           float64
dtype: object

In [39]:
#transform datatypes of needed columns
# using dictionary to convert specific columns
convert_dict = {'latitude': float,
                'longitude': float,
                'overall_rating':float,
                'total_reviews':int
                }
 
restaurants_df = restaurants_df.astype(convert_dict)
print(restaurants_df.dtypes)

name                object
address             object
opening_h           object
total_reviews        int32
overall_rating     float64
type                object
description         object
service_options     object
restaurant_id        int16
theme               object
price_range         object
coordinates         object
longitude          float64
latitude           float64
dtype: object


In [40]:
#Create initial map to see where all restaurants are located
map = folium.Map(location=[1.35, 103.86], zoom_start=11.2, control_scale=True)
for index, rest_location in restaurants_df.iterrows():
    
    folium.Marker([restaurants_df.at[index,"latitude"],
    restaurants_df.at[index,"longitude"]],
    tooltip = f"Rating:[{restaurants_df.at[index,'overall_rating']}]\n No. Reviews: [{int(restaurants_df.at[index,'total_reviews'])}]").add_to(map)

map

The next steps will infer the popularity and quality of restaurants

In [41]:
#Inspect how many restaurants are rated over 4
restaurants_df[restaurants_df['overall_rating'] > 4]['name'].count()

176

In [42]:
#create histogram to show frequency of values for overall rating
alt.Chart(restaurants_df[['overall_rating','name']]).mark_bar().encode(
    alt.X("overall_rating:Q", bin=True),
    y='count()',
)

  for col_name, dtype in df.dtypes.iteritems():


In [43]:
#Based on histogram above make 4 categories of restaurants
restaurants_df['quality_category'] = ""

for row in range(restaurants_df.shape[0]):
    if restaurants_df.at[row,"overall_rating"] < 4.2:
        restaurants_df.at[row,'quality_category'] = "Poor"
    elif restaurants_df.at[row,"overall_rating"] < 4.6:
        restaurants_df.at[row,'quality_category'] = "Average"
    elif restaurants_df.at[row,"overall_rating"] < 4.8:
        restaurants_df.at[row,'quality_category'] = "High"
    else:
        restaurants_df.at[row,'quality_category'] = "Excellent"


In [44]:
#Create histogram based on no of reviews to infer popularity
alt.Chart(restaurants_df[['total_reviews','name']]).mark_bar().encode(
    alt.X("total_reviews:Q", bin=alt.Bin(extent=[0, 8000], step=200)),
    y='count()',
)

  for col_name, dtype in df.dtypes.iteritems():


In [45]:
#Based on histogram above make 3 categories of restaurants
restaurants_df['popularity_category'] = ""

for row in range(restaurants_df.shape[0]):
    if restaurants_df.at[row,"total_reviews"] < 200:
        restaurants_df.at[row,'popularity_category'] = "Least Popular"
    elif restaurants_df.at[row,"total_reviews"] < 1800:
        restaurants_df.at[row,'popularity_category'] = "Average"
    else:
        restaurants_df.at[row,'popularity_category'] = "Very Popular"


Since we have now both categories (popularity and quality) they will be mapped against each other on the map, to see which type of quality restaurants are the most popular

In [46]:
# create helper dictionaries for map
#shows quality
icon_markers = {'Excellent':'star', 'High':'glyphicon glyphicon-arrow-up', 'Average':'glyphicon glyphicon-resize-small', 'Poor':'glyphicon glyphicon-arrow-down'}
#shows popularity
icon_colors = {'Least Popular': 'lightgray' , 'Average':'gray', 'Very Popular':'lightblue'}

In [47]:
# create quality and popularity map
locations_by_popularity_quality = folium.Map(location=[1.35, 103.86], zoom_start=11.2, control_scale=True)

for index, rest_location in restaurants_df.iterrows():

    restaurant_quality = restaurants_df.at[index,'quality_category']
    icon_marker = icon_markers[restaurant_quality]

    restaurant_popularity = restaurants_df.at[index,'popularity_category']
    icon_color= icon_colors[restaurant_popularity]
    
    folium.Marker([restaurants_df.at[index,"latitude"],
    restaurants_df.at[index,"longitude"]],
    tooltip = f"Rating:[{restaurants_df.at[index,'overall_rating']}]\\n No. Reviews: [{int(restaurants_df.at[index,'total_reviews'])}]",
    icon=folium.Icon(color=icon_color, icon=icon_marker)).add_to(locations_by_popularity_quality)

locations_by_popularity_quality

*Note: most popular restaurants are average with one exception*

In [48]:
restaurants_df.head(3)

Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id,theme,price_range,coordinates,longitude,latitude,quality_category,popularity_category
0,Quay House,"51 Circular Rd, Singapore 049406",,51,5.0,['Restaurant'],,Dine-in,123,Restaurant,,"(TCC (Circular Road), 51, Circular Road, Clark...",103.849595,1.285859,Excellent,Least Popular
1,Jaan By Kirk Westaway,"2 Stamford Rd, Level 70, Singapore 178882",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414,4.6,"['$$$$', 'Restaurant']","Modern, 40-seat European-British restaurant wi...",Dine-in · Takeaway · No-contact delivery,68,Restaurant,$$$$,"(Swissôtel The Stamford, 2, Stamford Road, Civ...",103.853399,1.293317,High,Average
2,Cloudstreet,"84 Amoy St, Singapore 069903",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414,4.6,"['$$$$', 'Restaurant']",Sophisticated restaurant in a high-end hotel o...,Dine-in · Takeaway · No-contact delivery,31,Restaurant,$$$$,"(84, Amoy Street, Chinatown, Outram, Singapore...",103.846837,1.280865,High,Average


In [49]:
popular_restaurants = restaurants_df[restaurants_df['popularity_category'] == 'Very Popular']


In [50]:
# create quality and popularity map
locations_by_popularity_quality = folium.Map(location=[1.35, 103.86], zoom_start=13, control_scale=True)

for index, rest_location in popular_restaurants.iterrows():

    restaurant_quality = popular_restaurants.at[index,'quality_category']
    icon_marker = icon_markers[restaurant_quality]

    restaurant_popularity = popular_restaurants.at[index,'popularity_category']
    icon_color= icon_colors[restaurant_popularity]
    
    folium.Marker([popular_restaurants.at[index,"latitude"],
    popular_restaurants.at[index,"longitude"]],
    tooltip = f"Rating:[{popular_restaurants.at[index,'overall_rating']}]\\n No. Reviews: [{int(popular_restaurants.at[index,'total_reviews'])}]",
    icon=folium.Icon(color=icon_color, icon=icon_marker)).add_to(locations_by_popularity_quality)

locations_by_popularity_quality

To get to know more about the data, it is interesting to look at the price range for the categories

In [51]:
#WORKS
bars = alt.Chart(restaurants_df[['name','price_range','popularity_category']]).mark_bar().encode(
    x=alt.X('count(name)):Q', stack='zero'),
    y=alt.Y('price_range:N'),
    color=alt.Color('popularity_category')
)

text = alt.Chart(restaurants_df[['name','price_range','popularity_category']]).mark_text(dx=-15, dy=3, color='white').encode(
    x=alt.X('count(name):Q', stack='zero'),
    y=alt.Y('price_range:N'),
    detail='popularity_category:N',
    text=alt.Text('count(name):Q', format='.1f')
)

bars + text

  for col_name, dtype in df.dtypes.iteritems():


Since we saw in the report that 78 values are missing, we can map them with the help of LDA. 
To do so we will use the restaurant descriptions and themes.

First let's see if the themes could help.

In [52]:
alt.Chart(restaurants_df[['name','price_range','theme']]).mark_bar().encode(
    x=alt.X('count(name)', stack="normalize"),
    y='theme',
    color='price_range'
)

  for col_name, dtype in df.dtypes.iteritems():


In [53]:
#check again how many values are missing for price range
restaurants_df.groupby(['price_range'])['name'].count()

price_range
$        1
$$      31
$$$     46
$$$$    30
NA      76
Name: name, dtype: int64

In [54]:
restaurants_df['clean_description'] = restaurants_df['theme'] + ' ' + restaurants_df['description'] + ' ' + restaurants_df['name']
#since there are so many but yet it is feasible to do it by hand i will check for each restaurant from its name and assign a description
    

In [55]:
# Function to clean text of Position column which now is in clean_text_position

# write everything into one function that can be re-used later
def text_cleaner(texts,stop_words):

#remove all punctuation  
  texts_clean = texts.apply(lambda t: str(t).lower())
 # texts_clean = texts.apply(lambda t: re.sub(r'[^a-zA-Z0-9]', ' ', t))
  
  clean_container = []

  # progress bar
  
   
  # nlp pipeline. tokanizes inside... 
  nlp_pipeline =  nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  for text in nlp_pipeline:
    #text is an entire string (observation from Position)

    #here i just lowercase every element from string
    txt = [token.lemma_.lower() for token in text 
          if token.is_alpha 
          and not token.is_stop 
          and not token.is_punct
          and (str(token).lower() not in stop_words) ==True
          ]

    
    clean_container.append(" ".join(txt))
   
  
  
  return clean_container



Looks like some of the themes are dominated by one price category. We will therefore go ahead with the LDA

In [56]:
#stopwords to be removed from text
extra_stopwords = ["with", "a", "restaurant"]
stop_words = nltk.corpus.stopwords.words('english')
# pre-process text
restaurants_df['clean_description'] = text_cleaner(restaurants_df['clean_description'],stop_words)




In [57]:
# Merge description and theme to get the most our of the descriptors
restaurants_df['tokens'] = restaurants_df['clean_description'].apply(lambda t: str(t).split())

In [58]:
#Check how df looks like
restaurants_df.head(3)

Unnamed: 0,name,address,opening_h,total_reviews,overall_rating,type,description,service_options,restaurant_id,theme,price_range,coordinates,longitude,latitude,quality_category,popularity_category,clean_description,tokens
0,Quay House,"51 Circular Rd, Singapore 049406",,51,5.0,['Restaurant'],,Dine-in,123,Restaurant,,"(TCC (Circular Road), 51, Circular Road, Clark...",103.849595,1.285859,Excellent,Least Popular,,[nan]
1,Jaan By Kirk Westaway,"2 Stamford Rd, Level 70, Singapore 178882",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414,4.6,"['$$$$', 'Restaurant']","Modern, 40-seat European-British restaurant wi...",Dine-in · Takeaway · No-contact delivery,68,Restaurant,$$$$,"(Swissôtel The Stamford, 2, Stamford Road, Civ...",103.853399,1.293317,High,Average,restaurant modern seat european british restau...,"[restaurant, modern, seat, european, british, ..."
2,Cloudstreet,"84 Amoy St, Singapore 069903",Hours:\nCloses soon ⋅ 10.30 pm ⋅ Opens 11.30 a...,414,4.6,"['$$$$', 'Restaurant']",Sophisticated restaurant in a high-end hotel o...,Dine-in · Takeaway · No-contact delivery,31,Restaurant,$$$$,"(84, Amoy Street, Chinatown, Outram, Singapore...",103.846837,1.280865,High,Average,restaurant sophisticated restaurant high end h...,"[restaurant, sophisticated, restaurant, high, ..."


In [59]:
# Create a corpus from the tokens: dictionary
dictionary = Dictionary(restaurants_df['tokens'])
# filter out low-frequency / high-frequency stuff
dictionary.filter_extremes(no_below=3, no_above=0.5)
#finalize corpus from dict
corpus = [dictionary.doc2bow(doc) for doc in restaurants_df['tokens']]

In [60]:
# Training the model
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=4, workers = 4, passes=10)
# Create variable to visualize
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [61]:
 # Let's Visualize
pyLDAvis.display(lda_display)

In [62]:
from pprint import pprint
# Print the Keyword in the 4 topics to be able to set them
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.146*"restaurant" + 0.063*"trendy" + 0.063*"urban" + 0.063*"farm" + '
  '0.060*"free" + 0.059*"plus" + 0.059*"eco" + 0.059*"products" + '
  '0.059*"gluten" + 0.059*"friendly"'),
 (1,
  '0.149*"restaurant" + 0.059*"italian" + 0.052*"bar" + 0.046*"french" + '
  '0.043*"plus" + 0.036*"wine" + 0.036*"eatery" + 0.036*"cocktails" + '
  '0.035*"dishes" + 0.034*"japanese"'),
 (2,
  '0.165*"restaurant" + 0.040*"cuisine" + 0.039*"singapore" + 0.036*"french" + '
  '0.036*"fine" + 0.036*"dining" + 0.030*"offering" + 0.030*"decor" + '
  '0.028*"dishes" + 0.027*"cocktails"'),
 (3,
  '0.158*"restaurant" + 0.053*"cuisine" + 0.053*"offering" + 0.046*"hotel" + '
  '0.040*"high" + 0.040*"sum" + 0.033*"sophisticated" + 0.033*"end" + '
  '0.033*"cantonese" + 0.033*"dim"')]


In [89]:
#Based on visual and words the categories were mapped as per below
lda_dict = {0:'$', 
            1:'$$', 
            2:'$$$$', 
            3:'$$$',
            '-':'-'}

In [90]:
#DEFINE classifier function
#get all topics probabilitites
def classifier(texts):

  #get probabilities list for each observation in list of tuples format [(cluster_no, probability)...]
  texts_clean = texts.apply(lambda t: lda_model[dictionary.doc2bow(t)])

  # First: sort probabilities list by probability in desc order (list of tuples e.g. (cluster_no, probabilty))
  # Second: get cluster number of first element (highest probability) 
  # Third: convert cluster number (lda_dict key) to predefined cluser name (lda_dict value)

  ##############################################
  texts_clean = texts_clean.apply( lambda t: lda_dict[sorted(t, reverse=True, key=cmp_to_key(lambda item1, item2: item1[1] - item2[1]))[0][0]] )

  return texts_clean

In [91]:
#Categorize data based on LDA
restaurants_df['price_range_LDA'] = classifier(restaurants_df['tokens'])

#add the new category to price range missing values
for idx in range(restaurants_df.shape[0]):
    if restaurants_df.at[idx,'price_range'] == 'NA':
        restaurants_df.at[idx,'price_range'] = restaurants_df.at[idx,'price_range_LDA']
    else: 
        continue

#drop helper column
restaurants_df = restaurants_df.drop('price_range_LDA', axis=1)

In [92]:
#Check normalized split between popularity categories
alt.Chart(restaurants_df[['name','price_range','popularity_category']]).mark_bar().encode(
    x=alt.X('count(name)', stack="normalize"),
    y='popularity_category',
    color='price_range'
)

  for col_name, dtype in df.dtypes.iteritems():


## Reviews Data

Since the reviews dataframe is quite large, the pandas_profiling module will be used to gain insights on the dataframe

In [93]:
#Check lenght and width or dataframe
reviews_df.shape

(169239, 11)

It can be noticed that the [dining_option] column contains 167344 NA values, thus it will be removed as it does not offer much meaning.

When it comes to the [rating] column, we notice some values of 6. This is due to an initial bug in the scraper. Since for these tweets are very few (1.1%) they will be removed.

Additionally, since in the recommender system we wish to build is collaborative. We will remain with the three columns we are intrested in: usr_id, restaurant_id, rating.

To do so, we first need to get the restaurant id from the rest_info dataframe.

*Note: it is interesting to see that there are only 30654 unique users. This may be helpful later on*

In [94]:
reviews_df.head(3)

Unnamed: 0,restaurant,address,description,service,hours,total_revs,usr_rating,usr_id,dining_option,rating,restaurant_id
0,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",29242,'NA,5.0,71
1,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",20980,'NA,5.0,71
2,KOMA Singapore,"2 Bayfront Ave, # B1 - 67, Singapore 018972",Asian fusion restaurant,Service options: Dine-in · Takeaway · Delivery,"Hours:\nThursday 11:30 am–3 pm, 5 pm–12 am\nFr...",1325,"(5.0, 'NA')",3298,'NA,5.0,71


In [95]:
#Get dining option and rating as separate columns
reviews_df['dining_option'] = reviews_df['usr_rating'].apply(lambda x: x[6:-2])
reviews_df['rating'] = reviews_df['usr_rating'].apply(lambda x: float(x[1:4].replace(",",".")))

In [96]:
profile = ProfileReport(reviews_df, title="Reviews Profiling Report")
profile.to_notebook_iframe()

  from IPython.core.display import display


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [97]:
#Get restaurant id
reviews_df['restaurant_id'] = reviews_df['restaurant'].map(restaurants_df.set_index('name')['restaurant_id'])

#simplify dataframe
reviews_df_collab = reviews_df[['usr_id','restaurant_id','rating']]

#keep rows that do not have 6 as rating
reviews_df_collab = reviews_df_collab[reviews_df_collab.rating != 6]

In [98]:
#check shape and look of dataframe
print(reviews_df_collab.shape)
reviews_df_collab.head()

(167344, 3)


Unnamed: 0,usr_id,restaurant_id,rating
0,29242,71,5.0
1,20980,71,5.0
2,3298,71,5.0
3,192,71,4.0
4,9719,71,3.0


In [99]:
reviews_df_collab

Unnamed: 0,usr_id,restaurant_id,rating
0,29242,71,5.0
1,20980,71,5.0
2,3298,71,5.0
3,192,71,4.0
4,9719,71,3.0
...,...,...,...
169234,23315,101,5.0
169235,15467,101,5.0
169236,6152,101,5.0
169237,18723,101,5.0


### Matrix Factorisation

In [100]:
#create reader opbject and input rating scale 
reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
data = Dataset.load_from_df(reviews_df_collab, reader)

#create train and test datasets from full data
train = data.build_full_trainset()
test = train.build_anti_testset()

##### Singular Value Decomposition (SVD)

In [120]:
#Build generic SVD and check its RMSE and MAE
svd = SVD(n_epochs=10)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.4926  0.5050  0.5063  0.5090  0.4971  0.5200  0.5056  0.5129  0.5013  0.5093  0.5059  0.0074  
MAE (testset)     0.2643  0.2683  0.2636  0.2718  0.2646  0.2759  0.2708  0.2689  0.2683  0.2673  0.2684  0.0036  
Fit time          1.82    1.38    1.45    1.19    1.20    1.14    1.18    1.19    1.17    1.17    1.29    0.20    
Test time         0.20    0.18    0.21    0.14    0.16    0.14    0.13    0.13    0.14    0.14    0.16    0.03    


In [121]:
print("Average MAE: ", np.average(results["test_mae"]))
print("Average RMSE: ", np.average(results["test_rmse"]))

Average MAE:  0.26837362347496485
Average RMSE:  0.5059204524656783


In [128]:
param_grid = {
  'n_factors': [30,50,80,100],
  'n_epochs': [20,30,40,60,80]
}
 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)
 
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.4369457884024457
{'n_factors': 100, 'n_epochs': 80}


In [129]:
# best hyperparameters
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']

In [130]:
# We'll use the  SVD algorithm.
svd = SVD(n_factors=best_factor, n_epochs=best_epoch)

In [131]:
# Train the algorithm on the trainset
svd.fit(train)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x211057fcb20>

In [132]:
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.4294  0.4473  0.4441  0.4346  0.4534  0.4163  0.4368  0.4220  0.4386  0.4431  0.4366  0.0109  
MAE (testset)     0.1668  0.1695  0.1704  0.1689  0.1734  0.1619  0.1705  0.1626  0.1698  0.1692  0.1683  0.0034  
Fit time          7.68    6.26    6.12    6.08    6.15    6.12    6.10    6.12    6.09    6.12    6.28    0.47    
Test time         0.13    0.10    0.10    0.10    0.10    0.10    0.10    0.10    0.10    0.10    0.10    0.01    


In [None]:
print("Average MAE: ", np.average(results["test_mae"]))
print("Average RMSE: ", np.average(results["test_rmse"]))

Average MAE:  0.17224326275393614
Average RMSE:  0.43607455902997083


In [None]:
predictions = svd.test(test)

In [None]:
#Data collector
from collections import defaultdict #data colector

In [None]:
def predict_top_n(predictions, userId, restaurants_df, reviews_df, n = 3):
    '''Return the top N restaurants to recommend for a user and the user's history'''
    #Peart I.: Surprise docomuntation
    
    #Check users
    usr_list = list(set([x[0] for x in predictions ]))


    #First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    #Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_n[uid] = user_ratings[: n ]
    
    #Tells how many restaurants the user has rated before
    user_data = reviews_df[reviews_df.usr_id == (userId)]
        
    #Data Frame with predictions. 
    preds_df = pd.DataFrame([(id, pair[0],pair[1]) for id, row in top_n.items() for pair in row],
                        columns=["usr_id" ,"restaurant_id","rat_pred"])
    
    
    # Return pred_usr, i.e. top N recommended restaurants with their details. 
    pred_usr = preds_df[preds_df["usr_id"] == (userId)].merge(restaurants_df[['restaurant_id','name','price_range','address','theme','quality_category']], how = 'left', left_on = 'restaurant_id', right_on = 'restaurant_id')
    pred_usr = pred_usr.drop_duplicates(subset=['name'])        
    #Return hist_usr as top N historically rated restaurants 
    hist_usr = reviews_df[reviews_df.usr_id == (userId) ].sort_values("rating", ascending = False).merge\
    (restaurants_df, how = 'left', left_on = 'restaurant_id', right_on = 'restaurant_id')
        
    if userId in usr_list: 
        print('User {0} has already rated {1} restaurants.'.format(userId, user_data.shape[0]))
    
    # alternative if userId is completly new
    else:
        popular_restaurant = list(restaurants_df[restaurants_df['popularity_category'] == 'Very Popular']['restaurant_id'])
        rand_recommended_restaurants = random.sample(popular_restaurant, n)

        for idx in range(len(rand_recommended_restaurants)):
            pred_usr.at[idx,'usr_id'] = userId
            pred_usr.at[idx,'rat_pred'] = idx
            pred_usr.at[idx,'name'] = restaurants_df.at[idx,'name']
            pred_usr.at[idx,'price_range'] = restaurants_df.at[idx,'price_range']
            pred_usr.at[idx,'address'] = restaurants_df.at[idx,'address']
            pred_usr.at[idx,'theme'] = restaurants_df.at[idx,'theme']
            pred_usr.at[idx,'quality_category'] = restaurants_df.at[idx,'quality_category']
            

    return hist_usr, pred_usr

In [None]:
preexisting_users = list(set([x[0] for x in predictions ]))
print(f"pre-existing users are between {min(preexisting_users)} and {max(preexisting_users)}")

pre-existing users are between 0 and 30653


In [None]:
#Check prediction HERE
hist_SVD, pred_SVD = predict_top_n(predictions, restaurants_df = restaurants_df, userId = 35, reviews_df= reviews_df, n=5)

User 35 has already rated 76 restaurants.


In [141]:
hist_SVD.head(3)

Unnamed: 0,restaurant,address_x,description_x,service,hours,total_revs,usr_rating,usr_id,dining_option,rating,...,service_options,theme,price_range,coordinates,longitude,latitude,quality_category,popularity_category,clean_description,tokens
0,Les Amis,"1 Scotts Rd, #01 - 16 Shaw Centre, Singapore 2...",$$$$,Service options: Dine-in · No takeaway · No de...,"Hours:\nFriday 12–2 pm, 7–9 pm\nSaturday 12–2 ...",619,"(6, 'NA')",35,A,6.0,...,Dine-in · No takeaway · No delivery,Haute French restaurant,$$$$,"(Les Amis, 1, Scotts Road, Orchard, Singapore,...",103.831312,1.306608,High,Average,haute french restaurant refined chandelier lit...,"[haute, french, restaurant, refined, chandelie..."
1,Lolla,"22 Ann Siang Rd, Singapore 069702",$$$,Service options: Dine-in · Takeaway · Delivery,,372,"(5.0, 'NA')",35,'NA,5.0,...,Dine-in · Takeaway · Delivery,Restaurant,$$$,"(Lolla, 22, Ann Siang Road, Chinatown, Outram,...",103.845626,1.281032,Average,Average,,[nan]
2,Amò,"33 Hongkong St, Singapore 059672",$$$,Service options: Dine-in · Kerbside pickup · N...,Reservations: tablecheck.com\n Providers,893,"(5.0, 'NA')",35,'NA,5.0,...,Dine-in · Kerbside pickup · No-contact delivery,Italian restaurant,$$$,"(33, Hongkong Street, Clarke Quay, Singapore R...",103.847114,1.287446,Average,Average,italian restaurant sleek italian eatery whippi...,"[italian, restaurant, sleek, italian, eatery, ..."


In [140]:
reviews_df[reviews_df['usr_id'] == 35].head(3)

Unnamed: 0,restaurant,address,description,service,hours,total_revs,usr_rating,usr_id,dining_option,rating,restaurant_id
1768,Les Amis,"1 Scotts Rd, #01 - 16 Shaw Centre, Singapore 2...",$$$$,Service options: Dine-in · No takeaway · No de...,"Hours:\nFriday 12–2 pm, 7–9 pm\nSaturday 12–2 ...",619,"(6, 'NA')",35,A,6.0,88
17111,Beast & Butterflies,"90 Robertson Quay, Level 1, Singapore 238259",$$,Service options: Dine-in · Takeaway · Delivery,Hours:\nSaturday 6:30 am–11:30 pm\nSunday\n(Lu...,680,"(2.0, 'NA')",35,'NA,2.0,12
18204,Beast & Butterflies,"90 Robertson Quay, Level 1, Singapore 238259",$$,Service options: Dine-in · Takeaway · Delivery,Hours:\nSaturday 6:30 am–11:30 pm\nSunday\n(Lu...,680,"(2.0, 'NA')",35,'NA,2.0,12


In [138]:
#Show Prediction
pred_SVD

Unnamed: 0,usr_id,restaurant_id,rat_pred,name,price_range,address,theme,quality_category
0,35,106,5.0,NAEUM Restaurant,$$,"161 Telok Ayer St, Singapore 068615",Restaurant,High
1,35,131,5.0,Restaurant Labyrinth,$$$,"8 Raffles Avenue, Esplanade, Mall, #02 - 23, S...",Singaporean restaurant,High
2,35,179,5.0,Zén,$$$$,"41 Bukit Pasoh Rd, Singapore 089855",Restaurant,Average
3,35,15,5.0,Blossom Restaurant,$$,"Marina Bay Sands Hotel, Lobby Tower 2, #01-05/...",Fine dining restaurant,High
4,35,116,5.0,Po,$$$,"320 Havelock Rd, Singapore 169628",Singaporean restaurant,Average
