# Web Scraping Google Reviews
***
<p style='text-align: justify;'> As in most projects, the data will not always be clean and ready for use. In this case, the data sources are the <b>Google reviews</b> and ratings of <i>Dumpling House</i> and <i>60 other restaurants</i> located in <i>Chinatown, Toronto</i>. There are two ways to access the data. The first is an <b>Application Programming Interface (API)</b>, such as the Yelp and Google APIs. An API can be accessed quite easily and quickly; however, those two resources are restricted to developers and business owners only. The other way is to use <b>web scraping</b> to collect raw, unstructured data. Web scraping leverages the properties of text-based markup languages to extract data from websites.</p>

***
## Project Set-Up
> - Importing all the **required libraries** for this project.

In [None]:
import pandas as pd
import numpy as np
import os
import re

from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver

pd.options.mode.chained_assignment = None

***
## Web Scraping Exercise
> <p style='text-align: justify;'> <b>Automating</b> the collection with a <i>web-crawling bot</i> makes it possible to gather a very large amount of data online while saving time: it takes only around 4 to 5 hours to run the web-scraping model below (with no human presence needed). </p>
>
> ***
> ### Scraping URLs
> First and foremost, we need to **scrape the URLs** of the 60 restaurants we want to analyze in Chinatown, Toronto. In this case, they are the restaurants closest to *The Dumpling House*.

In [None]:
def get_url(url_set, i, url_dict):
    """Append the name and URL of every search-result card to ``url_dict``.

    Args:
        url_set: Iterable of parsed result cards. Each card must offer a
            ``find(tag, class_=...)`` method (e.g. BeautifulSoup tags).
        i: Unused. Kept in the signature so existing calls keep working.
        url_dict: Dict holding two lists under the keys ``'Restaurant name'``
            and ``'Restaurant URL'``; both lists are extended in place.

    Returns:
        None. The results are written into ``url_dict``.
    """
    link_class = 'a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd'
    for result in url_set:
        # One lookup serves both attributes.
        link = result.find('a', class_=link_class)
        if link is None:
            # A card without the expected anchor has nothing to record;
            # skip it rather than fail on ``None[...]``.
            continue
        # Read both attributes before appending so the two lists always
        # keep the same length, even if one attribute is missing.
        result_name = link["aria-label"]
        result_url = link["href"]
        url_dict['Restaurant name'].append(result_name)
        url_dict['Restaurant URL'].append(result_url)

def turn_to_df(url_dict):
    """Save the collected restaurant URLs to ``data/df_url.csv`` and print them.

    Args:
        url_dict: Mapping of column name to list of values, as accepted by
            ``pd.DataFrame``.

    Returns:
        None. The table is written to disk and printed.
    """
    df_url = pd.DataFrame(url_dict)
    # ``to_csv`` does not create missing folders, so make sure the target
    # directory exists before writing.
    os.makedirs('data', exist_ok=True)
    df_url.to_csv('data/df_url.csv')
    print(df_url)

In [None]:
# WEB SCRAPING GOOGLE RESTAURANT URLs
'SELENIUM - BEAUTIFUL SOUP : Classic Web Scrapping'
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

url_dict = {'Restaurant name': [], 'Restaurant URL': []}
path = r"C:\Users\...\PROJECT\CHROMEDRIVER\chromedriver.exe"
# NOTE(review): passing the driver path positionally only works on older
# Selenium releases - confirm against the installed version.
driver = webdriver.Chrome(path)
timeout = 15  # seconds to wait for the page elements to become visible

area = "https://www.google.ca/maps/search/Restaurants/@43.6540536,-79.3996533,16.48z/data=!3m1!4b1!4m8!2m7!3m5!2sDumpling+House+Restaurant!3s0x882b34c396514419:0xd36007a96d7731be!4m2!1d-79.3986693!2d43.6538068!6e5"
driver.get(area)

# Absolute XPaths into the Google Maps page. They are tied to the page
# layout at the time of writing and will need updating if it changes.
xpath_page_ready = '/html/body/div[3]/div[9]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/button'
xpath_result_pane = '/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[2]/div[1]/div[3]/div/a'
xpath_next_page = '/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[2]/div[2]/div/div[1]/div/button[2]'

number_of_pages = 3             # result pages to walk through
total_number_of_restaurant = 4  # END key presses sent to the pane on each page

for i in range(1, number_of_pages + 1):
    # Wait until the page is rendered before touching the result pane.
    element_present = EC.visibility_of_all_elements_located((By.XPATH, xpath_page_ready))
    WebDriverWait(driver, timeout).until(element_present)
    sleep(2) # extra-cautiousness
    pane = driver.find_element(By.XPATH, xpath_result_pane)

    # Scroll to the bottom repeatedly so every restaurant of the page loads.
    # A throw-away loop variable keeps the page counter ``i`` intact.
    for _ in range(total_number_of_restaurant):
        pane.send_keys(Keys.END)
        sleep(1)

    response = BeautifulSoup(driver.page_source, 'html.parser')
    urls = response.find_all('div', class_='V0h1Ob-haAclf OPZbO-KE6vqe o0s21d-HiaYvf')

    get_url(urls, i, url_dict)

    # Only move on when another page is still to be scraped; clicking "next"
    # after the last page is pointless and fails if no further page exists.
    if i < number_of_pages:
        button_next = driver.find_element(By.XPATH, xpath_next_page)
        button_next.click()

turn_to_df(url_dict)
print("FINISHED")

>***
> ### Scraping Reviews
> Then we can proceed to scraping all the **Google Reviews** of each restaurant to build our dataset. 

In [None]:
def get_review_summary(result_set, rev_dict, name):
    """Record the rating, date and text of every review card in ``rev_dict``.

    Args:
        result_set: Iterable of parsed review cards. Each card must offer a
            ``find(tag, class_=...)`` method (e.g. BeautifulSoup tags).
        rev_dict: Dict holding lists under the keys ``'Review Rate'``,
            ``'Review Time'``, ``'Review Text'`` and ``'Restaurant name'``;
            each list receives one new entry per card.
        name: Restaurant name stored alongside every review.

    Returns:
        None. The results are written into ``rev_dict``.
    """
    for card in result_set:
        # All three lookups run before anything is appended, so a failing
        # lookup cannot leave the lists with different lengths.
        row = {
            'Review Rate': card.find('span', class_='ODSEW-ShBeI-H1e3jb')["aria-label"],
            'Review Time': card.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text,
            'Review Text': card.find('span', class_='ODSEW-ShBeI-text').text,
            'Restaurant name': name,
        }
        for column, value in row.items():
            rev_dict[column].append(value)

def turn_to_df(rev_dict):
    """Save the collected reviews to ``data/df_reviews.csv`` and return them.

    Args:
        rev_dict: Mapping of column name to list of values, as accepted by
            ``pd.DataFrame``.

    Returns:
        The DataFrame built from ``rev_dict``.
    """
    df_reviews = pd.DataFrame(rev_dict)
    # ``to_csv`` does not create missing folders, so make sure the target
    # directory exists before writing.
    os.makedirs('data', exist_ok=True)
    df_reviews.to_csv('data/df_reviews.csv')
    return df_reviews

In [None]:
# WEB SCRAPING GOOGLE REVIEWS
'SELENIUM - BEAUTIFUL SOUP : Classic Web Scrapping'  
from selenium.webdriver.support import expected_conditions as EC

rev_dict = {'Restaurant name' : [],'Review Rate': [],'Review Time': [],'Review Text' : []}
df_url = pd.read_csv('data/df_url.csv')

# XPaths into the Google Maps place page. They are tied to the page layout
# at the time of writing and will need updating if it changes.
xpath_reviews_button = '/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span[1]/span[2]/span[1]/button'
xpath_sort_button = '/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[2]/div[7]/div[2]/button'
xpath_newest_option = '/html/body/div[3]/div[3]/div[1]/ul/li[2]/div[3]/div[1]'
xpath_review_count = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]'
xpath_scroll_area = '//*[@id="pane"]/div/div[1]/div/div/div[2]'

# Number of rows of df_url to scrape, starting from the first one.
# Raise it (up to len(df_url)) to cover more restaurants.
number_of_restaurants = 1
# Divisor used to turn the review count into a number of scrolls.
# NOTE(review): presumably the page loads ten reviews at a time - confirm.
reviews_per_load = 10

for i in range(number_of_restaurants):
    gmaps_url = df_url.loc[i,"Restaurant URL"]
    driver.get(gmaps_url) # navigate to web page

    # Open the review list.
    element_present = EC.visibility_of_all_elements_located((By.XPATH, xpath_reviews_button))
    WebDriverWait(driver, timeout).until(element_present)
    button_rev = driver.find_element(By.XPATH, xpath_reviews_button)
    button_rev.click()
    sleep(2) # extra-cautiousness

    # Open the sort menu ...
    element_present = EC.visibility_of_all_elements_located((By.XPATH, xpath_sort_button))
    WebDriverWait(driver, timeout).until(element_present)
    button_sort = driver.find_element(By.XPATH, xpath_sort_button)
    button_sort.click()
    sleep(2)

    # ... and pick its second entry; the click goes through JavaScript
    # rather than a regular WebDriver click.
    button_newest = driver.find_element(By.XPATH, xpath_newest_option)
    driver.execute_script("arguments[0].click();", button_newest)
    sleep(1)

    # Find the total number of reviews: first word of the label, with any
    # thousands separators removed.
    total_number_of_reviews = driver.find_element(By.XPATH, xpath_review_count).text.split(" ")[0]
    total_number_of_reviews = int(total_number_of_reviews.replace(',', ''))

    # Find scroll layout
    scrollable_div = driver.find_element(By.XPATH, xpath_scroll_area)

    # Scroll as many times as necessary to load all reviews. The number of
    # loads is rounded UP: rounding to nearest stopped one scroll short for
    # counts such as 34 (2 scrolls instead of 3).
    number_of_loads = (total_number_of_reviews + reviews_per_load - 1) // reviews_per_load
    for j in range(number_of_loads - 1):  # the first batch is already on screen
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
        sleep(3)

    response = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = response.find_all('div', class_='ODSEW-ShBeI NIyLF-haAclf gm2-body-2')

    name = df_url.loc[i,"Restaurant name"]
    get_review_summary(reviews, rev_dict, name)

turn_to_df(rev_dict)
print("FINISHED")

***