In [1]:
from datetime import datetime, timedelta

import pandas as pd

import math 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

import time

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer #Tokenizer that takes only alphanumerical characters (no punctuation)
from nltk.util import ngrams

from bs4 import BeautifulSoup
import requests

In [100]:
def take_french_review(review_text_object):
    """Return only the French part of a review text.

    The marker "(Traduit par Google) " and every non-breaking space
    ("\\xa0") are removed from the text. When the text starts with
    "(Traduit", everything from "(Avis d'origine)" onwards is dropped as
    well, so only the translated part is kept.

    Input: review_text_object, the raw review text (a string)
    Output: the cleaned string
    """
    # Decide on the raw text first: the marker is removed by the cleaning below.
    starts_with_translation = review_text_object[:8] == "(Traduit"

    without_marker = review_text_object.replace("(Traduit par Google) ", "")
    cleaned = without_marker.replace("\xa0", "")

    if not starts_with_translation:
        return cleaned

    french_part, *_ = cleaned.split("(Avis d'origine)")
    return french_part
    
    
def find_nb_scroll(N, reviews_per_page=10):
    """Return the number of scrolls needed to load all N reviews.

    Assumes the first `reviews_per_page` reviews are already displayed and
    that each scroll loads up to `reviews_per_page` more, so the result is
    ceil(N / reviews_per_page) - 1, never negative.

    Input:
        N: int, total number of reviews that were left.
        reviews_per_page: int > 0, reviews loaded per batch (default 10).
    Output: int, number of scrolls (0 when N <= reviews_per_page, including
        N == 0, where the previous float formula returned -1).
    """
    if N <= 0:
        return 0
    # Integer arithmetic: (N - 1) // size == ceil(N / size) - 1 for N >= 1.
    return int((N - 1) // reviews_per_page)
    
    
def get_review_summary(result_set):
    """Build a one-row-per-review summary table.

    Input: result_set, an iterable of parsed review elements; each must
        expose `find(name, class_=...)` (e.g. the BeautifulSoup tags of the
        review containers).
    Output: pandas DataFrame with the columns 'Review Rate',
        'Review Time Amount', 'Review Time Period' and 'Review Text'.

    Extraction is best effort: any field that cannot be read or parsed is
    stored as an empty string "", so numeric columns may have mixed types.
    Only `Exception` subclasses are swallowed; KeyboardInterrupt and
    SystemExit propagate so the run can still be interrupted.
    """
    rev_dict = {
        'Review Rate': [],
        'Review Time Amount': [],
        'Review Time Period': [],
        'Review Text': [],
    }
    for result in result_set:
        # Rating: the character at index 7 of the aria-label is read as the rate.
        try:
            review_rate = int(result.find('span', class_='Fam1ne EBe2gf')["aria-label"][7])
        except Exception:
            review_rate = ""

        # Relative date: skip the first 7 characters and normalise
        # non-breaking spaces. Looked up once and shared by both fields.
        try:
            elapsed_text = result.find('span', class_='dehysf lTi8oc').text[7:].replace('\xa0', ' ')
        except Exception:
            elapsed_text = None  # both fields below then fall back to ""

        # Amount: first token, with the articles "une"/"un" read as 1.
        try:
            review_time_amount = int(elapsed_text.replace('une', '1').replace('un', '1').split()[0])
        except Exception:
            review_time_amount = ""

        # Period: second token (the unit word).
        try:
            review_time_period = elapsed_text.replace('une', '1').split()[1]
        except Exception:
            review_time_period = ""

        try:
            review_text = result.find('span', class_='review-full-text').text
            review_text = take_french_review(review_text)
        except Exception:
            review_text = ""

        rev_dict['Review Rate'].append(review_rate)
        rev_dict['Review Time Amount'].append(review_time_amount)
        rev_dict['Review Time Period'].append(review_time_period)
        rev_dict['Review Text'].append(review_text)
    return pd.DataFrame(rev_dict)

In [None]:
# Parameters
# Google search results URL for the query "tucana madrid"; it is the page the
# scraping cell opens with the web driver.
# NOTE(review): the `#lrd=...` fragment presumably makes the page open the
# reviews lightbox directly — confirm before swapping in another URL.

url = 'https://www.google.com/search?q=tucana+madrid&rlz=1C5CHFA_enFR941FR941&oq=tucana&aqs=chrome.0.0i355i512j46i175i199i512j69i57j0i512l5j46i512j0i512.1709j0j7&sourceid=chrome&ie=UTF-8#lrd=0xd4229cd32d0921f:0x737ebabee9e57197,1,,,'


In [127]:
# Scrape the reviews lightbox: open the page, load every review by scrolling,
# then parse the final HTML into `review_summary`.

# Location of the chromedriver binary.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = '/Users/manuel/Documents/GitHub/portfolio/Reviews project/chromedriver'
driver = webdriver.Chrome(service=Service(path))

try:
    driver.get(url)
    # Click the first button of the dialog shown on load
    # (presumably the consent prompt — confirm).
    driver.find_element(By.XPATH,'/html/body/div[3]/div[3]/span/div/div/div/div[3]/div[1]/button[1]').click()
    time.sleep(12)  # fixed pause before reading the lightbox content

    # The first space-separated token of this label is the total review count.
    nb_reviews_label = driver.find_element(By.XPATH, '/html/body/span[2]/g-lightbox/div/div[2]/div[3]/span/div/div/div/div[1]/div[3]/div[1]/div/span/span').text
    nb_reviews = int(nb_reviews_label.split(" ")[0])

    scrollable_div = driver.find_element(By.XPATH,'/html/body/span[2]/g-lightbox/div/div[2]/div[3]/span/div/div/div/div[2]')

    # Scroll the review pane to its bottom once per batch still to be loaded.
    for _ in range(find_nb_scroll(nb_reviews)):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight',
                              scrollable_div)
        time.sleep(3)  # fixed pause between scrolls

    # Snapshot the fully loaded page before the browser is closed.
    response = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always release the browser and chromedriver process, even on failure.
    driver.quit()

reviews = response.find_all('div', class_ = 'WMbnJf vY6njf gws-localreviews__google-review')

review_summary = get_review_summary(reviews)
review_summary.head()

In [187]:
# Number of reviews collected (rows of the summary table).
review_summary.shape[0]

121

In [188]:
# Turn the relative age of each review ("3 mois", "1 semaine", ...) into an
# estimated calendar date, then keep only the columns used afterwards.
# NOTE: this cell rebinds `review_summary` to the 3-column frame, so it can
# only run once per scrape; re-run the scraping cell before re-running it.

# Approximate length, in days, of each French period label.
period_in_days = review_summary['Review Time Period'].map({'semaine': 7, 'semaines': 7, 'mois': 30, 'jour': 1, 'jours':1, 'heures': 0, 'heure': 0, 'ans': 365, 'an': 365})

# Amounts that failed to parse are stored as ""; coerce them to NaN so they
# yield a missing date (NaT) instead of raising.
time_amount = pd.to_numeric(review_summary['Review Time Amount'], errors='coerce')

# Today at midnight, kept as a 'YYYY-MM-DD' string.
today = datetime.today().strftime('%Y-%m-%d')

# Vectorized subtraction: NaN day counts (unparsed amount or unknown period
# label) become NaT rather than crashing timedelta().
estimated_review_date = pd.to_datetime(today) - pd.to_timedelta(time_amount * period_in_days, unit='D')

review_summary = review_summary.assign(**{'Estimated Review Date': estimated_review_date})[['Estimated Review Date', 'Review Rate', 'Review Text']]

review_summary.head()

Unnamed: 0,Estimated Review Date,Review Rate,Review Text
0,2022-05-26,5,Vraiment une bonne trouvaille.Très bonne ambia...
1,2022-10-16,4,"Bon restaurant à Madrid, avec une cuisine fusi..."
2,2022-07-25,4,Site recommandé. Je ne lui donne pas 5 étoiles...
3,2022-05-26,5,"Excellent dîner, bel endroit, ambiance avec mu..."
4,2022-10-02,3,"La nourriture était très bonne, mais le servic..."


In [189]:
# Count the missing values in each column of the summary table.
review_summary.isna().sum(axis=0)

Estimated Review Date    0
Review Rate              0
Review Text              0
dtype: int64