In [62]:
from datetime import datetime, timedelta

import pandas as pd

import re

import math 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

import time

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer #Tokenizer that takes only alphanumerical characters (no punctuation)
from nltk.util import ngrams

from bs4 import BeautifulSoup
import requests

import spacy
nlp = spacy.load("fr_core_news_sm")

from collections import Counter

import time

In [75]:
def take_french_review(review_text_object):
    """Return the French part of a raw review string.

    The "(Traduit par Google) " banner and non-breaking spaces are removed
    from the text. When the raw text begins with "(Traduit", everything from
    the "(Avis d'origine)" marker onwards is dropped as well.
    """
    cleaned = review_text_object.replace("(Traduit par Google) ", "").replace("\xa0", "")
    if review_text_object[:8] != "(Traduit":
        # Not flagged as a translation: the cleaned text is returned whole
        return cleaned
    # Keep only what precedes the original-language section
    french_part, _, _ = cleaned.partition("(Avis d'origine)")
    return french_part
    
    
def find_nb_scroll(N):
    """Return the number of scrolls needed to load N reviews.

    N is the total number of reviews (an int). The result is
    ceil(N / 10) - 1, which corresponds to batches of 10 reviews with the
    first batch not requiring a scroll.
    """
    return math.ceil(N / 10) - 1
    
    
def get_review_summary(result_set):
    """Build a DataFrame with one row per scraped review.

    Parameters
    ----------
    result_set : iterable
        Review elements exposing a BeautifulSoup-style
        ``find(name, class_=...)`` method (e.g. the output of ``find_all``).

    Returns
    -------
    pandas.DataFrame
        Columns 'Review Rate', 'Review Time Amount', 'Review Time Period'
        and 'Review Text'. A field that cannot be parsed is stored as the
        empty string "" (not NaN), so ``isna()`` will not report it.
    """
    # Failures expected from missing or oddly shaped markup. Anything else
    # (KeyboardInterrupt, genuine programming errors, ...) is allowed to
    # propagate instead of being silently swallowed.
    parse_errors = (AttributeError, TypeError, KeyError, IndexError, ValueError)

    rev_dict = {
        'Review Rate': [],
        'Review Time Amount': [],
        'Review Time Period': [],
        'Review Text': [],
    }
    for result in result_set:
        try:
            # The rating digit is read at index 7 of the aria-label string
            review_rate = int(result.find('span', class_='Fam1ne EBe2gf')["aria-label"][7])
        except parse_errors:
            review_rate = ""

        # Look the relative-date span up once: the first 7 characters are
        # skipped, 'une' is rewritten to '1', and the rest is split into
        # [amount, period, ...].
        try:
            age_tokens = (
                result.find('span', class_='dehysf lTi8oc').text[7:]
                .replace('\xa0', ' ')
                .replace('une', '1')
                .split()
            )
        except parse_errors:
            age_tokens = []

        try:
            # 'un' is rewritten to '1' for the amount only, as before
            review_time_amount = int(age_tokens[0].replace('un', '1'))
        except parse_errors:
            review_time_amount = ""
        try:
            review_time_period = age_tokens[1]
        except parse_errors:
            review_time_period = ""

        try:
            review_text = take_french_review(
                result.find('span', class_='review-full-text').text
            )
        except parse_errors:
            review_text = ""

        rev_dict['Review Rate'].append(review_rate)
        rev_dict['Review Time Amount'].append(review_time_amount)
        rev_dict['Review Time Period'].append(review_time_period)
        rev_dict['Review Text'].append(review_text)
    return pd.DataFrame(rev_dict)


def concat_strings(df, sep=" "):
    """Join every entry of df["Review Text"] into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Review Text" column.
    sep : str, optional
        Separator inserted between reviews. Defaults to a single space so
        that the last word of one review is not fused with the first word of
        the next one; pass "" to get a raw concatenation.

    Returns
    -------
    str
        The joined text; empty or non-string entries are skipped.
    """
    texts = df["Review Text"].tolist()
    # str.join builds the result in one pass instead of repeated `+`
    return sep.join(text for text in texts if isinstance(text, str) and text)


def lemmatize(sentence):
    """Run the module-level `nlp` pipeline on `sentence` and return the
    `lemma_` of every token, in order, as a list."""
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return lemmas


def review_rate_summary(df):
    """Summarise how often each rating occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Review Rate' column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct rating, sorted by rating in descending order,
        with columns:
        'grade'       - the rating value,
        'Review Rate' - number of reviews with that rating,
        'pct_grade'   - that number divided by len(df), rounded to 2 decimals.
    """
    counts = df['Review Rate'].value_counts()
    # Build the frame explicitly rather than via reset_index(): the column
    # names produced by value_counts().reset_index() differ between pandas
    # 1.x ('index', 'Review Rate') and 2.x ('Review Rate', 'count').
    summary = pd.DataFrame({
        'grade': counts.index.to_numpy(),
        'Review Rate': counts.to_numpy(),
    })
    summary['pct_grade'] = (summary['Review Rate'] / len(df)).round(2)
    return summary.sort_values(by=['grade'], ascending=False)




def return_POS_tokens(tok_list):
    """Group tokens by the part-of-speech tag the `nlp` pipeline gives them.

    Each token is tagged on its own (``nlp(token)[0].pos_``), i.e. without
    sentence context.

    Parameters
    ----------
    tok_list : iterable of str
        Tokens to classify; duplicates are kept in the output.

    Returns
    -------
    dict
        Keys 'adv_list', 'verb_list', 'adj_list' and 'noun_list', each
        mapping to the tokens tagged ADV, VERB, ADJ and NOUN respectively,
        in input order. Tokens with any other tag are dropped. All four keys
        are present even when `tok_list` is empty.
    """
    # Created before the loop so that an empty input still returns a dict
    lists_dict = {'adv_list': [],
                  'verb_list': [],
                  'adj_list': [],
                  'noun_list': []}
    key_by_pos = {'ADV': 'adv_list',
                  'VERB': 'verb_list',
                  'ADJ': 'adj_list',
                  'NOUN': 'noun_list'}

    # A token is tagged in isolation, so its tag only depends on the token
    # text (assuming the pipeline is deterministic): tag each distinct token
    # once instead of once per occurrence.
    pos_by_token = {}
    for token in tok_list:
        if token not in pos_by_token:
            doc = nlp(token)
            # An empty doc has no first token to read a tag from
            pos_by_token[token] = doc[0].pos_ if len(doc) else None
        key = key_by_pos.get(pos_by_token[token])
        if key is not None:
            lists_dict[key].append(token)
    return lists_dict


def analyaze_nlp_reviews(df):
    """Clean, lemmatise and count the most frequent n-grams of the reviews.

    The review texts are merged with concat_strings, punctuation is replaced
    by spaces, the text is lower-cased and lemmatised with lemmatize. Lemmas
    that are NLTK French stop words are dropped; whitespace and '*'
    characters are stripped from the rest, and empty strings as well as the
    lemmas 'être' and 'avoir' are discarded.

    Returns a dict with:
    'output'              - the cleaned list of lemmas,
    'unigram_most_common' - the 20 most frequent lemmas with their counts,
    'bigram_most_common'  - same for pairs of consecutive lemmas,
    'trigram_most_common' - same for triples of consecutive lemmas.
    """
    # Very frequent lemmas that are excluded from the counts
    ignored_lemmas = ['être', 'avoir']

    # Merge the reviews, swap punctuation for spaces, then lower-case
    text = re.sub(r'[^\w\s]', " ", concat_strings(df)).lower()

    lemmas = lemmatize(text)

    # Set of French stop words
    french_stop_words = set(stopwords.words('french'))

    output = []
    for lemma in lemmas:
        if lemma in french_stop_words:
            continue
        stripped = re.sub(r'[\s*]', '', lemma)
        if stripped == '' or stripped in ignored_lemmas:
            continue
        output.append(stripped)

    def top_20(items):
        # Most frequent first, truncated to 20 entries
        return Counter(items).most_common()[:20]

    return {'output': output,
            'unigram_most_common': top_20(output),
            'bigram_most_common': top_20(list(nltk.bigrams(output))),
            'trigram_most_common': top_20(list(nltk.ngrams(output, 3)))}



def load_data(url, limit,
              chromedriver_path='/Users/manuel/Documents/GitHub/portfolio/Reviews project/chromedriver'):
    """Scrape the reviews shown at `url` and return them as a DataFrame.

    Parameters
    ----------
    url : str
        Page to open in Chrome.
    limit : int
        Maximum number of scrolls performed on the reviews panel.
    chromedriver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'Estimated Review Date', 'Review Rate' and 'Review Text'.
        The date is today (at midnight) minus amount x days-per-period; it is
        NaT when the amount or the period could not be interpreted.

    Notes
    -----
    The element lookups use absolute XPaths and CSS class names, so they
    presumably break whenever the page layout changes (TODO confirm they are
    still valid). The browser is always closed, even if scraping fails.
    """
    # Set up chromedriver
    driver = webdriver.Chrome(service=Service(chromedriver_path))
    try:
        # Go to reviews URL and click on refuse cookies
        driver.get(url)
        driver.find_element(By.XPATH, '/html/body/div[3]/div[3]/span/div/div/div/div[3]/div[1]/button[1]').click()
        time.sleep(12)

        # Get number of reviews (first space-separated token, with the
        # narrow no-break space removed before conversion to int)
        nb_reviews = driver.find_element(By.XPATH, '/html/body/span[2]/g-lightbox/div/div[2]/div[3]/span/div/div/div/div[1]/div[3]/div[1]/div/span/span').text
        nb_reviews = int(nb_reviews.split(" ")[0].replace('\u202f', ''))

        # Find scrollable element and scroll to load all reviews
        scrollable_div = driver.find_element(By.XPATH, '/html/body/span[2]/g-lightbox/div/div[2]/div[3]/span/div/div/div/div[2]')

        for _ in range(min(find_nb_scroll(nb_reviews), limit)):
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight',
                                  scrollable_div)
            time.sleep(3)

        page_source = driver.page_source
    finally:
        # Release the browser process whatever happened above
        driver.quit()

    # Parse the html, find the review elements and turn them into a DataFrame
    response = BeautifulSoup(page_source, 'html.parser')
    reviews = response.find_all('div', class_='WMbnJf vY6njf gws-localreviews__google-review')
    review_summary = get_review_summary(reviews)

    # Compute the estimated date of each review from its relative age.
    # Unknown periods map to NaN and unparsable amounts are coerced to NaN,
    # which yields NaT instead of raising.
    days_per_period = {'semaine': 7, 'semaines': 7, 'mois': 30,
                       'jour': 1, 'jours': 1,
                       'heures': 0, 'heure': 0,
                       'minutes': 0, 'minute': 0,
                       'ans': 365, 'an': 365}
    period_days = review_summary['Review Time Period'].map(days_per_period)
    amounts = pd.to_numeric(review_summary['Review Time Amount'], errors='coerce')
    review_age = pd.to_timedelta(amounts * period_days, unit='D')

    today = pd.to_datetime(datetime.today().strftime('%Y-%m-%d'))
    review_summary['Estimated Review Date'] = today - review_age

    review_summary = review_summary[['Estimated Review Date', 'Review Rate', 'Review Text']]

    return review_summary

# Load data

In [136]:
# Parameters: page to scrape and scroll cap; start/end record the wall-clock time of the scrape
start = time.time()
url = 'https://www.google.com/search?q=bouillon+chartier&rlz=1C5CHFA_enFR941FR941&ei=BLleY82yGNmllwSqgJqAAQ&ved=0ahUKEwjNxI3VwYj7AhXZ0oUKHSqABhAQ4dUDCBA&uact=5&oq=bouillon+chartier&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQAzIKCC4QxwEQrwEQQzIKCC4QxwEQrwEQQzIECAAQQzILCC4QgAQQxwEQrwEyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQ6CggAEEcQ1gQQsAM6BwgAELADEEM6EgguEMcBEK8BEMgDELADEEMYAToVCC4QxwEQrwEQ1AIQyAMQsAMQQxgBOgwILhDIAxCwAxBDGAE6BQguEIAESgQIQRgASgQIRhgBUMkDWIsKYI0LaAFwAXgAgAGTAYgBgweSAQMwLjeYAQCgAQHIARTAAQHaAQYIARABGAg&sclient=gws-wiz-serp#lrd=0x47e66e3e7dad1be3:0xf5848e4836f69968,1,,,'
data = load_data(url, limit= 100)
# limit is the maximum number of scrolls that will be performed (1 scroll = 10 reviews)
end = time.time()

# Preview the first rows of the scraped reviews.
# The elapsed time (end - start, in seconds) is printed in the next cell.
data.head()

Unnamed: 0,Estimated Review Date,Review Rate,Review Text
0,2022-06-02,4,Expérience à vivre ! Après plusieurs tentative...
1,2022-02-02,5,
2,2022-08-31,5,On ne fait plus leur renommée. Pas de réservat...
3,2022-09-30,3,
4,2022-08-01,4,Bon resto pour manger des plats français simpl...


In [137]:
# Wall-clock duration of the scraping cell, in seconds (difference of two time.time() readings)
print("The time of execution of above program is :",
      (end-start), "s")

The time of execution of above program is : 321.19885897636414 s


# Summary

In [141]:
# Count and share of each star rating over all scraped reviews
rate_summary = review_rate_summary(data)
rate_summary.head()

Unnamed: 0,grade,Review Rate,pct_grade
0,5,65,0.54
1,4,35,0.29
2,3,13,0.11
3,2,4,0.03
4,1,3,0.02


In [142]:
# Number of reviews actually scraped (rows of data)
len(data)

120

In [140]:
# Missing values per column.
# NOTE(review): get_review_summary stores "" (not NaN) for fields it cannot
# parse, so empty ratings/texts are not counted here.
data.isna().sum()

Estimated Review Date    0
Review Rate              0
Review Text              0
dtype: int64

# Overall analysis

## Most recurrent tokens

In [143]:
# Run the n-gram analysis on every review, then show the 20 most frequent lemmas
nlp_analysis = analyaze_nlp_reviews(data)
nlp_analysis['unigram_most_common']

[('très', 74),
 ('bon', 45),
 ('prix', 44),
 ('serveur', 42),
 ('table', 37),
 ('plat', 34),
 ('faire', 32),
 ('tout', 32),
 ('cela', 28),
 ('attente', 27),
 ('peu', 26),
 ('manger', 22),
 ('plus', 22),
 ('si', 22),
 ('qualité', 21),
 ('service', 21),
 ('pouvoir', 20),
 ('2', 20),
 ('vraiment', 19),
 ('bien', 18)]

## Most recurrent bi-grams

In [121]:
# 20 most frequent pairs of consecutive lemmas over all reviews.
# NOTE(review): the saved output looks stale — this cell was last executed
# before the cells that load `data` and compute `nlp_analysis`; re-run it.
nlp_analysis['bigram_most_common']

[(('service', 'client'), 4),
 (('file', 'attente'), 3),
 (('chez', 'jour'), 3),
 (('salade', 'vouloir'), 2),
 (('intoxication', 'alimentaire'), 2),
 (('alimentaire', 'non'), 2),
 (('faire', 'queue'), 2),
 (('passer', 'commande'), 2),
 (('plus', 'rien'), 2),
 (('maïs', 'carotte'), 2),
 (('salade', '12'), 2),
 (('fruit', 'rouge'), 2),
 (('savoir', 'si'), 2),
 (('tant', 'pis'), 2),
 (('ingredient', 'frais'), 2),
 (('super', 'personnel'), 2),
 (('grand', 'amoureux'), 1),
 (('amoureux', 'chaîne'), 1),
 (('chaîne', 'pouvoir'), 1),
 (('pouvoir', 'dire'), 1)]

## Most recurrent trigrams

In [122]:
# 20 most frequent triples of consecutive lemmas over all reviews.
# NOTE(review): the saved output looks stale — this cell was last executed
# before the cells that load `data` and compute `nlp_analysis`; re-run it.
nlp_analysis['trigram_most_common']

[(('intoxication', 'alimentaire', 'non'), 2),
 (('grand', 'amoureux', 'chaîne'), 1),
 (('amoureux', 'chaîne', 'pouvoir'), 1),
 (('chaîne', 'pouvoir', 'dire'), 1),
 (('pouvoir', 'dire', 'dernier'), 1),
 (('dire', 'dernier', 'fois'), 1),
 (('dernier', 'fois', 'mets'), 1),
 (('fois', 'mets', 'pied'), 1),
 (('mets', 'pied', 'encore'), 1),
 (('pied', 'encore', 'fois'), 1),
 (('encore', 'fois', 'pari'), 1),
 (('fois', 'pari', 'personnel'), 1),
 (('pari', 'personnel', 'envier'), 1),
 (('personnel', 'envier', 'travailler'), 1),
 (('envier', 'travailler', 'tomber'), 1),
 (('travailler', 'tomber', 'personne'), 1),
 (('tomber', 'personne', 'faire'), 1),
 (('personne', 'faire', 'salade'), 1),
 (('faire', 'salade', 'vouloir'), 1),
 (('salade', 'vouloir', 'absolument'), 1)]

### POS Tagging

In [123]:
# Split the cleaned lemmas into adverbs / verbs / adjectives / nouns.
# NOTE(review): last executed before the current `nlp_analysis` was computed,
# so `lists_by_word_category` in the saved state presumably reflects an
# earlier dataset; re-run after the analysis cell.
lists_by_word_category = return_POS_tokens(nlp_analysis['output'])

In [124]:
# 10 most frequent tokens tagged as adverbs.
# NOTE(review): saved output predates the current data load; re-run to refresh.
Counter(lists_by_word_category['adv_list']).most_common()[:10]

[('plus', 14),
 ('très', 8),
 ('beaucoup', 8),
 ('bien', 7),
 ('client', 7),
 ('vraiment', 5),
 ('non', 4),
 ('donc', 4),
 ('peu', 4),
 ('encore', 3)]

In [125]:
# 10 most frequent tokens tagged as verbs.
# NOTE(review): saved output predates the current data load; re-run to refresh.
Counter(lists_by_word_category['verb_list']).most_common()[:10]

[('pouvoir', 8),
 ('payer', 8),
 ('ingrédient', 7),
 ('passer', 7),
 ('commande', 7),
 ('manger', 6),
 ('vouloir', 5),
 ('prendre', 5),
 ('restaurant', 5),
 ('déjeuner', 5)]

In [126]:
# 10 most frequent tokens tagged as adjectives.
# NOTE(review): saved output predates the current data load; re-run to refresh.
Counter(lists_by_word_category['adj_list']).most_common()[:10]

[('salade', 14),
 ('bon', 8),
 ('bureau', 3),
 ('sauce', 3),
 ('poulet', 3),
 ('file', 3),
 ('premier', 3),
 ('chaîne', 2),
 ('dernier', 2),
 ('veux', 2)]

In [127]:
# 10 most frequent tokens tagged as nouns.
# NOTE(review): saved output predates the current data load; re-run to refresh.
Counter(lists_by_word_category['noun_list']).most_common()[:10]

[('jour', 8),
 ('frais', 7),
 ('service', 7),
 ('personnel', 6),
 ('jus', 5),
 ('fois', 4),
 ('boisson', 4),
 ('produit', 4),
 ('repas', 3),
 ('qualité', 3)]

# Bad reviews analysis

In [144]:
# Keep the reviews rated 1 or 2 stars.
# NOTE(review): get_review_summary stores "" for ratings it cannot parse; the
# numeric comparison below presumably fails if any such row is present.
bad_reviews = data[data['Review Rate'] <= 2]

# Same n-gram analysis as above, restricted to the bad reviews
bad_reviews_analysis = analyaze_nlp_reviews(bad_reviews)
bad_reviews_analysis['unigram_most_common']

[('table', 10),
 ('si', 5),
 ('manger', 4),
 ('tout', 4),
 ('repas', 3),
 ('sec', 3),
 ('serveur', 3),
 ('cadre', 3),
 ('sans', 3),
 ('cela', 3),
 ('où', 3),
 ('prix', 3),
 ('très', 3),
 ('deux', 3),
 ('20min', 2),
 ('bouteille', 2),
 ('presque', 2),
 ('remplir', 2),
 ('mayonnaise', 2),
 ('fin', 2)]

In [145]:
# 20 most frequent pairs of consecutive lemmas in the bad reviews
bad_reviews_analysis['bigram_most_common']

[(('table', 'deux'), 2),
 (('table', 'quatre'), 2),
 (('experience', 'horrible'), 1),
 (('horrible', '20min'), 1),
 (('20min', 'ouvrir'), 1),
 (('ouvrir', 'bouteille'), 1),
 (('bouteille', 'rosé'), 1),
 (('rosé', 'temps'), 1),
 (('temps', 'manger'), 1),
 (('manger', 'entrée'), 1),
 (('entrée', 'presque'), 1),
 (('presque', 'finir'), 1),
 (('finir', 'repas'), 1),
 (('repas', 'repas'), 1),
 (('repas', 'pâte'), 1),
 (('pâte', 'légume'), 1),
 (('légume', 'remplir'), 1),
 (('remplir', 'huile'), 1),
 (('huile', 'demander'), 1),
 (('demander', 'mayonnaise'), 1)]

In [146]:
# 20 most frequent triples of consecutive lemmas in the bad reviews
bad_reviews_analysis['trigram_most_common']

[(('experience', 'horrible', '20min'), 1),
 (('horrible', '20min', 'ouvrir'), 1),
 (('20min', 'ouvrir', 'bouteille'), 1),
 (('ouvrir', 'bouteille', 'rosé'), 1),
 (('bouteille', 'rosé', 'temps'), 1),
 (('rosé', 'temps', 'manger'), 1),
 (('temps', 'manger', 'entrée'), 1),
 (('manger', 'entrée', 'presque'), 1),
 (('entrée', 'presque', 'finir'), 1),
 (('presque', 'finir', 'repas'), 1),
 (('finir', 'repas', 'repas'), 1),
 (('repas', 'repas', 'pâte'), 1),
 (('repas', 'pâte', 'légume'), 1),
 (('pâte', 'légume', 'remplir'), 1),
 (('légume', 'remplir', 'huile'), 1),
 (('remplir', 'huile', 'demander'), 1),
 (('huile', 'demander', 'mayonnaise'), 1),
 (('demander', 'mayonnaise', 'grenaille'), 1),
 (('mayonnaise', 'grenaille', 'mayonnaise'), 1),
 (('grenaille', 'mayonnaise', 'déjà'), 1)]

In [147]:
# Split the bad-review lemmas into adverbs / verbs / adjectives / nouns
bad_lists_by_word_category = return_POS_tokens(bad_reviews_analysis['output'])

In [148]:
# 10 most frequent bad-review tokens tagged as adverbs
Counter(bad_lists_by_word_category['adv_list']).most_common()[:10]

[('tout', 4),
 ('très', 3),
 ('presque', 2),
 ('client', 2),
 ('déjà', 1),
 ('vite', 1),
 ('correctement', 1),
 ('alors', 1),
 ('allaient', 1),
 ('bouger', 1)]

In [149]:
# 10 most frequent bad-review tokens tagged as adjectives
Counter(bad_lists_by_word_category['adj_list']).most_common()[:10]

[('sec', 3),
 ('petit', 2),
 ('cher', 2),
 ('horrible', 1),
 ('grenaille', 1),
 ('servir', 1),
 ('sanitaire', 1),
 ('nouveau', 1),
 ('serveuse', 1),
 ('aimable', 1)]

In [150]:
# 10 most frequent bad-review tokens tagged as verbs
Counter(bad_lists_by_word_category['verb_list']).most_common()[:10]

[('manger', 4),
 ('cadre', 3),
 ('remplir', 2),
 ('inconnu', 2),
 ('passer', 2),
 ('partager', 2),
 ('ouvrir', 1),
 ('rosé', 1),
 ('finir', 1),
 ('demander', 1)]

In [135]:
# 10 most frequent bad-review tokens tagged as nouns.
# NOTE(review): the saved output looks stale — this cell was last executed
# before `bad_lists_by_word_category` was recomputed; re-run to refresh.
Counter(bad_lists_by_word_category['noun_list']).most_common()[:10]

[('jour', 6),
 ('service', 5),
 ('boisson', 4),
 ('fois', 3),
 ('qualité', 3),
 ('date', 3),
 ('constat', 3),
 ('personnel', 2),
 ('problème', 2),
 ('faim', 2)]