In [3]:
#from bs4 import BeautifulSoup as bs
from lxml import html
import requests
import json
import pandas as pd
from unidecode import unidecode

In [4]:
def clean_xpath_res(l):
    """Collapse a list of XPath string results into one ASCII-transliterated string.

    The fragments in ``l`` are joined with newlines, surrounding whitespace is
    stripped from the joined text, and the result is passed through
    ``unidecode``.
    """
    joined = "\n".join(l)
    trimmed = joined.strip()
    return unidecode(trimmed)

class TrustPilotScraper:
    """Scraper for the paginated review listing of a Trustpilot company page.

    Parameters
    ----------
    url : str
        Base URL of the review listing, without a ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.
        Without a timeout a stalled connection would block the scrape forever.
    """

    # Number of reviews assumed per listing page when computing the page count.
    REVIEWS_PER_PAGE = 20

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout

    def _parse_review(self, review_block):
        """Extract the fields of a single review card.

        Parameters
        ----------
        review_block
            Parsed HTML element of one review card (must support ``.xpath``).

        Returns
        -------
        dict
            Keys ``rating_star``, ``date``, ``title``, ``review`` and
            ``n_reviews_customer`` (in that order). ``date`` is ``None`` when
            the card carries no date script node.
        """
        info = dict()
        xpath_rating = ".//div[@class='star-rating star-rating--medium']/img/@alt"
        info["rating_star"] = clean_xpath_res(review_block.xpath(xpath_rating))
        xpath_date = ".//div[@class='review-content-header__dates']/script/text()"
        date_nodes = review_block.xpath(xpath_date)
        # The dates are read from a JSON payload stored in a <script> node; a
        # card without that node must not abort the whole scrape.
        info["date"] = json.loads(date_nodes[0])["publishedDate"] if date_nodes else None
        xpath_title = ".//h2[@class='review-content__title']//text()"
        info["title"] = clean_xpath_res(review_block.xpath(xpath_title))
        xpath_review = ".//p[@class='review-content__text']//text()"
        info["review"] = clean_xpath_res(review_block.xpath(xpath_review))
        xpath_n_reviews = ".//div[@class='consumer-information__review-count']//text()"
        info["n_reviews_customer"] = clean_xpath_res(review_block.xpath(xpath_n_reviews))
        return info

    def _parse_page(self, page):
        """Return the list of parsed review dicts found on one listing page."""
        # The class attribute value (including its trailing spaces) is matched
        # literally by the XPath, so it is kept exactly as is.
        reviews = page.xpath("//div[@class='review-card  ']")
        return [self._parse_review(r) for r in reviews]

    def _url_page(self, n_page):
        """Return the URL of listing page number ``n_page``."""
        return f"{self.url}?page={n_page}"

    def _get_html(self, url):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        Raises whatever ``requests`` raises on connection problems, including
        a timeout after ``self.timeout`` seconds.
        """
        page_content = requests.get(url, timeout=self.timeout).content
        return html.fromstring(page_content)

    def get_number_reviews(self):
        """Return the total review count shown in the page headline.

        Raises
        ------
        ValueError
            If no digits can be found in the headline counter.
        """
        page_html = self._get_html(self.url)
        xpath_n_reviews = "//span[@class='headline__review-count']//text()"
        n_reviews_str = clean_xpath_res(page_html.xpath(xpath_n_reviews))
        # Keep digits only so any thousands separator (space, comma, dot, ...)
        # is ignored instead of breaking int().
        digits = "".join(ch for ch in n_reviews_str if ch.isdigit())
        if not digits:
            raise ValueError(
                f"Could not read the number of reviews from {self.url!r}: "
                f"got {n_reviews_str!r}"
            )
        return int(digits)

    def scrap_reviews(self):
        """Scrape every listing page and return all reviews as a list of dicts.

        Each dict is the output of ``_parse_review`` plus a ``page`` key
        holding the 1-based number of the page it came from.
        """
        info = []
        n_reviews = self.get_number_reviews()
        per_page = self.REVIEWS_PER_PAGE
        # Ceiling division: a trailing partial page still has to be fetched.
        # This is an upper bound; the site may expose fewer pages.
        n_pages = (n_reviews + per_page - 1) // per_page
        for n in range(1, n_pages + 1):
            url = self._url_page(n)
            page_html = self._get_html(url)
            page_info = self._parse_page(page_html)
            for i in page_info:
                i["page"] = n
            info += page_info
            print(f"Done with {url}")
        return info
    
    
def scrap_reviews(url, filename_save=""):
    """Scrape all reviews of a Trustpilot page into a de-duplicated DataFrame.

    Parameters
    ----------
    url : str
        Base URL of the Trustpilot review listing.
    filename_save : str or path-like, optional
        When non-empty, the resulting frame is also written there as UTF-8
        CSV. Missing parent directories are created first so a long scrape is
        not lost at the very last step.

    Returns
    -------
    pandas.DataFrame
        One row per distinct review (distinct on rating, date, title and
        text). Empty, with no columns, when nothing was scraped.
    """
    import os  # local import: only needed to prepare the output directory

    tps = TrustPilotScraper(url)
    info = tps.scrap_reviews()
    df = pd.DataFrame(info)
    # An empty frame has no columns, so drop_duplicates on a column subset
    # would raise KeyError; there is nothing to de-duplicate in that case.
    if not df.empty:
        df = df.drop_duplicates(["rating_star", "date", "title", "review"])
    if filename_save:
        if isinstance(filename_save, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(filename_save))
            if parent:
                os.makedirs(parent, exist_ok=True)
        df.to_csv(filename_save, encoding="utf-8")
    return df

In [8]:
# Review listing to scrape (Trustpilot French site, aroma-zone.com page).
url = "https://fr.trustpilot.com/review/www.aroma-zone.com"
# CSV path passed to scrap_reviews as filename_save (relative to the working directory).
filename = "intermediary_outputs/aromazone_trustpilot_v3.csv"
# Runs the full scrape (one HTTP request per listing page), writes the CSV
# and keeps the de-duplicated reviews in `df`.
df = scrap_reviews(url, filename)