In [1]:
# Import required modules
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from itertools import chain

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def generateCoverPageLink(url, reviewsPerPage=20):
    """Build the paginated listing URLs for a company's review page.

    Fetches ``url``, reads the total review count from the
    ``headline__review-count`` span and returns one ``?page=N`` URL for
    every listing page needed to cover that many reviews.

    Parameters
    ----------
    url : str
        Base URL of the company's review listing.
    reviewsPerPage : int, default 20
        Number of reviews assumed per listing page (20 is the divisor the
        page count has always been derived from).

    Returns
    -------
    list of str
        ``f"{url}?page={n}"`` for ``n`` in ``1..totalPage``; empty when the
        page reports zero reviews.

    Raises
    ------
    AttributeError
        If the review-count span is not present in the fetched page.
    ValueError
        If the review-count text is not an integer after removing commas.
    """
    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")
    totalReview = int(s.find("span", class_="headline__review-count").text.replace(",", ""))
    # Integer ceiling division: a partially filled last page still counts.
    totalPage = -(-totalReview // reviewsPerPage)
    # Use the computed page count; a fixed upper bound either requests pages
    # that do not exist or silently misses reviews as the total changes.
    return [f"{url}?page={page}" for page in range(1, totalPage + 1)]


def scrapeReviewLink(url):
    """Collect the absolute URLs of the individual reviews on one listing page.

    Downloads ``url``, selects every element carrying the
    ``link link--large link--dark`` class string and prefixes each element's
    ``href`` with the Trustpilot origin.

    Returns a list of strings, in document order.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.find_all(class_="link link--large link--dark")
    return ["https://www.trustpilot.com" + anchor.get("href") for anchor in anchors]


def scrapeReview(url):
    """Scrape a single review page into a one-row DataFrame.

    Parameters
    ----------
    url : str
        URL of an individual review page.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``reviewer``, ``reviewTitle``, ``reviewBody``
        and ``rating``. A field whose element is missing from the page is
        recorded as the string ``"na"``.

    Notes
    -----
    Only the "element not found" case (``AttributeError`` from accessing an
    attribute on ``None``) is mapped to ``"na"``. Any other exception
    propagates, so real failures and Ctrl-C are not silently recorded as
    missing data.
    """
    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")

    def fieldOrNa(getter):
        # find() returns None when nothing matches, so the chained attribute
        # access inside ``getter`` raises AttributeError for a missing element.
        try:
            return getter()
        except AttributeError:
            return "na"

    record = {
        "reviewer": fieldOrNa(lambda: s.find(class_="consumer-information__name").text.strip()),
        # NOTE(review): this selects the "review-content__body" container, so the
        # title may include the body text as well - confirm the intended selector.
        "reviewTitle": fieldOrNa(lambda: s.find("div", class_="review-content__body").text.strip()),
        "reviewBody": fieldOrNa(lambda: s.find(class_="review-content__text").text.strip()),
        "rating": fieldOrNa(lambda: s.find(class_="star-rating star-rating--medium").img.get("alt")),
    }

    # One-element lists keep the original single-row frame (index 0) shape.
    return pd.DataFrame({column: [value] for column, value in record.items()})

In [3]:
# Build the list of paginated listing URLs for the Shaw Academy review page
coverPageLink = generateCoverPageLink(
    url="https://www.trustpilot.com/review/shawacademy.com"
)

In [5]:
%%time
# Scrape review page links
with ProcessPoolExecutor(max_workers=3) as ex:
    reviewLink = list(ex.map(scrapeReviewLink, coverPageLink))
    reviewLink = list(chain(*reviewLink))

CPU times: user 1.78 s, sys: 327 ms, total: 2.11 s
Wall time: 5min 29s


In [6]:
%%time
# Scrape review info
with ProcessPoolExecutor(max_workers=3) as ex:
    reviewDf = pd.concat(list(ex.map(scrapeReview, reviewLink)))

CPU times: user 16.4 s, sys: 1.04 s, total: 17.4 s
Wall time: 12min 59s


In [7]:
# Preview the first five scraped reviews
reviewDf.head(n=5)

Unnamed: 0,reviewer,reviewTitle,reviewBody,rating
0,ziyanda Mabuza,Awful customer service\n \n\n\n...,Awful customer service! Registered for the fre...,1 star: Bad
0,YOGESH M,This is really a good platform but if…\n ...,This is really a good platform but if you sent...,5 stars: Excellent
0,Beatrice Alcantara,Never ever deal with this company\n ...,Never ever deal with this company. So hard to...,1 star: Bad
0,na,na,na,na
0,Anna van Thiel,Illegal write offs\n \n\n\n ...,I clicked on a link in their email to see an o...,1 star: Bad


In [8]:
# Build the cleaned frame in one chain so that no column is ever assigned on a
# filtered slice of reviewDf (which triggers SettingWithCopyWarning) and
# reviewDf itself is left untouched.
cleanedDf = (
    # Drop rows whose review body could not be scraped ("na" placeholder)
    reviewDf[reviewDf.reviewBody != "na"]
    # Keep only the leading number of the rating text, e.g. "5 stars: Excellent" -> 5.
    # expand=False returns a Series, the right shape for a single column.
    .assign(rating=lambda df: df.rating.str.extract(r"(\d+)", expand=False).astype("int"))
)
cleanedDf.tail(10)

Unnamed: 0,reviewer,reviewTitle,reviewBody,rating
0,ziyanda Mabuza,Awful customer service\n \n\n\n...,Awful customer service! Registered for the fre...,1
0,YOGESH M,This is really a good platform but if…\n ...,This is really a good platform but if you sent...,5
0,Beatrice Alcantara,Never ever deal with this company\n ...,Never ever deal with this company. So hard to...,1
0,Anna van Thiel,Illegal write offs\n \n\n\n ...,I clicked on a link in their email to see an o...,1
0,Энхмэнд-ОД Өлзийбат,Beware from them\n \n\n\n ...,they call keep calling my number. ShawAcademy....,1
0,Earyn Merrick,I just started but it seems like a very…\n ...,I just started but it seems like a very knowle...,4
0,Apple Trixia Delfinado,I'm learning a lot and I can really…\n ...,I'm learning a lot and I can really apply it t...,5


In [9]:
# Save the scraped data; index=False (the parameter is a bool) omits the row
# index column from the spreadsheet.
cleanedDf.to_excel("shawAcademyReviewOnTrustPilot.xlsx", index=False)