In [1]:
# Import required modules
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# This function returns all the cover page links (urls) from which we will scrape review information
def generateCoverPage(url, timeout=30):
    """Build the list of cover page urls that hold a company's reviews.

    url = url of the company page we want to scrape reviews for
    timeout = seconds to wait for the server before giving up
    return = list of cover page urls, starting with the requested url
    raises = requests.HTTPError when the page answers with an error status,
             ValueError when the total review count cannot be read from the page
    """
    # Number of reviews assumed per cover page when computing the page count
    reviewsPerPage = 20

    # Never wait forever on a stalled connection, and fail loudly on an HTTP
    # error instead of trying to parse an error page
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "lxml")

    # Extract total reviews: the count is read from the <strong> tag of the
    # last matching span, after removing the "Reviews" label and thousands separators
    counters = s.findAll("span", class_="TextBody TextBody--sm TextBody--inline")
    strong = counters[-1].find("strong") if counters else None
    if strong is None:
        raise ValueError(f"Could not find the total review count on {url}")
    totalReviews = int(strong.text.replace("Reviews", "").replace(",", "").strip())

    # Extract total page
    totalPage = int(np.ceil(totalReviews / reviewsPerPage))

    # Drop a trailing slash so the numbered links never contain "//"
    base = url.rstrip("/")

    # The requested url comes first, followed by the numbered cover pages
    return [url] + [f"{base}/{pg}" for pg in range(1, totalPage)]

In [3]:
# This function scrapes review, rating, reviewer, and review date
def scrapeReviewInfo(url, timeout=30):
    """Scrape every review listed on a single cover page.

    url = cover page url
    timeout = seconds to wait for the server before giving up
    return = dataframe with one row per review container and the columns
             review, reviewer, rating (number of full star icons) and reviewDate.
             A missing text field is stored as "na"; a review without a stars
             block gets rating 0. A page without reviews gives an empty dataframe.
    """

    def firstText(container, cssClass):
        # Text of the first element carrying cssClass, or "na" when the review lacks it
        node = container.find(class_=cssClass)
        return node.text.strip() if node is not None else "na"

    # Never wait forever on a stalled connection
    r = requests.get(url, timeout=timeout)
    s = BeautifulSoup(r.text, "lxml")

    # Build one record per review container so the four fields of a review
    # always stay on the same row, even when one of them is missing
    records = []
    for cont in s.findAll("div", class_="Review"):
        # Rating = number of full star icons inside the stars block
        stars = cont.find("div", class_="Review__overallStars__stars")
        if stars is not None:
            fullStars = stars.findAll("i", class_="stars__icon icon-full-star-01")
        else:
            fullStars = []

        records.append({
            "review": firstText(cont, "Review__body"),
            "reviewer": firstText(cont, "Review__author"),
            "rating": len(fullStars),
            "reviewDate": firstText(cont, "Review__dateSource"),
        })

    # Create a df of scraped variables; explicit columns keep the schema
    # stable when the page holds no reviews
    df = pd.DataFrame(records, columns=["review", "reviewer", "rating", "reviewDate"])
    df["rating"] = df["rating"].astype("int")
    return df

In [4]:
# Wrap all the functions inside main
def main(url):
    """Scrape all reviews of a company into a single dataframe.

    url = url of the company page we want to scrape reviews for
    return = dataframe with the reviews of every cover page, in page order,
             with a fresh 0..n-1 index
    """
    coverPage = generateCoverPage(url)

    # The work is network-bound, so threads are enough. A process pool also
    # needs the worker function to be importable from __main__, which is not
    # the case for functions defined in a notebook under the spawn start method.
    with ThreadPoolExecutor(max_workers=4) as ex:
        # map() yields the results in the same order as coverPage
        pages = list(ex.map(scrapeReviewInfo, coverPage))

    finalDf = pd.concat(pages).reset_index(drop=True)
    return finalDf

In [5]:
%%time
# Scrape the reviews of the One Education store page; %%time reports how long the cell takes
df = main("https://www.reviews.io/company-reviews/store/one-education")
# Display (number of rows, number of columns) of the scraped dataframe
df.shape

CPU times: user 203 ms, sys: 34.7 ms, total: 237 ms
Wall time: 4.28 s


(674, 4)

In [6]:
# Preview the first 10 rows of the scraped reviews
df.head(10)

Unnamed: 0,review,reviewer,rating,reviewDate
0,“This was a fantastic course packed with amazi...,Gail Walton,4,Posted 16 hours ago
1,“very helpful and easy to understand”,Fatimah,5,Posted 21 hours ago
2,“Great content. Good instructions.”,Anonymous,4,Posted 1 day ago
3,“Great course.”,Anonymous,5,Posted 1 day ago
4,"“Informative learning materials on this site, ...",Victoria Lewis,5,Posted 2 days ago
5,“How come i can’t get to do the exam?”,Gart Fletcher,2,Posted 2 days ago
6,"“Found this on wowcher for a reduced price, th...",Anonymous,4,Posted 3 days ago
7,"“Awesome!! Best course ever online, learning w...",Georgina Patterson,5,Posted 3 days ago
8,“I must tell you this course really help ne a ...,Itschok,5,Posted 3 days ago
9,"“Gary - DSEAR - Excellent information clear, i...",Gary Broadhurst,5,Posted 4 days ago
