In [1]:
# Import required modules
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# For headless mode
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1080")  

In [2]:
# This function scrapes all the info in a review
def scrapeReview(url):
    """
    url = course url for which we will scrape review
    returns = all the review info"""
    
    # Initialize webdrive in headless mode
    driver = webdriver.Chrome(r"/home/faysal/Documents/chromedriver", options=options)
    driver.get(url)
    time.sleep(4)
    
    # Close accept cookies
    try:
        driver.find_element_by_id("onetrust-accept-btn-handler").click()
    except:
        pass

    # Locate view more button by link text
    try:
        viewMore = driver.find_element_by_link_text("View more")
    except:
        pass

    # Keep clicking till "View more" exists, otherwise just break out of the loop
    while True:
        try:
            viewMore.click()
            time.sleep(2)
        except:
            break
    
    # This main container holds all the review info
    mainCont = driver.find_elements_by_css_selector("div#providerCourseReviews article")
    # Extract main cont text
    mainCont = [x.text for x in mainCont]
    
    # Create a df off main cont
    df = pd.DataFrame({"mainCont":mainCont})
    
    # Insert the course link
    df["courseLink"] = url
    driver.close()
    return df

# Extract required info from main cont
def clean(df):
    """return = final cleaned df"""
    df = df.copy()
    df["courseId"] = df.courseLink.str.split("/").str[5].str.replace("#", "").str.strip()
    df["review"] = df.mainCont.str.split("\n").str[3].str.strip()
    df["rating"] = df.mainCont.str.split("Star").str[0].str.strip()
    df["reviewDate"] = df.mainCont.str.split("\n").str[1].str.strip()
    df["reviewedBy"] = df.mainCont.str.split("Review by").str[1].str.split("for").str[0].str.strip()
    return df

In [3]:
# This is to scrape by chunks
def main(s1, s2, fileName):
    """
    s1 = start index of the link
    s2 = end index of the link
    fileName = file name from which we read the link"""
    
    # Read the course link from a file to send requests to
    courseLinkDf = pd.read_excel(f"{fileName}.xlsx")
    
    # Scrape by chunks
    df = pd.concat(list(map(scrapeReview, courseLinkDf.courseLink.iloc[s1:s2])))
    
    # Extract info from main cont
    finalDf = clean(df)
    
    # Drop unnecessary columns
    finalDf.drop(["mainCont", "courseLink"], axis=1, inplace=True)
    return finalDf

In [4]:
%%time
chunk1 = main(0, 30, "excelWithBusiness")

CPU times: user 790 ms, sys: 242 ms, total: 1.03 s
Wall time: 6min 17s


In [5]:
%%time
chunk2 = main(30, None, "excelWithBusiness")

CPU times: user 807 ms, sys: 212 ms, total: 1.02 s
Wall time: 6min 12s


In [6]:
# Concat all the chunks
allReview = pd.concat([chunk1, chunk2]).reset_index(drop=True)
allReview.courseId = allReview.courseId.astype("int")

# Merge review data with course data
courseData = pd.read_excel("excelWithBusiness.xlsx")
courseWithReview = pd.merge(courseData, allReview, on="courseId", how="left")

# Let's have a look at our final data
courseWithReview.head()

Unnamed: 0,date,courseId,courseTitle,courseLink,subtitle,courseProvider,offerPrice,originalPrice,unitSold,category,...,soldOrEnq,savingsPercent,broadCategory1,broadCategory2,subCategory1,subCategory2,review,rating,reviewDate,reviewedBy
0,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,My partner recently bought this package for me...,5.0,16 Nov 2020,Paul
1,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,I bought the course as I have been out of work...,5.0,13 Nov 2020,Linda Booth
2,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,Course narrative is very detailed and the cont...,4.6,21 Oct 2020,Cruz Afonso
3,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,Course is great and easy to use. You can go th...,5.0,8 Oct 2020,Mark Eames
4,02_Dec,74821,Black Friday Special - The Ultimate Microsoft ...,https://www.reed.co.uk/courses/black-friday-sp...,This is the First time we have ever offered Li...,Excel with Business,39.99,199,5217,"[['IT', 'Business analysis'], ['Office skills'...",...,1,79,IT,"Admin, secretarial & PA",Business analysis,Microsoft Excel,"This course is excellent, it provides really g...",5.0,7 Oct 2020,Gregory Houlders


In [7]:
# Who got the best avg stars?
courseWithReview.rating = courseWithReview.rating.fillna(0).astype(float)
courseWithReview.groupby("courseTitle").rating.agg(["count", "mean"]).sort_values("mean", ascending=False).round(2)

Unnamed: 0_level_0,count,mean
courseTitle,Unnamed: 1_level_1,Unnamed: 2_level_1
Introduction To Leadership & Management,1,5.0
Microsoft Office Essentials - 3 Course Bundle,1,5.0
Advanced Microsoft Excel Course (CPD Accredited),2,5.0
Black Friday Special - The Online Business Mini MBA Bundle Lifetime,1,5.0
Critical Thinking & Problem Solving Skills Course,1,5.0
...,...,...
Data Science Online Course,1,0.0
"Communication, Influence & Teamwork Skills Course",1,0.0
Microsoft OneNote Online Course,1,0.0
Business Analysis Online Course,1,0.0
