In [1]:
# Import required modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from IPython.display import clear_output
import itertools
from datetime import datetime
# Run-date stamp, zero-padded day + abbreviated month name (format "%d_%b");
# it is embedded in the name of the CSV written at the end of the notebook.
today = datetime.now().strftime("%d_%b")

In [2]:
def generateCoverPageLink(url):
    """Return the category ("cover page") links found on the page at ``url``.

    The page is downloaded with ``requests`` and parsed with the ``lxml``
    parser. Every ``div`` matched by the class string
    ``"content-box category-info-box info-box"`` contributes one entry:
    the ``href`` of the first anchor inside that box.

    Parameters
    ----------
    url : str
        Address of the page that lists the category boxes.

    Returns
    -------
    list
        One ``href`` value per matching box, in document order.

    Notes
    -----
    A matching box without any anchor raises ``AttributeError``; the request
    itself is made without custom headers or a timeout.
    """

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    categoryBoxes = soup.find_all("div", class_="content-box category-info-box info-box")
    return [box.a.get("href") for box in categoryBoxes]

def scrapeIndCourseLinkAndCategory(url):
    """Scrape the course links on one category page and tag them with its category.

    The category is not read from the page: it is the last non-empty path
    segment of ``url`` (so ``.../therapies`` and ``.../therapies/`` both give
    ``"therapies"``).

    Parameters
    ----------
    url : str
        Address of a category ("cover") page.

    Returns
    -------
    pandas.DataFrame
        Columns ``courseLink`` (the ``href`` of every anchor matched by the
        class string ``"block button button-blue"``, in document order) and
        ``category`` (the same slug repeated on every row). The frame is
        empty when the page has no matching anchors.
    """

    # Computed once, outside the loop; rstrip keeps a trailing "/" from
    # producing an empty category.
    categorySlug = url.rstrip("/").split("/")[-1]

    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")
    courseLink = [lnk.get("href") for lnk in s.find_all("a", class_="block button button-blue")]

    # Create a df from the scraped variables
    return pd.DataFrame({
        "courseLink": courseLink,
        "category": [categorySlug] * len(courseLink),
    })


def _textAt(elements, index, default="na"):
    """Return ``elements[index].text``, or ``default`` when that position is missing."""
    try:
        return elements[index].text
    except (IndexError, AttributeError):
        return default


def scrapeCourseInfo(url):
    """Scrape one course page into a single-row DataFrame.

    Every field is located by tag name and position, so the extraction is
    tied to the page layout; any field that cannot be found falls back to a
    placeholder instead of raising.

    Parameters
    ----------
    url : str
        Address of the course page. It is also stored in the result as
        ``courseLink``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with the columns, in this order:

        * ``courseTitle`` - text of the first ``h1``, else ``"na"``.
        * ``courseLink`` - the ``url`` argument.
        * ``enrollments`` - text of the third ``span`` on the page, else the
          integer ``0``.
        * ``price`` - text of the last ``div`` inside the first
          ``div`` with class ``price``, else ``"na"``.
        * ``teacherName`` / ``designationOrInstitute`` - text of the fourth
          and fifth ``p`` respectively, but only when the eighth ``span``
          contains ``"Course Teacher"``; otherwise ``"na"``.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including a timeout after 30 s).
    """

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"}
    # Without a timeout a stalled connection would block its worker thread forever.
    r = requests.get(url, headers=headers, timeout=30)
    s = BeautifulSoup(r.text, "lxml")

    # Looked up once and shared by the positional reads below.
    spans = s.find_all("span")
    paragraphs = s.find_all("p")

    # Course title
    heading = s.find("h1")
    courseTitle = heading.text if heading is not None else "na"

    # Course price: last <div> nested in the price box
    priceBox = s.find("div", class_="price")
    price = _textAt(priceBox.find_all("div"), -1) if priceBox is not None else "na"

    # Enrollments (note the fallback is the integer 0, not "na")
    enrollments = _textAt(spans, 2, default=0)

    # Teacher details are trusted only when the "Course Teacher" marker sits
    # in the expected span; each field still falls back on its own.
    hasTeacherBlock = "Course Teacher" in _textAt(spans, 7, default="")
    teacherName = _textAt(paragraphs, 3) if hasTeacherBlock else "na"
    designationOrInstitute = _textAt(paragraphs, 4) if hasTeacherBlock else "na"

    # See the progress: show only the most recently scraped URL
    print(f"{url}")
    clear_output(wait=True)

    # Create a df with the scraped variables
    return pd.DataFrame({
        "courseTitle": [courseTitle],
        "courseLink": [url],
        "enrollments": [enrollments],
        "price": [price],
        "teacherName": [teacherName],
        "designationOrInstitute": [designationOrInstitute],
    })

In [3]:
# Wrap all the function inside main
def main(url, s1, s2):
    """Scrape one chunk of courses and attach their categories.

    Steps:

    1. Collect the category page links from ``url``.
    2. Scrape every category page (in threads) into a table of
       ``(courseLink, category)`` pairs.
    3. Take positions ``s1:s2`` of that table's ``courseLink`` column, keep
       each distinct link once, and keep only links containing "ofcourse"
       (case-insensitive).
    4. Scrape those course pages (in threads) and left-merge the categories
       back on ``courseLink``.

    A course listed on several category pages appears several times in the
    link list. De-duplicating before scraping means it is fetched once per
    call and yields exactly one row per (course, category) pair, instead of
    being re-fetched and multiplied by the merge.

    Note that steps 1 and 2 run on every call, so scraping in chunks
    re-downloads the category pages for each chunk.

    Parameters
    ----------
    url : str
        Landing page that lists the categories.
    s1, s2 : int or None
        Start and end positions (end exclusive, ``None`` for open-ended) into
        the full, un-deduplicated course-link list.

    Returns
    -------
    pandas.DataFrame
        Columns ``courseTitle``, ``courseLink``, ``enrollments``, ``price``,
        ``teacherName``, ``designationOrInstitute``, ``category`` with a fresh
        0..n-1 index. Empty (same columns) when the slice holds no usable link.

    Raises
    ------
    ValueError
        If no category links are found on ``url``.
    """

    # Extract cover page links
    coverPageLink = generateCoverPageLink(url)
    if not coverPageLink:
        raise ValueError(f"No category links found on {url}")

    # Scrape course link and category; each worker returns a small df
    with ThreadPoolExecutor() as executor:
        courseLinkAndCategory = pd.concat(
            list(executor.map(scrapeIndCourseLinkAndCategory, coverPageLink))
        )

    # Slice first (positions refer to the full list), then drop repeats so
    # every course page is requested at most once.
    linksInSlice = courseLinkAndCategory["courseLink"].iloc[s1:s2].drop_duplicates()

    # Keep only links whose url contains "ofcourse"; filtering before the
    # requests avoids fetching pages whose rows would be discarded anyway.
    linksToScrape = [
        link for link in linksInSlice
        if isinstance(link, str) and "ofcourse" in link.lower()
    ]

    if not linksToScrape:
        # Same columns as a populated result, so chunks still concatenate cleanly.
        return pd.DataFrame(columns=[
            "courseTitle", "courseLink", "enrollments", "price",
            "teacherName", "designationOrInstitute", "category",
        ])

    # Scrape course info
    with ThreadPoolExecutor() as executor:
        df = pd.concat(list(executor.map(scrapeCourseInfo, linksToScrape)))

    # Insert category by merging the two dfs on course link; identical
    # (link, category) pairs are collapsed so the merge cannot repeat rows.
    mergedCat = pd.merge(df, courseLinkAndCategory.drop_duplicates(),
                         on="courseLink", how="left")

    return mergedCat.reset_index(drop=True)

In [4]:
%%time
# Scrape in chunks: this first call covers positions 0-199 of the course-link list
df1 = main(url="https://www.ofcourse.co.uk/", s1=0, s2=200)

CPU times: user 20.6 s, sys: 1.21 s, total: 21.8 s
Wall time: 1min 55s


In [5]:
%%time
# Second chunk: positions 200-399 of the course-link list
df2 = main(url="https://www.ofcourse.co.uk/", s1=200, s2=400)

CPU times: user 21 s, sys: 1.05 s, total: 22 s
Wall time: 2min 23s


In [6]:
%%time
# Third chunk: positions 400-599 of the course-link list
df3 = main(url="https://www.ofcourse.co.uk/", s1=400, s2=600)

CPU times: user 21.4 s, sys: 1.35 s, total: 22.7 s
Wall time: 1min 51s


In [7]:
%%time
# Final chunk: position 600 through the end of the course-link list (s2=None)
df4 = main(url="https://www.ofcourse.co.uk/", s1=600, s2=None)

CPU times: user 17.5 s, sys: 1.03 s, total: 18.5 s
Wall time: 1min 10s


In [8]:
# Concat all the chunk dfs into one table.
# A course listed on several category pages occurs several times in the link
# list that the chunks slice, so the same (courseLink, category) row can come
# back from more than one chunk; keep only the first occurrence of each pair.
masterDf = (
    pd.concat([df1, df2, df3, df4])
    .drop_duplicates(subset=["courseLink", "category"])
    .reset_index(drop=True)
)

# Preview what we have
masterDf.head(10)

Unnamed: 0,courseTitle,courseLink,enrollments,price,teacherName,designationOrInstitute,category
0,Start Your Own Counselling Business,https://www.ofcourse.co.uk/course/start-your-o...,268,£100,Ira Israel,A Licensed Counsellor & Psychotherapist (LPCC ...,start-your-own
1,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,start-your-own
2,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,therapies
3,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,health-fitness
4,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,creativity
5,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,career-success
6,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,entrepreneurial
7,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,spiritual
8,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,professional-development
9,CTAA Accredited Crystal Therapy and Healing,https://www.ofcourse.co.uk/course/ctaa-accredi...,680,£100,Karen E. Wells,KEW Training Academy Ltd,alternative-health


In [9]:
# Write the combined table to a date-stamped CSV, leaving out the row index
masterDf.to_csv(path_or_buf=f"ofCourseInstructors_{today}.csv", index=False)