In [1]:
# Import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

In [2]:
# This scrapes all the category links to send requests
def scrapeCategoryLink(url):
    
    # To store category links
    catLink = []
    
    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")
    
    for cat in s.findAll("li", class_="sub-nav-cat")[1:]:
        catLink.append(url + cat.find("a").get("href"))
    return catLink


# This scrapes individual course info from every category link
def scrapeCourseInfo(url):
    
    # Initialize empty list of variables to be scraped
    courseTitle = []
    courseLink = []
    duration = []
    offerPrice = []
    description = []
    
    # Make requests
    r = requests.get(url)
    s = BeautifulSoup(r.text, "lxml")
    
    # Main container for every course
    mainCont = s.findAll("div", class_="course")
    
    
    # Extract title
    for tlt in mainCont:
        try:
            courseTitle.append(tlt.find(class_="product__text-title").text.strip())
        except:
            courseTitle.append("na")
    
    # Extract course link
    for lnk in mainCont:
        try:
            courseLink.append("https://www.highspeedtraining.co.uk"\
                              + lnk.find(class_="product__text-title").find("a").get("href").replace("..", ""))
        except:
            courseLink.append("na")
    
    # Extract offer price
    for prc in mainCont:
        try:
            offerPrice.append(lnk.find(class_="product__text-price").text.strip())
        except:
            offerPrice.append("na")
    
    # Extract duration
    for dur in mainCont:
        try:
            duration.append(dur.find(class_="product__text-duration").text.strip())
        except:
            duration.append("na")
    
    # Extract description
    for des in mainCont:
        try:
            description.append(des.find(class_="product__text-description").text.strip())
        except:
            description.append("na")
            
    # Create a df off scraped variables
    df = pd.DataFrame({
        "courseTitle":courseTitle,
        "courseLink":courseLink,
        "offerPrice":offerPrice,
        "duration":duration,
        "description":description,
        "category":url
    })
    
    # Clean required columns
    df.offerPrice = df.offerPrice.str.replace("\+VAT", "").str.replace("£", "").astype("float")
    df.duration = df.duration.str.replace("Duration:", "").str.strip()
    df.category = df.category.str.split("/").str[-1].str.replace(".aspx", "")
    return df

In [3]:
# Wrap all the functions inside main
def main(url):
    
    # Scrape categoly links
    categoryLink = scrapeCategoryLink(url)
    
    # Scrape course info
    with ProcessPoolExecutor(max_workers=4) as ex:
        df = pd.concat(list(ex.map(scrapeCourseInfo, categoryLink))).reset_index(drop=True)
    return df

In [4]:
%%time
df = main("https://www.highspeedtraining.co.uk/available-courses/")

CPU times: user 265 ms, sys: 21.1 ms, total: 287 ms
Wall time: 18.8 s


In [5]:
# Let's look what we've scraped
df.head()

Unnamed: 0,courseTitle,courseLink,offerPrice,duration,description,category
0,Level 2 Food Hygiene and Safety for Catering,https://www.highspeedtraining.co.uk/food-safet...,20.0,2 hours,This Level 2 Food Hygiene and Safety for Cater...,food-hygiene
1,Level 3 Supervising Food Safety in Catering,https://www.highspeedtraining.co.uk/food-safet...,20.0,8-10 Hours,This Level 3 Supervising Food Safety in Cateri...,food-hygiene
2,Level 2 HACCP Training,https://www.highspeedtraining.co.uk/food-safet...,20.0,3 hours,This Level 2 HACCP training course provides th...,food-hygiene
3,Level 3 HACCP Training,https://www.highspeedtraining.co.uk/food-safet...,20.0,8 hours,This Level 3 HACCP course provides managers an...,food-hygiene
4,Level 1 Food Hygiene and Safety,https://www.highspeedtraining.co.uk/food-safet...,20.0,1-2 hours,This Level 1 Food Hygiene and Safety course te...,food-hygiene
