In [1]:
# Import required modules
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options

In [2]:
# This function scrapes the top 4 courses from a category link
def scrapeTop4Course(url, driverPath=r"/home/faysal/Documents/chromedriver",
                     pageLoadWait=20, consentWait=5):
    """Scrape the lead ("top 4") deals shown on a course category page.

    Parameters
    ----------
    url : str
        Course category link.
    driverPath : str, optional
        Path of the chromedriver executable handed to ``webdriver.Chrome``.
    pageLoadWait : int or float, optional
        Seconds to sleep after opening the page.
    consentWait : int or float, optional
        Seconds to sleep after clicking the cookie-consent button.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.deal.deal-lead`` container with the columns
        courseTitle, courseLink, courseProvider, unitSold, offerPrice and
        category. A field that cannot be read becomes "na" (0 for unitSold,
        the category link for courseLink). The frame is empty when the page
        has no such container.

    Notes
    -----
    Errors raised while opening the page or clicking the consent button
    propagate to the caller; the browser is shut down in every case.
    """
    outputColumns = ["courseTitle", "courseLink", "courseProvider", "unitSold", "offerPrice", "category"]

    def readText(container, selector, default):
        # Stripped text of the first element matching `selector`, else `default`
        try:
            return container.find_element_by_css_selector(selector).text.strip()
        except Exception:
            return default

    # Initialize web driver
    driver = webdriver.Chrome(driverPath)
    try:
        driver.get(url)  # Wowcher course category link
        driver.maximize_window()
        time.sleep(pageLoadWait)

        # Press escape key to close the pop up, then accept the cookie notice
        webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
        driver.find_element_by_id("didomi-notice-agree-button").click()
        time.sleep(consentWait)

        # Variables to be scraped, one entry per container
        courseTitle = []
        courseLink = []
        priceAndProvider = []
        unitSold = []

        # This is the main container for top 4 courses
        for cont in driver.find_elements_by_css_selector("div.deal.deal-lead"):
            courseTitle.append(readText(cont, ".deal-header__headline", "na"))
            # Course price and course provider come together in one string
            priceAndProvider.append(readText(cont, ".deal-header__title", "na"))
            unitSold.append(readText(cont, ".deal-status-bar__value", 0))

            # Record exactly one link per container so courseLink always lines up
            # with the other lists. A container without its own link falls back to
            # the category link (the original notebook used the category link for
            # the 1st course).
            try:
                links = cont.find_elements_by_css_selector(".deal-single-image__link.MAIN_DEAL")
                href = links[0].get_attribute("href") if links else None
            except Exception:
                href = None
            courseLink.append(href or url)
    finally:
        # quit() ends the whole browser session, even when scraping failed
        driver.quit()

    # Nothing scraped: hand back an empty frame with the expected columns
    if not courseTitle:
        return pd.DataFrame(columns=outputColumns)

    # Create a df of scraped variables
    df = pd.DataFrame({
        "courseTitle": courseTitle,
        "courseLink": courseLink,
        "priceAndProvider": priceAndProvider,
        "unitSold": unitSold
    })

    # Extract category: last path segment of the link (a trailing "/" is ignored)
    df["category"] = url.rstrip("/").split("/")[-1].strip()

    # Extract offer price: first word of the string, without the pound sign
    df["offerPrice"] = df.priceAndProvider.str.split(" ").str[0].str.replace("£", "", regex=False)

    # Extract course provider: text after the last "from", up to the first " - "
    df["courseProvider"] = df.priceAndProvider.str.split("from").str[-1].str.split(" - ").str[0].str.strip()

    # Return required variables
    return df[outputColumns]

In [3]:
# This function scrapes all the courses except top 4
def scrapeTheRest(url, driverPath=r"/home/faysal/Documents/chromedriver",
                  pageLoadWait=20, consentWait=2, scrollWait=6,
                  loadMoreWait=15, maxLoadMore=50):
    """Scrape the summary deals listed on a course category page.

    Parameters
    ----------
    url : str
        Course category link.
    driverPath : str, optional
        Path of the chromedriver executable handed to ``webdriver.Chrome``.
    pageLoadWait : int or float, optional
        Seconds to sleep after opening the page.
    consentWait : int or float, optional
        Seconds to sleep after clicking the cookie-consent button.
    scrollWait : int or float, optional
        Seconds to sleep after each of the three scrolls to the page bottom.
    loadMoreWait : int or float, optional
        Seconds to sleep after each click on the "Load More" button.
    maxLoadMore : int, optional
        Maximum number of "Load More" clicks.

    Returns
    -------
    pandas.DataFrame or None
        Columns courseTitle, courseLink, courseProvider, unitSold and
        offerPrice, one row per ``div.deal.deal-summary`` container. Rows up
        to and including the last container without a ``BOTTOM_DEAL`` link are
        dropped (presumably the lead deals covered by scrapeTop4Course - TODO
        confirm). ``None`` is returned when the page has no such container.

    Notes
    -----
    Errors raised while opening the page or clicking the consent button
    propagate to the caller; the browser is shut down in every case.
    """
    def readField(container, selector, attribute=None):
        # Text (or the given attribute) of the first match, "na" when unreadable
        try:
            element = container.find_element_by_css_selector(selector)
            if attribute is None:
                return element.text
            return element.get_attribute(attribute)
        except Exception:
            return "na"

    # Initialize web driver
    driver = webdriver.Chrome(driverPath)
    try:
        driver.get(url)  # Wowcher courses link
        driver.maximize_window()
        time.sleep(pageLoadWait)

        # Press escape key to close the pop up, then accept the cookie notice
        webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
        driver.find_element_by_id("didomi-notice-agree-button").click()
        time.sleep(consentWait)

        # "Load More" button is found after scrolling down 3 times.
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scrollWait)

        # Click the load more button at most maxLoadMore times.
        # If there is no load more button, break out of the loop.
        for _ in range(maxLoadMore):
            try:
                driver.find_element_by_css_selector(
                    "button.button.button--is-primary.button--is-state-compact"
                ).click()
                time.sleep(loadMoreWait)
            except Exception:
                print("No more pages to load")
                break

        # One record per main container
        rows = []
        for cont in driver.find_elements_by_css_selector("div.deal.deal-summary"):
            rows.append({
                "courseTitle": readField(cont, "div.deal-summary__header"),
                "courseLink": readField(cont, "a.deal-single-image__link.BOTTOM_DEAL", "href"),
                # Image "alt" text, since there is no text for providers
                "courseProvider": readField(cont, "img.deal-merchant-logo__logo", "alt"),
                "unitSold": readField(cont, "div.deal-status-bar__value"),
                # Offer and original price together
                "price": readField(cont, "div.deal-summary__main-price-splat"),
            })
    finally:
        # quit() ends the whole browser session, also when nothing was found
        driver.quit()

    # No course on the page
    if not rows:
        return None

    df = pd.DataFrame(rows, columns=["courseTitle", "courseLink", "courseProvider", "unitSold", "price"])

    # Extract offer from price: second line of the price text, without the pound sign
    df["offerPrice"] = df.price.str.split("\n").str[1].str.replace("£", "", regex=False)

    # Keep only the rows after the last container that has no link.
    # When every container has a link there is nothing to cut off.
    missingLink = df.index[df.courseLink == "na"]
    if len(missingLink) > 0:
        df = df.iloc[missingLink[-1] + 1:]

    return df[["courseTitle", "courseLink", "courseProvider", "unitSold", "offerPrice"]]

In [4]:
# Run both scrapers for one category and stack their results
def main(url):
    """Scrape a whole category page.

    url = course category link,
    return = top 4 courses followed by the rest of the courses, re-indexed from 0
    """
    frames = [
        scrapeTop4Course(url),  # top 4 courses first
        scrapeTheRest(url),     # then the rest of the courses
    ]
    return pd.concat(frames).reset_index(drop=True)

In [5]:
# Category name -> category page link.
# Every link shares the same prefix, so only the last path segment is listed.
category = {
    name: "https://www.wowcher.co.uk/deals/london/learning/" + slug
    for name, slug in [
        ("animal", "animals"),
        ("artsAndCrafts", "arts-crafts-design"),
        ("beauty", "beauty"),
        ("business", "business"),
        ("childrenLearning", "childrens-learning"),
        ("computing", "computing"),
        ("cookingAndBaking", "cookingbaking"),
        ("driving", "driving-lessons"),
        ("fashionAndJewellery", "fashion-jewellery"),
        ("finance", "financial"),
        ("health", "health"),
        ("hospitalityAndBartending", "hospitalityeventsbartending"),
        ("language", "languages"),
        ("music", "music-singing"),
        ("nlp", "nlp"),
        ("photography", "photographic"),
        ("selfHelp", "selfhelp"),
        ("socialAndChildCare", "social-care-childcare"),
        ("sportsAndFitness", "sport-fitness-coaching"),
        ("teaching", "teaching"),
        ("writing", "writing"),
        ("other", "other"),
    ]
}

In [6]:
# Scrape the animal category; its rows are labelled "animalCare"
animal = main(category["animal"]).assign(category="animalCare")

No more pages to load


In [7]:
# Scrape the arts and crafts category and label its rows
artsAndCrafts = main(category["artsAndCrafts"]).assign(category="artsAndCrafts")

No more pages to load


In [8]:
# Scrape the beauty category and label its rows
beauty = main(category["beauty"]).assign(category="beauty")

No more pages to load


In [9]:
# Scrape the business category and label its rows
business = main(category["business"]).assign(category="business")

No more pages to load


In [11]:
# Scrape the children learning category and label its rows
childrenLearning = main(category["childrenLearning"]).assign(category="childrenLearning")

No more pages to load


In [12]:
# Scrape the computing category and label its rows
computing = main(category["computing"]).assign(category="computing")

No more pages to load


In [13]:
# Scrape the cooking and baking category and label its rows
cookingAndBaking = main(category["cookingAndBaking"]).assign(category="cookingAndBaking")

No more pages to load


In [14]:
# Scrape the driving category and label its rows
driving = main(category["driving"]).assign(category="driving")

No more pages to load


In [15]:
# Scrape the fashion and jewellery category and label its rows
fashionAndJewellery = main(category["fashionAndJewellery"]).assign(category="fashionAndJewellery")

No more pages to load


In [16]:
# Scrape the finance category and label its rows
finance = main(category["finance"]).assign(category="finance")

No more pages to load


In [17]:
# Scrape the health category and label its rows
health = main(category["health"]).assign(category="health")

No more pages to load


In [18]:
# Scrape the hospitality and bartending category and label its rows
hospitalityAndBartending = main(category["hospitalityAndBartending"]).assign(category="hospitalityAndBartending")

No more pages to load


In [19]:
# Scrape the language category and label its rows
language = main(category["language"]).assign(category="language")

No more pages to load


In [20]:
# Scrape the music category and label its rows
music = main(category["music"]).assign(category="music")

No more pages to load


In [21]:
# Scrape the nlp category and label its rows
nlp = main(category["nlp"]).assign(category="nlp")

No more pages to load


In [22]:
# Scrape the photography category and label its rows
photography = main(category["photography"]).assign(category="photography")

No more pages to load


In [23]:
# Scrape the self help category and label its rows
selfHelp = main(category["selfHelp"]).assign(category="selfHelp")

No more pages to load


In [24]:
# Scrape the social and child care category and label its rows
socialAndChildCare = main(category["socialAndChildCare"]).assign(category="socialAndChildCare")

No more pages to load


In [25]:
# Scrape the sports and fitness category and label its rows
sportsAndFitness = main(category["sportsAndFitness"]).assign(category="sportsAndFitness")

No more pages to load


In [26]:
# Scrape the teaching category and label its rows
teaching = main(category["teaching"]).assign(category="teaching")

No more pages to load


In [27]:
# Scrape the writing category and label its rows
writing = main(category["writing"]).assign(category="writing")

No more pages to load


In [28]:
# Scrape the "other" category and label its rows
other = main(category["other"]).assign(category="other")

No more pages to load


In [29]:
# Merge all the category dfs into one frame.
# Each category frame is indexed from 0 (main() resets the index), so a plain
# concat would repeat index labels; ignore_index gives one unique 0..n-1 index.
finalDf = pd.concat(
    [
        animal,
        artsAndCrafts,
        beauty,
        business,
        childrenLearning,
        computing,
        cookingAndBaking,
        driving,
        fashionAndJewellery,
        finance,
        health,
        hospitalityAndBartending,
        language,
        music,
        nlp,
        photography,
        selfHelp,
        socialAndChildCare,
        sportsAndFitness,
        teaching,
        writing,
        other,
    ],
    ignore_index=True,
)

In [30]:
# Number of courses in each category
finalDf["category"].value_counts()

teaching                    160
business                    127
selfHelp                     73
beauty                       66
health                       39
computing                    25
cookingAndBaking             23
artsAndCrafts                19
animalCare                   15
hospitalityAndBartending     10
childrenLearning              8
socialAndChildCare            7
writing                       4
fashionAndJewellery           4
music                         4
language                      4
finance                       4
sportsAndFitness              4
driving                       2
photography                   1
other                         1
nlp                           1
Name: category, dtype: int64

In [31]:
# Total courses: (number of rows, number of columns) of the merged frame
finalDf.shape

(601, 6)

In [33]:
# Map raw provider strings to one canonical provider name.
# Keys are the raw scraped values: logo "alt" attributes and provider names
# (or fragments of deal text) as they appeared on the page.
providerMapping = {
    "fefefee":"Lead Academy",
    "Lead Academy":"Lead Academy",
    
    "One Education Logo":"One Education",
    "One Education":"One Education",
    
    "MTG_website_logo_-_The_Mandatory_Training_Group_2019_500x":"The Mandatory Training Group",
    "The Mandatory Training Group":"The Mandatory Training Group",
    
    "Alpha-academy":"Alpha Academy",
    "£8 instead of £199 for an Italian Language for Beginners online course with Alpha Academy":"Alpha Academy",
    "Alpha Academy":"Alpha Academy",
    
    "secondlogo":"International Open Academy",
    "NEW-IOA-Logo":"International Open Academy",
    "International Open Academy) for an accredited autism awareness course":"International Open Academy",
    "International Open Academy!":"International Open Academy",
    "£9.99 instead of £19.99 for a six-month subscription to Storytime Magazine":"International Open Academy",
    "International Open Academy":"International Open Academy",
    
    
    "Acudemy Logo":"Acudemy",
    "Acudemy":"Acudemy",
    
    "coursegate":"Course Gate",
    "CourseGate":"Course Gate",
    "Coursegate":"Course Gate",
    "Course Gate":"Course Gate",
    
    "New-Skills-Logo":"New Skills Academy",
    "newskillsUSE":"New Skills Academy",
    "New Skills Academy":"New Skills Academy",
    
    "e-Courses4You logo":"e-Courses4You",
    "e-Courses4You":"e-Courses4You",
    
    "logo-Janets-light":"Janets",
    "Janets":"Janets",
    
    "trendimilogo":"Trendimi",
    "Trendimi":"Trendimi",
    
    "Logo":"Training Express",
    "Training-Express-Logo":"Training Express",
    "Training Express":"Training Express",
    
    
    "logo-(1)":"Academy for Health & Fitness",
    "Academy for Health & Fitness":"Academy for Health & Fitness",
    # Same canonical spelling as the two entries above (was "Academy For ...")
    "Academy For Health and Fitness":"Academy for Health & Fitness",
    
    "thisoneuse":"Institute of Beauty & Makeup",
    "OCT-2020":"Institute of Beauty & Makeup",
    "Institute of Beauty & Makeup":"Institute of Beauty & Makeup",
    
    "online-1":"Career Match",
    "Career Match":"Career Match",
    
    "lol":"Numoni",
    "Numoni":"Numoni",
    
    "IT":"IT Careers Online",
    "IT Careers Online":"IT Careers Online",
    
    "Online-School-MAR-2020":"Learning With Experts",
    "Learning With Experts":"Learning With Experts",
    
    "Knowledge Tree 2020":"Knowledge Door",
    
    "coursecloudlogo":"Course Cloud",
    
    "harleyoxford-logo":"Harley Oxford",
    "Harley Oxford":"Harley Oxford",
    
    
    "Holly-and-hugo-logo-final":"Holly and Hugo",
    
    "Instant Upskill Logo":"Instant Upskill",
    
    "lash-logo":"The Lash and Beauty Academy",
    
    "Screenshot-2020-10-28-at-16.40.05":"Online Skills Academy",
    "Online Skills Academy!":"Online Skills Academy",
    "Online Skills Academy":"Online Skills Academy",
    
    "Screenshot-2020-04-09-at-08.41.28":"Music Gurus",
    
    "2021 Logo":"Beke College",
    "BEKE College C.I.C":"Beke College",
    
    "Meze Publishing":"Meze Publishing",
    
    "ofcourse":"Ofcourse",
    
    "petaddict":"Pet Addict",
    "Pet Addict":"Pet Addict",
    
    
    "elearnOffice":"elearnOffice",
    
    "Igloo Music":"Igloo Music",
    
    "vita":"Vita Online",
    
    "Event Trix":"Event Trix",
    "EventTrix – save 93%":"Event Trix",
    "eventtrix":"Event Trix",
    
    "Careers-Advice-&-Learning-Centre":"Careers Advice & Learning Centre",
    
    "Contempo Learning Logo":"Contempo Learning",
    "Contempo Learning":"Contempo Learning",
    
    "Tilleo. Or £13 for an annual membership":"Tilleo",
    
    "logo2020":"My Own Tutor",
    
    "avantaesthetics":"Avant Aesthetics Clinic",
    
    "centrelogo":"Centre of Excellence",
    "the Centre of Excellence":"Centre of Excellence",
    
    "logobusters":"Grade Busters",
    
    "skilllogo":"Skill Express",
    
    "enjoy":"Enjoy Online Courses",
    
    "mhd-logo":"My HairDresser",
    
    "Global TEFL. £19 for a 150-hour Master TEFL course":"Global TEFL",
    
    "The Lash & Beauty Studio":"The Lash & Beauty Studio",
    # Leading "T" restored (was "he Lash & Beauty Studio")
    "Lash Logo 2021":"The Lash & Beauty Studio",
    
    "1Training":"1Training",
    
    "The Fit Body Bootcamp":"The Fit Body Bootcamp",
    
    "Focus Fitness UK, or £1249 for level two and three diplomas in fitness instructing and personal training":"Focus Fitness UK",
    
    "Secret World of Languages":"Secret World of Languages",
    
    "56 UK locations":"Young Driver Training",
    
    "dcsx":"Quality Care Time Ltd",
    
    "Simpliv":"Simpliv",
    
    "logonew2020":"Creative Studios",
    
    "£12 instead of £189 for lifetime access to online driving theory test tuition with TheoryTestPass.com":"Theory Test Pass",
    
    "Untitled-41-e1604846582473":"Couture Training Online",
    
}

# Canonical names also map to themselves, so a provider that is already in its
# clean form (for example after this cell has been run once) is kept instead of
# turning into NaN. Values unknown to providerMapping still become NaN.
finalDf.courseProvider = finalDf.courseProvider.map(
    {**{name: name for name in providerMapping.values()}, **providerMapping}
)

In [34]:
# Final preview: first rows of the merged frame
finalDf.head()

Unnamed: 0,courseTitle,courseLink,courseProvider,unitSold,offerPrice,category
0,"Online Dog Grooming, Bathing & First Aid Course",https://www.wowcher.co.uk/deals/london/learnin...,Academy for Health & Fitness,73,10,animalCare
1,Online Dog Behaviour & Psychology Course,https://www.wowcher.co.uk/deal/london/learning...,Pet Addict,351,14,animalCare
2,Online Veterinary Support Assistant Course,https://www.wowcher.co.uk/deal/london/learning...,International Open Academy,18,10,animalCare
3,ADHD Awareness Online Course - CPD Certified!,https://www.wowcher.co.uk/deal/london/learning...,International Open Academy,25,9,animalCare
4,Online Animal Grooming Course,https://www.wowcher.co.uk/deal/london/learning...,International Open Academy,12,9,animalCare
