In [1]:
# Import required modules
from concurrent.futures import ProcessPoolExecutor
from selenium import webdriver
from requests_html import HTMLSession
import time
import pandas as pd
import numpy as np
import re
import urllib3

urllib3.disable_warnings() # To suppress warnings

In [2]:
# These are the course broad categories with directory links on udemy
broadCategory = {
    "development":"https://www.udemy.com/courses/development/",
    "business":"https://www.udemy.com/courses/business/",
    "financeAndAccounting":"https://www.udemy.com/courses/finance-and-accounting/",
    "itAndSoftware":"https://www.udemy.com/courses/it-and-software/",
    "officeProductivity":"https://www.udemy.com/courses/office-productivity/",
    "personalDevelopment":"https://www.udemy.com/courses/personal-development/",
    "design":"https://www.udemy.com/courses/design/",
    "marketing":"https://www.udemy.com/courses/marketing/",
    "lifeStyle":"https://www.udemy.com/courses/lifestyle/",
    "photographyAndVideo":"https://www.udemy.com/courses/photography-and-video/",
    "healthAndFitness":"https://www.udemy.com/courses/health-and-fitness/",
    "teachingAndAcademics":"https://www.udemy.com/courses/teaching-and-academics/",
    "music":"https://www.udemy.com/courses/music/"
}

In [3]:
def scrapeCourseLink(url, startPage, endPage, sortBy="popularity"):
    """url = broad category link,
    startPage = start page for each sub category link creation,
    endPage = end page for each sub category link creation,
    sortBy = sort each sub category links by 'popularity', 'newest', or 'highest-rated'.
    Default is by 'popularity' """
    
    # To store sub category links
    subCatLink = []
    driver = webdriver.Chrome(r"/home/faysal/Documents/chromedriver")
    driver.get(url)
    time.sleep(5)
    # Find all sub category links by class name
    subCatLinks = driver.find_elements_by_css_selector(".udlite-btn.udlite-btn-medium.udlite-btn-ghost.udlite-heading-sm.link-bar--nav-button--3uJ__")
    for lnk in subCatLinks: 
        subCatLink.append(lnk.get_attribute("href"))
    subCatLink.pop(0) # Removes 1st unnecessary link
    
    # Create cover page links for each sub category by adding page no
    allSubCatLink = []
    for pg in range(startPage, endPage+1):
        for lnk in subCatLink:
            allSubCatLink.append(f"{lnk}?p={pg}&sort={sortBy}")
    
    # Scrape all the individual course link from all the sub category links
    courseLink = []
    for lnk in allSubCatLink:
        driver.get(lnk)
        time.sleep(5)
        # Scrape all the course links from created sub category links
        allCourseLinks = driver.find_elements_by_css_selector("div.popper--popper--19faV.popper--popper-hover--4YJ5J a")
        for lnk in allCourseLinks:
            courseLink.append(lnk.get_attribute("href"))
            
    # Filter links that are not course links      
    courseLink = list(filter(lambda x: "/course/" in x, courseLink))
    driver.close()
    return courseLink

In [4]:
def scrapeCourseInfo(url):
    """url = url for a individual course"""
    
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}
    s = HTMLSession()
    r = s.get(url, headers=headers, verify=False)
    
    # Scrape course title
    try:
        coursetitle = r.html.find("h1[data-purpose=lead-title]")[0].text.strip()
    except:
        coursetitle = "na"
    
    # Scrape subtitle
    try:
        subtitle = r.html.find("div[data-purpose=lead-headline]")[0].text.strip()
    except:
        subtitle = "na"
    
    # Scrape unit sold
    try:
        unitSold = r.html.find("div[data-purpose=enrollment]")[0].text.strip()
        # Keep only digits
        unitSold = "".join(re.findall("\d+[,.]?", unitSold)).replace(",", "").replace("\.", "")
    except:
        unitSold = 0
    
    # Scrape offer price
    try:
        offerPrice = r.html.find("div[data-purpose=course-price-text]")[0].text.strip()
        # Keep only digits
        offerPrice = "".join(re.findall("\d+[,.]?", offerPrice))
    except:
        offerPrice = "na"
    
    # Scrape type "bestseller" or "new"
    try:
        type_ = r.html.find("span[data-purpose=badge]")[0].text.strip()
    except:
        type_ = "na"
    
    # Scrape star rating
    try:
        courseRating = r.html.find("span[data-purpose=rating-number]")[0].text.strip()
    except:
        courseRating = 0
    
    # Scrape last update date
    try:
        lastUpdated = r.html.find("div[data-purpose=last-update-date]")[0].text.strip()
        # Keep only date
        lastUpdated = "".join(re.findall(r"\d+/?\d*", lastUpdated))
    except:
        lastUpdated = "na"
    
    # Scrape the language of instruction
    try:
        courseLanguage = r.html.find("div[data-purpose=lead-course-locale]")[0].text.strip()
    except:
        courseLanguage = "na"
        
    # SCrape other language
    try:
        otherLang = r.html.find("div[data-purpose=caption]")[0].text.strip()
    except:
        otherLang = "na"
    
    # Scrape course duration
    try:
        duration = r.html.find("span[data-purpose=video-content-length]")[0].text.strip()
        # Keep only digits and 1st word
        duration = "".join(re.findall(r"\d+\s[A-Za-z]+", duration))
    except:
        duration = "na"
        
    
    # Broad category
    try:
        broadCategory = r.html.find("div.topic-menu.udlite-breadcrumb a.udlite-heading-sm")[0].text
    except:
        broadCategory = "na"
    
    
    # Sub category
    try:
        subCategory = r.html.find("div.topic-menu.udlite-breadcrumb a.udlite-heading-sm")[1].text
    except:
        subCategory = "na"
        
    try:
        subCategory1 = r.html.find("div.topic-menu.udlite-breadcrumb a.udlite-heading-sm")[-1].text
    except:
        subCategory1 = "na"
    
        
    # Instructor info
    try:
        allInstructorInfo = r.html.find("div[data-purpose=instructor-bio]")
        allInstructorInfoText = [info.text for info in allInstructorInfo]
    except:
        allInstructorInfoText = "na"
    
    # Extract instructor name 
    try:
        instructorName = list(map(lambda x: x.split("\n")[0], allInstructorInfoText))
    except:
        instructorName = "na"
    
    # Extract instructor profession
    try:
        instructorProf = list(map(lambda x: x.split("\n")[1], allInstructorInfoText))
    except:
        instructorProf = "na"
    
    # Extract instructor rating
    try:
        instructorRating = list(map(lambda x: x.split("\n")[2], allInstructorInfoText))
        # Keep only digits
        instructorRating = list(map(lambda x: "".join(re.findall(r"\d+[.,]?", x)), instructorRating))
    except:
        instructorRating = "na"
    
    # Extract instructor review
    try:
        instructorReview = list(map(lambda x: x.split("\n")[3], allInstructorInfoText))
        # Keep only digits
        instructorReview = list(map(lambda x: "".join(re.findall(r"\d+[.,]?", x)), instructorReview))
    except:
        instructorReview = "na"
    
    # Extract instructor total student
    try:
        instructorStudent = list(map(lambda x: x.split("\n")[4], allInstructorInfoText))
        # Keep only digits
        instructorStudent = list(map(lambda x: "".join(re.findall(r"\d+[.,]?", x)), instructorStudent))
    except:
        instructorStudent = "na"
    
    # Extract instructor total course
    try:
        instructorTotalCourse = list(map(lambda x: x.split("\n")[5], allInstructorInfoText))
        # Keep only digits
        instructorTotalCourse = list(map(lambda x: "".join(re.findall(r"\d+[.,]?", x)), instructorTotalCourse))
    except:
        instructorTotalCourse = "na"
        
    
    # Scrape instructor link
    try:
        allInstructorLink = r.html.find(".udlite-heading-lg.instructor--instructor__title--34ItB")
        allInstructorLinkText = [lnk.absolute_links for lnk in allInstructorLink]
        allInstructorLinkText = ["".join(list(lnk)) for lnk in allInstructorLinkText] # Converting list of sets in list of strs
    except:
        allInstructorInfoText = "na"
    
    
    # Create a df off scraped variables
    df = pd.DataFrame({
        "courseTitle":coursetitle,
        "courseLink":url,
        "subtitle":subtitle,
        "unitSold":unitSold,
        "offerPrice":offerPrice,
        "type":type_,
        "broadCategory":broadCategory,
        "subCategory":subCategory,
        "subCategory1":subCategory1,
        "courseRating":courseRating,
        "allInstructorInfoText":[allInstructorInfoText],
        "instructorName": [instructorName], # Converting to 1d
        "instructorProfession":[instructorProf],
        "instructorRating":[instructorRating],
        "instructorReview":[instructorReview],
        "instructorStudent":[instructorStudent],
        "instructorTotalCourse":[instructorTotalCourse],
        "instructorLink":[allInstructorLinkText],
        "lastUpdated":lastUpdated,
        "courseLanguage":courseLanguage,
        "otherLang":otherLang,
        "duration":duration
    },index = [0])
    
    return df

In [5]:
# Wrap all the functions inside main
def main(broadCategoryLink, startPage, engPage):
    """url = broad category link,
    startPage = start page for each sub category link creation,
    endPage = end page for each sub category link creation"""
    
    # SCrape course links
    courseLink = scrapeCourseLink(broadCategoryLink, startPage, engPage)
    
    # Scrape individual course info
    with ProcessPoolExecutor(max_workers=6) as ex:
        masterDf = pd.concat(list(ex.map(scrapeCourseInfo, courseLink))).reset_index(drop=True)
    return masterDf

In [6]:
%%time
# Scrape course data for development category ist 2 page only
development = main(broadCategory["development"], 1, 2)

CPU times: user 1.58 s, sys: 88.7 ms, total: 1.67 s
Wall time: 3min 33s


In [7]:
# Let's have a look at the scraped data
development.head()

Unnamed: 0,courseTitle,courseLink,subtitle,unitSold,offerPrice,type,broadCategory,subCategory,subCategory1,courseRating,...,instructorProfession,instructorRating,instructorReview,instructorStudent,instructorTotalCourse,instructorLink,lastUpdated,courseLanguage,otherLang,duration
0,The Web Developer Bootcamp 2020,https://www.udemy.com/course/the-web-developer...,JUST COMPLETELY REDONE - The only course you n...,625312,na,na,Development,Web Development,Web Development,4.7,...,[Developer and Bootcamp Instructor],[4.7],"[294,038]","[907,474]",[9],[https://www.udemy.com/user/coltsteele/],12/2020,English,"English [Auto], French [Auto], 5 more",63 hours
1,Angular - The Complete Guide (2021 Edition),https://www.udemy.com/course/the-complete-guid...,"Master Angular 10 (formerly ""Angular 2"") and b...",445945,na,Bestseller,Development,Web Development,Angular,4.6,...,[Professional Web Developer and Instructor],[4.6],"[539,816]","[1,275,214]",[35],[https://www.udemy.com/user/maximilian-schwarz...,12/2020,English,"English, French [Auto], 5 more",5 hours
2,The Complete 2020 Web Development Bootcamp,https://www.udemy.com/course/the-complete-web-...,Become a full-stack web developer with just on...,327161,na,Bestseller,Development,Web Development,Web Development,4.7,...,[Developer and Lead Instructor],[4.7],"[230,175]","[671,710]",[8],[https://www.udemy.com/user/4b4368a3-b5c8-4529...,12/2020,English,"English, German [Auto], 1 more",5 hours
3,The Complete JavaScript Course 2020: From Zero...,https://www.udemy.com/course/the-complete-java...,The modern JavaScript course for everyone! Mas...,384473,na,Bestseller,Development,Web Development,JavaScript,4.7,...,"[Web Developer, Designer, and Teacher]",[4.7],"[199,773]","[951,793]",[5],[https://www.udemy.com/user/jonasschmedtmann/],11/2020,English,"English, French [Auto], 5 more",5 hours
4,The Complete Web Developer Course 2.0,https://www.udemy.com/course/the-complete-web-...,Learn Web Development by building 25 websites ...,279343,na,na,Development,Web Development,Web Development,4.5,...,"[Web Developer And Teacher, Teaching the Next ...","[4.5, 4.5]","[410,953, 369,614]","[1,856,882, 1,729,737]","[41, 61]","[https://www.udemy.com/user/robpercival/, http...",12/2020,English,"English [Auto], Indonesian [Auto], 6 more",5 hours
